# Importing Libraries

pandas (pd): A powerful library for data manipulation and analysis, providing data structures like DataFrame for handling structured data efficiently.
    
numpy (np): A fundamental package for numerical computations in Python, supporting arrays, matrices, and a variety of mathematical functions.

collections: A module offering specialized container datatypes like Counter and OrderedDict for more efficient data handling.

matplotlib.pyplot (plt): A plotting library for creating static, animated, and interactive visualizations, offering a MATLAB-like interface for creating plots and graphs.

seaborn (sns): A data visualization library based on matplotlib, providing a high-level interface for drawing attractive and informative statistical graphics.

warnings: A module for managing warnings in your code, allowing you to display, filter, or ignore them as needed.

sklearn.feature_extraction.text: Part of scikit-learn, this module includes utilities like CountVectorizer for converting text data into numerical features for machine learning.

In [1]:
import pandas as pd
import numpy as np
import collections
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.feature_extraction import text

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (



This line of code reads a CSV file named Evaluation-dataset.csv into a DataFrame without assuming any header information and displays the first five rows. This allows for an initial inspection of the dataset to understand its structure and contents.

# Loading the DataSet

In [2]:
# Load the raw review file. header=None tells pandas the file has no header
# row, so the columns are labelled with integer positions.
data = pd.read_csv('Evaluation-dataset.csv', header=None)
# Preview the first five rows.
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,Tires where delivered to the garage of my choi...,garage service positive,ease of booking positive,,,,,,,,,,,,
1,"Easy Tyre Selection Process, Competitive Prici...",garage service positive,value for money positive,,,,,,,,,,,,
2,Very easy to use and good value for money.,value for money positive,,,,,,,,,,,,,
3,Really easy and convenient to arrange,ease of booking positive,,,,,,,,,,,,,
4,It was so easy to select tyre sizes and arrang...,location positive,value for money positive,ease of booking positive,,,,,,,,,,,


In [3]:
# Build the label-column names from the actual width of the frame instead of
# a hard-coded count: the first column holds the review text and every
# remaining column holds one (optional) sub-theme sentiment label.
sentiment_array = ['Col_' + str(i) for i in range(1, data.shape[1])]

# Full header: 'Sentence' followed by Col_1 .. Col_N
new_col = ['Sentence'] + sentiment_array

# Assigning the new column names to the DataFrame
data.columns = new_col

# Displaying the updated column names
data.columns

Index(['Sentence', 'Col_1', 'Col_2', 'Col_3', 'Col_4', 'Col_5', 'Col_6',
       'Col_7', 'Col_8', 'Col_9', 'Col_10', 'Col_11', 'Col_12', 'Col_13',
       'Col_14'],
      dtype='object')


This code initializes an empty dictionary sent and iterates over all columns in the DataFrame, except the 'Sentence' column. For each sentiment label in the columns, it removes NaN values and counts the occurrences of each sentiment, storing the counts in the sent dictionary.

In [4]:
# Frequency of every sentiment label across all label columns
# (the review text column is skipped, missing cells are ignored).
sent = {}

label_columns = [col for col in data.columns if col != 'Sentence']
for col in label_columns:
    for label in data[col].dropna():
        sent[label] = sent.get(label, 0) + 1

This code sorts the sent dictionary by its values in descending order and stores the sorted dictionary in dict1. It then iterates through dict1, appending keys with values greater than 30 to list_1. Finally, it outputs the sorted dictionary dict1.







In [5]:
import operator

# Rank the labels from most to least frequent; dict() preserves that order.
ranked_labels = sorted(sent.items(), key=operator.itemgetter(1), reverse=True)
dict1 = dict(ranked_labels)

# Keep only the labels that occur more than 30 times.
list_1 = [label for label, count in ranked_labels if count > 30]

dict1

{'value for money positive': 4780,
 'garage service positive': 2031,
 'ease of booking positive': 1187,
 'location positive': 1063,
 'length of fitting positive': 657,
 'delivery punctuality positive': 453,
 'tyre quality positive': 434,
 'garage service negative': 423,
 'change of date negative': 277,
 'wait time positive': 274,
 'delivery punctuality negative': 250,
 'advisoragent service positive': 233,
 'ease of booking negative': 227,
 'mobile fitter positive': 225,
 'advisor/agent service positive': 202,
 'value for money negative': 136,
 'wait time negative': 135,
 'damage negative': 127,
 'advisoragent service negative': 125,
 'booking confusion negative': 119,
 'discounts positive': 115,
 'length of fitting negative': 109,
 'extra charges positive': 85,
 'response time negative': 77,
 'late notice negative': 76,
 'incorrect tyres sent negative': 70,
 'advisor/agent service negative': 47,
 'extra charges negative': 46,
 'change of time negative': 42,
 'no stock negative': 42,
 

In [6]:
# Number of labels that passed the "> 30 occurrences" filter.
len(list_1)

33

In [7]:
# The retained labels, ordered from most to least frequent.
list_1

['value for money positive',
 'garage service positive',
 'ease of booking positive',
 'location positive',
 'length of fitting positive',
 'delivery punctuality positive',
 'tyre quality positive',
 'garage service negative',
 'change of date negative',
 'wait time positive',
 'delivery punctuality negative',
 'advisoragent service positive',
 'ease of booking negative',
 'mobile fitter positive',
 'advisor/agent service positive',
 'value for money negative',
 'wait time negative',
 'damage negative',
 'advisoragent service negative',
 'booking confusion negative',
 'discounts positive',
 'length of fitting negative',
 'extra charges positive',
 'response time negative',
 'late notice negative',
 'incorrect tyres sent negative',
 'advisor/agent service negative',
 'extra charges negative',
 'change of time negative',
 'no stock negative',
 'tyre quality negative',
 'response time positive',
 'facilities positive']


This function, dataModification, processes the input DataFrame data to create a new DataFrame data2 with columns specified in list_1, filled with binary values indicating the presence (1) or absence (0) of each subtheme in each row. It then combines this processed data with the original 'Sentence' column to form the final_data DataFrame.

In [8]:
def dataModification(data, labels=None):
    """Convert the wide label columns into one 0/1 indicator column per label.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose first column is 'Sentence' and whose remaining columns
        hold the labels attached to each sentence (missing cells are NaN/None).
    labels : iterable of str, optional
        Labels to create indicator columns for, in output order. Defaults to
        the module-level ``list_1``.

    Returns
    -------
    pandas.DataFrame
        'Sentence' followed by one integer column per label: 1 when the row
        carries that label, 0 otherwise. Labels found in *data* that are not
        in *labels* are ignored. The result keeps the index of *data*.
    """
    if labels is None:
        labels = list_1
    labels = list(labels)

    # Every column after the sentence text is a label slot.
    subthemes = data.iloc[:, 1:]

    # Build the indicator matrix row by row. This avoids chained assignment
    # (`frame.loc[i][t] = 1`), which pandas does not guarantee to write back
    # to the original frame.
    rows = []
    for row in subthemes.itertuples(index=False, name=None):
        present = set(row)  # NaN cells never match a label
        rows.append([1 if label in present else 0 for label in labels])

    final_data = pd.DataFrame(rows, index=data.index, columns=labels)
    # Positional insert so the sentences line up row-for-row with `data`.
    final_data.insert(0, 'Sentence', data['Sentence'].to_numpy())
    return final_data

In [10]:
import warnings
# NOTE(review): this silences *all* warnings for the rest of the session,
# not just the ones raised while building the indicator table.
warnings.filterwarnings('ignore')
# Build the sentence + 0/1 label table (dataModification reads list_1).
final_data = dataModification(data)


The code imports necessary libraries and converts the 'Sentence' column of the final_data DataFrame into a list of sentences, storing it in the variable X. It then prints the first two elements of this list to display sample sentences from the dataset.

In [11]:
# Preview the sentence + indicator-column table.
final_data.head()

Unnamed: 0,Sentence,value for money positive,garage service positive,ease of booking positive,location positive,length of fitting positive,delivery punctuality positive,tyre quality positive,garage service negative,change of date negative,...,response time negative,late notice negative,incorrect tyres sent negative,advisor/agent service negative,extra charges negative,change of time negative,no stock negative,tyre quality negative,response time positive,facilities positive
0,Tires where delivered to the garage of my choi...,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Easy Tyre Selection Process, Competitive Prici...",1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Very easy to use and good value for money.,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Really easy and convenient to arrange,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,It was so easy to select tyre sizes and arrang...,1,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
import string
import re
from collections import Counter
# Raw review sentences as a plain Python list, ready for text cleaning.
X = final_data['Sentence'].tolist()
# Show the first two sentences as a sample.
X[:2]

['Tires where delivered to the garage of my choice,the garage notified me when they had been delivered. A day and time was arranged with the garage and I went and had them fitted,a Hassel free experience.',
 'Easy Tyre Selection Process, Competitive Pricing and Excellent Fitting Service']

# NATURAL LANGUAGE PROCESSING


This code performs text preprocessing on the sentences in the dataset. It removes digits, punctuation, and emojis, converts text to lowercase, strips white space, and eliminates stop words. Finally, it removes rare words from the processed sentences, resulting in a cleaned and standardized list of sentences (sents_processed_3).

In [13]:
def full_remove(x, removal_list):
    """Return *x* with each entry of *removal_list* replaced by a space.

    Entries are applied one after another, in the order given, and every
    occurrence of each entry is replaced.
    """
    cleaned = x
    for target in removal_list:
        cleaned = cleaned.replace(target, ' ')
    return cleaned

## Remove digits ##
digits = list(string.digits)
remove_digits = [full_remove(sentence, digits) for sentence in X]

## Remove punctuation ##
punctuation = list(string.punctuation)
remove_punc = [full_remove(sentence, punctuation) for sentence in remove_digits]

## Make everything lower-case and remove any white space ##
sents_lower = [sentence.lower().strip() for sentence in remove_punc]

def remove_emoji(string):
    """Return *string* with every run of characters in the ranges below removed."""
    # NOTE(review): the last range (U+24C2..U+1F251) is very wide and also
    # covers many non-emoji characters; kept as-is to preserve behaviour.
    emoji_ranges = (
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "\U00002702-\U000027B0"
        "\U000024C2-\U0001F251"
    )
    return re.sub("[" + emoji_ranges + "]+", "", string, flags=re.UNICODE)

## Remove stop words ##
from nltk.corpus import stopwords
# The corpus file id is the lower-case "english"; a capitalised id only
# resolves on case-insensitive filesystems.
stops = stopwords.words("english")

def removeStopWords(stopWords, txt):
    """Return *txt* with every whitespace-separated token found in *stopWords* dropped.

    The surviving tokens are re-joined with single spaces.
    """
    kept = []
    for token in txt.split():
        if token in stopWords:
            continue
        kept.append(token)
    return ' '.join(kept)

# Drop stop words first, then strip any emoji left in the text.
sents_processed   = [removeStopWords(stops, x) for x in sents_lower]
sents_processed_2 = [remove_emoji(x) for x in sents_processed]

# Count every token across the corpus so the rarest ones can be identified.
# (Leaving the counter empty would make RAREWORDS empty and turn
# remove_rarewords into a no-op.)
cnt = Counter(
    word for processed in sents_processed_2 for word in processed.split()
)

n_rare_words = 10
# most_common() is ordered by descending count, so the reversed slice picks
# the n_rare_words entries from the tail, i.e. the least frequent tokens.
RAREWORDS = {w for (w, wc) in cnt.most_common()[:-n_rare_words-1:-1]}


def remove_rarewords(text):
    """Return *text* with every token that appears in RAREWORDS removed."""
    return " ".join([word for word in str(text).split() if word not in RAREWORDS])


sents_processed_3 = [remove_rarewords(x) for x in sents_processed_2]

sents_processed_3[:2]

['tires delivered garage choice garage notified delivered day time arranged garage went fitted hassel free experience',
 'easy tyre selection process competitive pricing excellent fitting service']

In [14]:
# Swap the raw 'Sentence' column for the cleaned text, keeping the indicator
# columns (everything after the first column) unchanged.
sent_processed = pd.DataFrame({'Sent_Processed': sents_processed_3})
d2 = final_data.iloc[:, 1:]
final_data = pd.concat([sent_processed, d2], axis=1)
final_data.head()

Unnamed: 0,Sent_Processed,value for money positive,garage service positive,ease of booking positive,location positive,length of fitting positive,delivery punctuality positive,tyre quality positive,garage service negative,change of date negative,...,response time negative,late notice negative,incorrect tyres sent negative,advisor/agent service negative,extra charges negative,change of time negative,no stock negative,tyre quality negative,response time positive,facilities positive
0,tires delivered garage choice garage notified ...,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,easy tyre selection process competitive pricin...,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,easy use good value money,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,really easy convenient arrange,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,easy select tyre sizes arrange local fitting p...,1,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Model Preparation

In [15]:
# Features: the cleaned review text. Targets: every remaining (indicator) column.
X = final_data['Sent_Processed']
y = final_data.drop(columns=['Sent_Processed'])

In [16]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

# Evaluating the Models Using Classification Performance Metrics (Accuracy, F1-Score, Classification Report)

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline

# X is the cleaned review text and y the table of 0/1 label columns, both
# taken from final_data in the "Model Preparation" cells.
# Each row of y can have several labels set, so this is a multi-label problem.

# Split the data into training and testing sets (same 80/20 split and seed as above)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert text into bag-of-words counts. The vocabulary is learned from the
# training split only and then re-used to encode the test split.
vectorizer = CountVectorizer()
X_train_count = vectorizer.fit_transform(X_train)
X_test_count = vectorizer.transform(X_test)

# Initialize the SVM classifier (linear kernel)
svm_classifier = SVC(kernel='linear')

# MultiOutputClassifier fits one independent copy of the SVM per label column;
# n_jobs=-1 lets scikit-learn fit them in parallel.
multi_label_classifier = MultiOutputClassifier(svm_classifier, n_jobs=-1)

# Train the classifier on the training data
multi_label_classifier.fit(X_train_count, y_train)

# Predict the labels for the test set
y_pred = multi_label_classifier.predict(X_test_count)

# For multi-label targets accuracy_score is subset accuracy: a review counts
# as correct only when every one of its labels is predicted correctly.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Per-label precision/recall/F1. No target_names are passed, so the rows are
# numbered by label-column position rather than by label name.
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.5421805624074988
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.95      0.94       960
           1       0.66      0.55      0.60       410
           2       0.69      0.61      0.65       218
           3       0.84      0.81      0.82       203
           4       0.53      0.41      0.46       132
           5       0.60      0.57      0.59        93
           6       0.52      0.36      0.42        86
           7       0.55      0.47      0.51        79
           8       0.61      0.55      0.58        64
           9       0.39      0.28      0.32        58
          10       0.60      0.62      0.61        60
          11       0.43      0.46      0.44        52
          12       0.45      0.41      0.43        49
          13       0.71      0.74      0.72        39
          14       0.33      0.35      0.34        37
          15       0.08      0.03      0.05        29
          16       0.21      

In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import MultiLabelBinarizer

# 'Sent_Processed' is the text column; every other column is a 0/1 label target
X = final_data['Sent_Processed']
y = final_data.drop(columns=['Sent_Processed'])

# For each row, collect the names of the label columns whose value is non-zero
y_labels = y.apply(lambda row: [col for col, val in row.items() if val != 0], axis=1)

# Re-encode the label lists as a binary matrix; mlb.classes_ holds the label
# name for each column of that matrix (used later to decode predictions).
mlb = MultiLabelBinarizer()
y_binarized = mlb.fit_transform(y_labels)

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_binarized, test_size=0.2, random_state=42)

# Pipeline: bag-of-words counts -> one linear SVM per label (one-vs-rest).
# Keeping the vectorizer inside the pipeline means raw strings can be passed
# straight to pipeline.predict().
# NOTE(review): probability=True makes SVC fit an extra probability model,
# which slows training; nothing in this file calls predict_proba — confirm
# it is needed before keeping it.
pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('svm', OneVsRestClassifier(SVC(kernel='linear', probability=True)))
])

# Train the SVM model
pipeline.fit(X_train, y_train)

# Make predictions on both splits so train and test scores can be compared
y_pred_train = pipeline.predict(X_train)
y_pred_test = pipeline.predict(X_test)

# Evaluate the model: subset accuracy (all labels of a review must match) and
# micro-averaged F1 (pooled over every individual label decision)
accuracy_train = accuracy_score(y_train, y_pred_train)
accuracy_test = accuracy_score(y_test, y_pred_test)
f1_train = f1_score(y_train, y_pred_train, average='micro')
f1_test = f1_score(y_test, y_pred_test, average='micro')

print(f"Training Accuracy: {accuracy_train}")
print(f"Testing Accuracy: {accuracy_test}")
print(f"Training F1-Score: {f1_train}")
print(f"Testing F1-Score: {f1_test}")


Training Accuracy: 0.8583590376310919
Testing Accuracy: 0.5421805624074988
Training F1-Score: 0.9400714743883442
Testing F1-Score: 0.6900091659028414


# RESULT

In [20]:
def analyze_review(review):
    """Return the labels the trained pipeline predicts for one review.

    The result is a list of ``(label, value)`` pairs, one for each non-zero
    entry of the prediction row; label names come from ``mlb.classes_``.
    """
    predicted_row = pipeline.predict([review])[0]
    return [
        (mlb.classes_[position], flag)
        for position, flag in enumerate(predicted_row)
        if flag != 0
    ]

# Example usage: classify one raw review and list the predicted labels
review = "One tyre went missing, so there was a delay to get the two tyres fitted. The way garage dealtwith it was fantastic"
sentiments = analyze_review(review)
print(f"Review: {review}")
print("\n")
print("Detected sentiments:")
# Each entry is a (label, value) pair; only the label is shown
for sentiment, value in sentiments:
    print(sentiment)

Review: One tyre went missing, so there was a delay to get the two tyres fitted. The way garage dealtwith it was fantastic


Detected sentiments:
garage service positive
incorrect tyres sent negative
length of fitting positive
wait time negative


# Code to get the results for all reviews

This may take a few minutes to run, as it analyses the complete dataset.

In [None]:
# Function to collect the predicted sentiments for all reviews in the dataset
def analyze_all_reviews(data):
    """Predict the labels for every review in ``data['Sent_Processed']``.

    Parameters
    ----------
    data : mapping or DataFrame
        Must provide a 'Sent_Processed' entry holding the review strings.

    Returns
    -------
    list of (review, sentiments) tuples, in input order, where *sentiments* is
    a list of ``(label, value)`` pairs for the non-zero predictions, the same
    shape that ``analyze_review`` returns for a single review.
    """
    reviews = list(data['Sent_Processed'])
    if not reviews:
        # Nothing to classify; also avoids calling predict on an empty batch.
        return []

    # One batched predict call instead of one call per review: every review
    # is vectorised and scored independently, so the rows are the same.
    predictions = pipeline.predict(reviews)

    results = []
    for review, row in zip(reviews, predictions):
        sentiments = [
            (mlb.classes_[idx], val) for idx, val in enumerate(row) if val != 0
        ]
        results.append((review, sentiments))
    return results

# Analyze all reviews in the final data (the cleaned 'Sent_Processed' text)
all_sentiments = analyze_all_reviews(final_data)

# Print each review followed by the labels predicted for it
for review, sentiments in all_sentiments:
    print(f"Review: {review}")
    print("\n")
    print("Detected sentiments:")
    # Each entry is a (label, value) pair; only the label is shown
    for sentiment, value in sentiments:
        print(sentiment)
    print("\n")
