In [1]:
#important libraries

import pandas as pd
import numpy as np

In [None]:
#this dataset is based on customer reviews of thier experience with the Uber rides.We can associate positive and negative
#words and sentiments on reviews.


In [2]:
#read the csv file

df=pd.read_csv('Ubers.csv', encoding = "ISO-8859-1")
df.head()

Unnamed: 0,Date,Stars,Comment
0,10/29/2019,1,I had an accident with an Uber driver in Mexic...
1,10/28/2019,1,I have had my account completely hacked to whe...
2,10/27/2019,1,I requested an 8 mile ride in Boston on a Satu...
3,10/27/2019,1,I've been driving off and on with the company ...
4,10/25/2019,1,Uber is overcharging for Toll fees. When In Fl...


In [3]:
# to find the total no of null values in the columns

df.isnull().sum()

Date       0
Stars      0
Comment    0
dtype: int64

In [4]:
#to convert all uppercase to lowercase

df['Comment'] = df.Comment.map(lambda x: x.lower())

# remove all other symbols 

df['Comment'] = df.Comment.str.replace('[^\w\s]', '')
df

Unnamed: 0,Date,Stars,Comment
0,10/29/2019,1,i had an accident with an uber driver in mexic...
1,10/28/2019,1,i have had my account completely hacked to whe...
2,10/27/2019,1,i requested an 8 mile ride in boston on a satu...
3,10/27/2019,1,ive been driving off and on with the company s...
4,10/25/2019,1,uber is overcharging for toll fees when in flo...
5,10/24/2019,1,i had an airport flight today uber would not a...
6,10/24/2019,1,i worked for uber and lyft for 25 years and al...
7,10/23/2019,1,in july of this year i had sushi delivered to ...
8,10/23/2019,1,my driver rohan was nice but when i tried to a...
9,10/21/2019,1,i had seven fraudulent uber transactions over ...


In [5]:
#to get unique values from the column of comments

df["Comment"].unique()

array(['i had an accident with an uber driver in mexico city the car that i got into had no side mirror the brakes were not working properly either i almost got into an accident twice the drivers conversation was unpleasant being a foreigner he was very curious to ask where i am from and what brought me to mexico i replied to be a tourist and through that conversation is over he became very rude and asked me if i came to look for a mexican husband i never answered and kept quiet he took the wrong route and made several in requested stops having in mind it was uberx he continued asked me whom i sleep with he literally stopped the car and asked me to wait for him text someone i asked him to let me go and take a different driver but he locked the doors and didnt allow me ',
       'i have had my account completely hacked to where i cannot sign in or view it someone spent over 1k in uber rides on my credit and debit cards linked to the account and when i dialed the only number available sa

In [None]:
#Before we explore the dataset we're going to split it into training set and test sets.
#Our goal is to eventually train a sentiment analysis classifier.
#To use sklearn's Stratified ShuffleSplit class, we're going to remove all samples that have NAN in review score, 
#then covert all review scores to integer datatype.

In [6]:
#to split the data into test and train

from sklearn.model_selection import StratifiedShuffleSplit
print("Before {}".format(len(df)))
dataAfter = df.dropna(subset=["Stars"]) # removes all NAN in reviews.rating
print("After {}".format(len(dataAfter)))
dataAfter["Stars"] = dataAfter["Stars"].astype(int)

Before 2347
After 2347


In [7]:
split = StratifiedShuffleSplit(n_splits=5, test_size=0.2)
for train_index, test_index in split.split(dataAfter, dataAfter["Stars"]): 
    strat_train = dataAfter.reindex(train_index)
    strat_test = dataAfter.reindex(test_index)

In [None]:
#Using the features in place, we will build a classifier that can determine a review's sentiment.

In [8]:
#Segregate ratings from 1-5 into positive, neutral, and negative.

def sentiments(rating):
    if (rating == 5) or (rating == 4):
        return "Positive"
    elif rating == 3:
        return "Neutral"
    elif (rating == 2) or (rating == 1):
        return "Negative"
    
# Add sentiments to the data

strat_train["Sentiment"] = strat_train["Stars"].apply(sentiments)
strat_test["Sentiment"] = strat_test["Stars"].apply(sentiments)
strat_train["Sentiment"][:20]

1871    Negative
1189    Negative
70      Negative
459     Negative
919     Negative
1335    Negative
638     Positive
1492    Negative
208     Positive
607     Negative
872     Negative
776      Neutral
1198    Negative
1190    Negative
2248    Negative
819     Negative
1669    Negative
1791    Negative
2334    Negative
793     Negative
Name: Sentiment, dtype: object

In [9]:
#prepare data

X_train = strat_train["Comment"]
X_train_targetSentiment = strat_train["Sentiment"]
X_test = strat_test["Comment"]
X_test_targetSentiment = strat_test["Sentiment"]
print(len(X_train), len(X_test))

1877 470


In [None]:
#Here we will turn content into numerical feature vectors using the Bag of Words strategy.
#In order to implement the Bag of Words strategy, we will use SciKit-Learn's CountVectorizer to perform.

In [10]:
# extract features bag of words count vectorizer

from sklearn.feature_extraction.text import CountVectorizer 
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train) 
X_train_counts.shape

(1877, 9624)

In [None]:
#With longer documents, we typically see higher average count values on words that carry very little meaning,
#this will overshadow shorter documents that have lower average counts with same frequencies, as a result,
#we will use TfidfTransformer to reduce this redundancy.

In [11]:
#tf-idf transformer

from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer(use_idf=False)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(1877, 9624)

In [None]:
#Multinominal Niave Bayes is most suitable for word counts where data are typically represented as word vector counts 
#(number of times outcome number X[i,j] is observed over the n trials),
#while also ignoring non-occurrences of a feature i


#Naive Bayes is a simplified version of Bayes Theorem, where all features are assumed conditioned independent to each other 
#(the classifiers), P(x|y) where x is the feature and y is the classifier

In [12]:
#Building a Pipeline from the Extracted Features

from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
clf_multiNB_pipe = Pipeline([("vect", CountVectorizer()), ("tfidf", TfidfTransformer()), ("clf_nominalNB", MultinomialNB())])
clf_multiNB_pipe.fit(X_train, X_train_targetSentiment)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf_nominalNB',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [13]:
#Test Model

predictedMultiNB = clf_multiNB_pipe.predict(X_test)
np.mean(predictedMultiNB == X_test_targetSentiment)

0.8063829787234043

In [None]:
#Here we see that our Multinominal Naive Bayes Classifier has a 80.6% accuracy level based on the features.
#Next we will conduct the following:

#Test other models
#Fine tune the best models to avoid over-fitting

In [14]:
#Support Vector Machine Classifier

from sklearn.svm import LinearSVC
clf_linearSVC_pipe = Pipeline([("vect", CountVectorizer()), ("tfidf", TfidfTransformer()), ("clf_linearSVC", LinearSVC())])
clf_linearSVC_pipe.fit(X_train, X_train_targetSentiment)

predictedLinearSVC = clf_linearSVC_pipe.predict(X_test)
np.mean(predictedLinearSVC == X_test_targetSentiment)

0.8978723404255319

In [15]:
#Decision Tree Classifier

from sklearn.tree import DecisionTreeClassifier
clf_decisionTree_pipe = Pipeline([("vect", CountVectorizer()), ("tfidf", TfidfTransformer()), 
                                  ("clf_decisionTree", DecisionTreeClassifier())])
clf_decisionTree_pipe.fit(X_train, X_train_targetSentiment)

predictedDecisionTree = clf_decisionTree_pipe.predict(X_test)
np.mean(predictedDecisionTree == X_test_targetSentiment)

0.8085106382978723

In [16]:
#Random Forest Classifier

from sklearn.ensemble import RandomForestClassifier
clf_randomForest_pipe = Pipeline([("vect", CountVectorizer()), ("tfidf", TfidfTransformer()), ("clf_randomForest", RandomForestClassifier())])
clf_randomForest_pipe.fit(X_train, X_train_targetSentiment)

predictedRandomForest = clf_randomForest_pipe.predict(X_test)
np.mean(predictedRandomForest == X_test_targetSentiment)



0.8361702127659575

In [None]:
#We will use the Support Vector Machine Classifier since it has the highest accuracy level at 89.3%.

In [17]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

print(classification_report(X_test_targetSentiment,predictedLinearSVC))
print('Accuracy: {}'. format(accuracy_score(X_test_targetSentiment, predictedLinearSVC)))

              precision    recall  f1-score   support

    Negative       0.90      0.99      0.94       379
     Neutral       1.00      0.10      0.18        20
    Positive       0.85      0.65      0.74        71

    accuracy                           0.90       470
   macro avg       0.92      0.58      0.62       470
weighted avg       0.90      0.90      0.88       470

Accuracy: 0.8978723404255319


In [18]:

from sklearn import metrics
metrics.confusion_matrix(X_test_targetSentiment, predictedLinearSVC)

array([[374,   0,   5],
       [ 15,   2,   3],
       [ 25,   0,  46]])