Project: 7 Review Sentiment Dashboard

Problem Statement:
Analyze customer product reviews and show sentiment insights in a dashboard.

In [40]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [41]:
os.chdir(os.getcwd().replace('notebooks', 'data'))

df = pd.read_csv('data.csv')
df.columns = ['index', 'name', 'sentiment', 'text']


In [42]:
df.sample(10)


Unnamed: 0,index,name,sentiment,text
67731,7200,johnson&johnson,Negative,ET WE ’ RE AN SUPPOSED TO NO TRUST TO JOHNSON ...
66490,6985,johnson&johnson,Neutral,Johnson & Taylor ends flu shot market in US & ...
53337,10764,RedDeadRedemption(RDR),Positive,Most of you wont care but I honestly have also...
4164,1918,CallOfDutyBlackopsColdWar,Positive,Not only was it like an open space to interact...
60829,4824,GrandTheftAuto(GTA),Positive,I like killing people in GTA
11354,13152,Xbox(Xseries),Positive,I get everyone wants their shiny new PS5s/Xbox...
44762,11687,Verizon,Negative,@verizon Can you make some data overage charge...
73374,8969,Nvidia,Neutral,Nvidia's GeForce Now loses all Activision Bliz...
19681,12569,WorldOfCraft,Neutral,I will jump on these 10 games to find out that...
50085,6197,FIFA,Positive,The fourth greatest Center Forward the game ha...


In [43]:
df = df[['text', 'sentiment']]


In [44]:
df.sample(5)


Unnamed: 0,text,sentiment
46653,Was t just doing at @HomeDepot & R while waiti...,Neutral
40328,A ban for Battlefield 1 player action les occu...,Irrelevant
11627,Nice and clean,Positive
4679,@amazon probably some of the worst customer se...,Negative
66698,New York charges Johnson & Johnson with bank f...,Neutral


In [45]:
df.isna().sum()


text         686
sentiment      0
dtype: int64

In [46]:
df.dropna(inplace=True)


In [47]:
df.shape


(73995, 2)

In [48]:
df.duplicated().sum()


4227

In [49]:
df.drop_duplicates(inplace=True)


In [50]:
df.shape


(69768, 2)

In [51]:
df['sentiment'].value_counts()


sentiment
Negative      21237
Positive      19137
Neutral       17110
Irrelevant    12284
Name: count, dtype: int64

In [52]:
df = df[df['sentiment'] != 'Irrelevant']


In [60]:
df['sentiment'].value_counts()


sentiment
Negative    21237
Positive    19137
Neutral     17110
Name: count, dtype: int64

In [61]:
df['sentiment'] = df['sentiment'].map({'Positive': 2, 'Negative': 1, 'Neutral': 0})


In [62]:
df.shape


(57484, 2)

In [63]:
df.sample(10)


Unnamed: 0,text,sentiment
53617,Anyone who thinks otherwise perfect media can'...,1
11447,UK Retailer Warns Of More Potential Xbox Serie...,0
33843,Fortnite really copyed the superior French bat...,1
10042,For the “next thousands of years”?.. Wow... sm...,2
48378,This is my current job. Please don't romantici...,1
13765,@ N2K whose genius idea was to have the right ...,1
29204,Bro I get killed by flying Caustics RhandlerR ...,1
13426,Get off on the think I'm about to play a game ...,1
57592,Guess it? now it's ready for an amazing stream...,2
19103,@BlizzardCS @Warcraft why cant a multi million...,1


In [64]:
X, y = df['text'], df['sentiment']


In [65]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [66]:

import re
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()


def convert_text(text_series):
    return text_series.apply(lambda text: ' '.join(
        stemmer.stem(word) for word in re.findall(r'\b\w+\b', text.lower())
    ))


In [67]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier


In [68]:
model = Pipeline([
    ('preprocess', FunctionTransformer(convert_text, validate=False)),
    ('tfidf', TfidfVectorizer(stop_words='english', max_features=110000, ngram_range=(1, 2))),
    ('clf', MultinomialNB())
])


In [69]:
model.fit(X_train, y_train)


In [None]:
model.score(X_test, y_test)


0.8496488462089723

In [None]:
# models = {
#     # 'Logistic Regression' : LogisticRegression(),
#     # 'Random Forest' : RandomForestClassifier(),
#     'Decision Tree' : DecisionTreeClassifier(),
#     'ExtraTreeClassifier' : ExtraTreeClassifier(),
#     # 'SVC' : SVC(),
#     # 'Multinomial Naive Bayes' : MultinomialNB(),
#     'Bernoulli Naive Bayes' : BernoulliNB(),
#     'AdaBoost' : AdaBoostClassifier(),
#     'Gradient Boosting' : GradientBoostingClassifier(),
    
# }


# for name, current_model in models.items():
#     print(f"Checking accuracy for {name}")
#     model = Pipeline([
#         ('preprocess', FunctionTransformer(convert_text, validate=False)),
#         ('tfidf', TfidfVectorizer(stop_words='english', max_features=110000, ngram_range=(1, 2))),
#         ('clf', current_model)
#     ])

#     model.fit(X_train, y_train)
#     print(f"Accuracy for {name} is {model.score(X_test, y_test)}")
#     print("*"*50)
#     print("\n")



Checking accuracy for Decision Tree
Accuracy for Decision Tree is 0.8055037981940663
**************************************************


Checking accuracy for ExtraTreeClassifier
Accuracy for ExtraTreeClassifier is 0.7938942238784578
**************************************************


Checking accuracy for Bernoulli Naive Bayes
Accuracy for Bernoulli Naive Bayes is 0.827504658162534
**************************************************


Checking accuracy for AdaBoost




Accuracy for AdaBoost is 0.4842339114232478
**************************************************


Checking accuracy for Gradient Boosting
Accuracy for Gradient Boosting is 0.5517414361473413
**************************************************




In [None]:
model = Pipeline([
    ('preprocess', FunctionTransformer(convert_text, validate=False)),
    ('tfidf', TfidfVectorizer(stop_words='english', max_features=110000, ngram_range=(1, 2))),
    ('clf', LogisticRegression())
])


In [None]:
model.fit(X_train, y_train)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
model.score(X_test, y_test)


0.8791027662319049