# Import Libreries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
import seaborn as sns
color = sns.color_palette()
%matplotlib inline
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import plotly.express as px
import nltk
from nltk.corpus import stopwords
import wordcloud
from wordcloud import WordCloud, STOPWORDS

# Data

In [22]:
df = pd.read_csv('data/Reviews.csv')
df.head(5)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


Text — This variable contains the complete product review information.

Summary — This is a summary of the entire review.

Score — The product rating provided by the customer.

# Data Analysis

In [None]:
# Product Scores
fig = px.histogram(df, x="Score")
fig.update_traces(marker_color="turquoise",marker_line_color='rgb(8,48,107)',
                  marker_line_width=1.5)
fig.update_layout(title_text='Product Score')
fig.show()

In [None]:

# Create stopword list:

nltk.download('stopwords')
stopwords_1 = set(stopwords.words('english'))
stopwords_1.update(["br", "href"])
textt = " ".join(review for review in df.Text)
wordcloud = WordCloud(stopwords=stopwords_1).generate(textt)

plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.savefig('wordcloud11.png')
# Due to the github limit I can't show the output of the images, so I'll leave them in a separate folder
plt.show()

# Classifying Tweets

In [None]:
# assign reviews with score > 3 as positive sentiment
# score < 3 negative sentiment
# remove score = 3

df = df[df['Score'] != 3]
df['sentiment']  = df['Score'].apply(lambda rating : +1 if rating > 3 else -1)
df.head(5)

In [None]:
# Split df - positive and negative sentiment:

positive = df[df['sentiment'] == 1]
negative = df[df['sentiment'] == -1]

In [None]:
# removed NaN values from column Summary

negative.dropna(subset='Summary', inplace=True)

### Wordcloud - Positive Sentiment

In [None]:
stopwords_1 = set(stopwords.words('english'))
stopwords_1.update(["br", "href", "good", "great"])

## good and great removed beacause they were included in negative sentiment 

pos = " ".join(review for review in positive.Summary)
wordcloud_2 = WordCloud(stopwords=stopwords_1).generate(pos)

plt.imshow(wordcloud_2, interpolation='bilinear')
plt.axis("off")
plt.savefig('wordcloud22.png')
#Due to the github limit I can't show the output of the images, so I'll leave them in a separate folder
plt.show()

### Wordcloud - Negative Sentiment 

In [None]:

neg = " ".join(review for review in (negative.Summary))
wordcloud_3 = WordCloud(stopwords=stopwords_1).generate(neg)

plt.imshow(wordcloud_3, interpolation='bilinear')
plt.axis("off")
plt.savefig('wordcloud33.png')
#Due to the github limit I can't show the output of the images, so I'll leave them in a separate folder
plt.show()

### Plot Positive vs Negative

In [None]:
df['sentimentt'] = df['sentiment'].replace({-1 : 'negative'})
df['sentimentt'] = df['sentimentt'].replace({1 : 'positive'})
fig = px.histogram(df, x="sentimentt")
fig.update_traces(marker_color="indianred",marker_line_color='rgb(8,48,107)',
                  marker_line_width=1.5)
fig.update_layout(title_text='Product Sentiment')
fig.show()

# Buiding the Model

In [None]:
# Data Cleaning
def remove_punctuation(text):
    final = "".join(u for u in text if u not in ("?", ".", ";", ":",  "!",'"'))
    return final

df['Text'] = df['Text'].apply(remove_punctuation)
df = df.dropna(subset=['Summary'])
df['Summary'] = df['Summary'].apply(remove_punctuation)

In [None]:
df_new = df[['Summary', 'sentiment']]

In [None]:
# Random split train and test data

index = df.index
df['random_number'] = np.random.randn(len(index))

train = df[df['random_number'] <= 0.8]
test = df[df['random_number'] > 0.8]

In [None]:
# Count vectorizer:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')

train_matrix = vectorizer.fit_transform(train['Summary'])
test_matrix = vectorizer.transform(test['Summary'])

In [None]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(solver='lbfgs', max_iter=500)

In [None]:
## Split target and independent variables
X_train = train_matrix
X_test = test_matrix
y_train = train['sentiment']
y_test = test['sentiment']

In [None]:
# Fit model
lr.fit(X_train,y_train)

In [None]:
predictions = lr.predict(X_test)

# Testing

In [23]:
# find accuracy, precision, recall:
from sklearn.metrics import confusion_matrix,classification_report
new = np.asarray(y_test)
confusion_matrix(predictions,y_test)

array([[11573,  2337],
       [ 5866, 91703]], dtype=int64)

In [24]:
print(classification_report(predictions,y_test))

              precision    recall  f1-score   support

          -1       0.66      0.83      0.74     13910
           1       0.98      0.94      0.96     97569

    accuracy                           0.93    111479
   macro avg       0.82      0.89      0.85    111479
weighted avg       0.94      0.93      0.93    111479

