In [57]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import string
from sklearn.linear_model import LogisticRegression

df = pd.read_csv('Musical_instruments_reviews.csv')
df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A2IBPI20UZIR0U,1384719342,"cassandra tu ""Yeah, well, that's just like, u...","[0, 0]","Not much to write about here, but it does exac...",5.0,good,1393545600,"02 28, 2014"
1,A14VAT5EAX3D9S,1384719342,Jake,"[13, 14]",The product does exactly as it should and is q...,5.0,Jake,1363392000,"03 16, 2013"
2,A195EZSQDW3E21,1384719342,"Rick Bennette ""Rick Bennette""","[1, 1]",The primary job of this device is to block the...,5.0,It Does The Job Well,1377648000,"08 28, 2013"
3,A2C00NNG1ZQQG2,1384719342,"RustyBill ""Sunday Rocker""","[0, 0]",Nice windscreen protects my MXL mic and preven...,5.0,GOOD WINDSCREEN FOR THE MONEY,1392336000,"02 14, 2014"
4,A94QU4C90B1AX,1384719342,SEAN MASLANKA,"[0, 0]",This pop filter is great. It looks and perform...,5.0,No more pops when I record my vocals.,1392940800,"02 21, 2014"


This dataset contains information about reviews of musical instruments on Amazon.
The dataset conatains 9 columns:
* reviewerID - ID of the reviewer
* asin - ID of the product
* reviewerName - name of the reviewer
* helpful - helpfulness rating of the review
* reviewText - text of the review
* overall - rating of the product
* summary - summary of the review
* unixReviewTime - time of the review (unix time)
* reviewTime - time of the review 

Firstly, we can drop unimportant columns that will not be used in sentiment analysis.

In [58]:
df = df.drop(['reviewerID', 'asin', 'reviewerName', 'helpful', 'unixReviewTime', 'reviewTime', 'summary'], axis=1)
df.head()

Unnamed: 0,reviewText,overall
0,"Not much to write about here, but it does exac...",5.0
1,The product does exactly as it should and is q...,5.0
2,The primary job of this device is to block the...,5.0
3,Nice windscreen protects my MXL mic and preven...,5.0
4,This pop filter is great. It looks and perform...,5.0


Checking for missing values:

In [59]:
df.isna().sum()

reviewText    7
overall       0
dtype: int64

In [60]:
df.reviewText.fillna("", inplace=True)

Checking overall review score statistics:

In [61]:
df.overall.value_counts()

5.0    6938
4.0    2084
3.0     772
2.0     250
1.0     217
Name: overall, dtype: int64

The overall product scoring in review ranges from 1-5. The scores 1 and 2 have negative sentiment as they are the lowest scores. The scores 5 and 4 have positive sentiment as they are the best scores. Scores with value 3 have neutral sentiment as they are the middle of score range. This means we can drop all reviews that contain overall score of 3.

In [62]:
print('Before: ', sum(df['overall'] == 3))
df = df[df.overall != 3.0]
print('After: ', sum(df['overall'] == 3))

Before:  772
After:  0


Now we can replace the scoring by sentiment meaning: 1 - positive, 0 - negative

In [63]:
def scoring(score):
    if score == 5 or score == 4:
        return 1
    else:
        return 0

df.overall = df.overall.apply(scoring)
df.overall.value_counts()

1    9022
0     467
Name: overall, dtype: int64

As we can see - the majority of reviews have positive sentiment rating.

In order to analyze sentiment of words in reviews we need to remove whole punctuation in the reviews.

In [64]:
def remove_punctuation(text):
    import string
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator)

df['reviewText'] = df['reviewText'].apply(lambda x: remove_punctuation(str(x)))
df.head()

Unnamed: 0,reviewText,overall
0,Not much to write about here but it does exact...,1
1,The product does exactly as it should and is q...,1
2,The primary job of this device is to block the...,1
3,Nice windscreen protects my MXL mic and preven...,1
4,This pop filter is great It looks and performs...,1


Perparing reviews to enable further analysis:

In [65]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
X = df['reviewText']
y = df['overall']
reviews_train, reviews_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=43)
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(reviews_train)
X_test = vectorizer.transform(reviews_test)
model = LogisticRegression(max_iter=1000)
%timeit model.fit(X_train, y_train)

2.54 s ± 369 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


By processing the reviewText data with CountVectorizer we have test amd train sets that contain numerical values assigned to strings. After that, I created and trained LogisticRegression model.

Now, we can look for most positive and negative words according to its sentiment value:

In [66]:
coef = model.coef_.tolist().pop()
indexes_neg = sorted(range(len(coef)), key=lambda x: coef[x])[:10]
indexes_pos = sorted(range(len(coef)), key=lambda x: coef[x])[-10:]
count_vec_negative_words = [vectorizer.get_feature_names()[i] for i in indexes_neg]
count_vec_positive_words = [vectorizer.get_feature_names()[i] for i in indexes_pos]
print(f'Most negative words: {count_vec_negative_words}')
print(f'Most positive words: {count_vec_positive_words}')

Most negative words: ['returned', 'useless', 'something', 'junk', 'spend', 'month', 'crap', 'doesnt', 'disappointed', 'failed']
Most positive words: ['works', 'easy', 'perfectly', 'using', 'needed', 'best', 'perfect', 'nice', 'great', 'little']




In [67]:
coef = model.coef_.tolist().pop()
for i, x in enumerate(count_vec_negative_words+count_vec_positive_words):
    print(f'{x} : {coef[i]}')

returned : 0.019967714969177715
useless : 2.574249787596529e-05
something : 4.141674507952641e-06
junk : 8.32139501419072e-08
spend : 0.0036765509426043815
month : 2.7468123022842894e-07
crap : 0.06653983592746941
doesnt : 1.3884166960476483e-06
disappointed : 8.32139501419072e-08
failed : 1.93460271463079e-05
works : 6.942083480238241e-07
easy : 0.004808003307980032
perfectly : 0.014624376573282782
using : 0.0050089434706300775
needed : 0.0002581402543912615
best : 0.004997305347148677
perfect : 0.018225604807868577
nice : 0.014624376573282782
great : 0.014866617475243621
little : 0.024513660570898564


The results of most positive and negative words contain words that are associated with these sentiments. The results are satysfying.

The model should predict the sentiment of reviews. We can test that by using test data reviews:

In [68]:
sentiment = model.predict(X_test)
sentiment_prob = model.predict_proba(X_test)

neg_sentiment = sorted(range(len(sentiment_prob)), key=lambda x:sentiment_prob[:, 0][x])[-3:]
pos_sentiment = sorted(range(len(sentiment_prob)), key=lambda x:sentiment_prob[:, 1][x])[-3:]
print(f'Most negative reviews indices: {neg_sentiment}')
print(f'Most positive reviews indices: {pos_sentiment}')
reviews_neg = [np.array(df['reviewText'])[x] for x in neg_sentiment]
reviews_pos = [np.array(df['reviewText'])[x] for x in pos_sentiment]
print('-----------------------------------')
print('Most negative reviews:')
for i in range(len(reviews_neg)):
    print(f'Most negative review {i+1}: {reviews_neg[i]}')
    print('-----------------------------------')
print('Most positive reviews:')
for i in range(len(reviews_pos)):
    print(f'Most positive review {i+1}: {reviews_pos[i]}')
    print('-----------------------------------')

Most negative reviews indices: [771, 952, 1219]
Most positive reviews indices: [1327, 2434, 4547]
-----------------------------------
Most negative reviews:
Most negative review 1: ive used pick when i play most of time and i used white color thinnest one i like to use thin ones because i play strumming most of time and this one works perfect very easy to grip and last long one thing to point out is that print on pick fades away very quickly but i do not mind highly recommand this for anyone needs pick
-----------------------------------
Most negative review 2: This is just what it looks like  a very economical strap  The great thing about it is the length  As any other tall person will tell you finding a strap that adjusts to longer than normal can be frustrating  This strap is very long if you need it to be  For the price very hard to beat this  Works well withDunlop Dual Design Straplok System Blacktoo  Overall a good buy for a backup strap or your primary
--------------------------

Comparing positive and negative reviews: most of these reviews are positive sentiment - the cause are the weightings of words used in the review that make such review negative. The results are expected to be as they are given beacause of that. In addition, most of the reviews in dataset are positive sentiment.

In [69]:
acc = model.score(X_test, y_test)
print(f'Accuracy: {acc}')

Accuracy: 0.9517386722866175


The accuracy of the LogisticRegression model is high so the model seems well trained.

Trying different method: TfidVectorizer
TFIDF - Term Frequency-Inverse Document Frequency - it is a technique to quantify a word in documents, generally computing a weight to each word which signifies the importance of word in the document and corpus.

In [70]:
from sklearn.feature_extraction.text import TfidfVectorizer
X = df['reviewText']
y = df['overall']
reviews_train, reviews_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=43)
tvectorizer = TfidfVectorizer()
X_train = tvectorizer.fit_transform(reviews_train)
X_test = tvectorizer.transform(reviews_test)
model_t = LogisticRegression(max_iter=1000)
model_t.fit(X_train, y_train)

In [71]:
sentiment = model_t.predict(X_test)
sentiment_prob = model_t.predict_proba(X_test)

In [72]:
acc_tf = model_t.score(X_test, y_test)
print(f'Accuracy CountVectorizer: {acc}')
print(f'Accuracy TfidtVectorizer: {acc_tf}')

Accuracy CountVectorizer: 0.9517386722866175
Accuracy TfidtVectorizer: 0.9515279241306639


The accuracy of model that uses TfidVectorizer method is similar to the model that used CountVectorizer (Both used LogisticRegression).

Comparing the most positive and negative words to previous results:

In [73]:
coef = model_t.coef_.tolist().pop()
indexes_neg = sorted(range(len(coef)), key=lambda x: coef[x])[:10]
indexes_pos = sorted(range(len(coef)), key=lambda x: coef[x])[-10:]
tfid_negative_words = [vectorizer.get_feature_names()[i] for i in indexes_neg]
tfid_positive_words = [vectorizer.get_feature_names()[i] for i in indexes_pos]
print(f'Most negative words: {tfid_negative_words}')
print(f'Most positive words: {tfid_positive_words}')
# Results given from method that used CountVectorizer
# Most negative words: ['returned', 'useless', 'something', 'junk', 'spend', 'month', 'crap', 'doesnt', 'disappointed', 'failed']
# Most positive words: ['works', 'easy', 'perfectly', 'using', 'needed', 'best', 'perfect', 'nice', 'great', 'little']

Most negative words: ['not', 'but', 'returned', 'was', 'defective', 'too', 'the', 'useless', 'out', 'were']
Most positive words: ['my', 'for', 'and', 'works', 'well', 'good', 'easy', 'nice', 'little', 'great']




As we can see the results are different from the method that used CountVectorizer. It is interesting that there are much more words that originally have no positive or negative sentiment such as 'my' 'for' 'and' 'the'.

Using different models in search of better accuracy/score:

Support vector machine:

In [74]:
from sklearn import svm
X = df['reviewText']
y = df['overall']
reviews_train, reviews_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=43)
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(reviews_train)
X_test = vectorizer.transform(reviews_test)
model_svm = svm.SVC()
%timeit model_svm.fit(X_train, y_train)
sentiment = model_svm.predict(X_test)
acc_svm = model_svm.score(X_test, y_test)
print(f'Model score: {acc_svm}')

4.32 s ± 247 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Model score: 0.9515279241306639


Support Vector Machine with CountVectorizer achieved a bit worse score than LogisticRegression in longer time.

KNeighborsClassifier:

In [75]:
from sklearn.neighbors import KNeighborsClassifier
X = df['reviewText']
y = df['overall']
reviews_train, reviews_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=43)
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(reviews_train)
X_test = vectorizer.transform(reviews_test)
model_kn = KNeighborsClassifier(n_neighbors=10)
%timeit model_kn.fit(X_train, y_train)
sentiment = model_kn.predict(X_test)
acc_kn = model_kn.score(X_test, y_test)
print(f'Model score: {acc_kn}')

794 µs ± 57.2 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
Model score: 0.9511064278187565


KNeighborsClassifier achieved a bit worse score than LogisticRegression, however it took much less time. We could improve the score by increasing the number of neighbors. By looking at the time/efficiency of models it shows us that K neighbors model is the best.