In [48]:
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
# nltk.download()

In [49]:
df = pd.read_csv(r'Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products_May19.csv')

In [50]:
# remove NaN
dataAfter = df.dropna(subset=["reviews.rating"])

# convert the rating to integer
dataAfter["reviews.rating"] = dataAfter["reviews.rating"].astype(int)
df = dataAfter

In [51]:
import numpy as np

# implement sentiment analysis
analyser = SentimentIntensityAnalyzer()
def get_sentiment(text):
    sent = analyser.polarity_scores(text)
    max = np.max(sent)
    if sent['neg'] > sent['neu'] and sent['neg'] > sent['pos']:
        return "NEGATIVE"
    if sent['neu'] > sent['pos'] and sent['neu'] > sent['neg']:
        return "NEUTRAL"
    if sent['pos'] > sent['neu'] and sent['pos'] > sent['neg']:
        return "POSITIVE"
    # return np.array([sent['neg'], sent['neu'], sent['pos']])


df['reviews.sentiment'] = df['reviews.text'].apply(get_sentiment)
df['reviews.sent_text'] = df['reviews.sentiment'] + " " + df['reviews.text']


In [52]:
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
    return [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)]

stop_words = set(stopwords.words('english'))
stop_words.add('review')
stop_words.add('that')

# to lowercase
df['reviews.text'] = df['reviews.text'].str.lower()

# Lemmatization
df['reviews.text'] = df['reviews.text'].apply(lemmatize_text)

# Remove stop words
df['reviews.text'] = df['reviews.text'].apply(lambda x: [item for item in x if item not in stop_words])

# Rejoing the words into a sentence
df['reviews.text'] = df['reviews.text'].str.join(' ')

In [53]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier 
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# print(df['reviews.sent_text'])

cv = CountVectorizer(binary=True)
cv.fit(df['reviews.text'])
X = cv.transform(df['reviews.text'])
# X = list(df['reviews.sentiment'])
y = df['reviews.rating']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [54]:
regressor = RandomForestClassifier(n_estimators=100, random_state=0)
regressor.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [55]:
y_pred = regressor.predict(X_test)

print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

[[ 115    1    3    3   75]
 [  14   47    2    8   55]
 [   2    8  133    9   92]
 [   1    0    3  691  433]
 [   8    1    4   59 3900]]
              precision    recall  f1-score   support

           1       0.82      0.58      0.68       197
           2       0.82      0.37      0.51       126
           3       0.92      0.55      0.68       244
           4       0.90      0.61      0.73      1128
           5       0.86      0.98      0.91      3972

    accuracy                           0.86      5667
   macro avg       0.86      0.62      0.70      5667
weighted avg       0.87      0.86      0.85      5667

0.8621845773778013


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

temp = df.sample(len(df))

cv = CountVectorizer(binary=True)
cv.fit(df['reviews.text'])
X = cv.transform(df['reviews.text'])
X_test = cv.transform(df['reviews.text'])

target = temp['reviews.rating']

X_train, X_val, y_train, y_val = train_test_split(
    X, target, train_size = 0.75
)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, lr.predict(X_val))))

Accuracy for C=0.01: 0.6950444726810674
Accuracy for C=0.05: 0.6915148948185798


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Accuracy for C=0.25: 0.6817732599181138


In [None]:
final_model = LogisticRegression(C=0.01)
final_model.fit(X, target)
print ("Final Accuracy: %s" 
       % accuracy_score(target, final_model.predict(X_test)))

feature_to_coef = {
    word: coef for word, coef in zip(
        cv.get_feature_names(), final_model.coef_[0]
    )
}

In [None]:
for best_positive in sorted(
    feature_to_coef.items(), 
    key=lambda x: x[1], 
    reverse=True)[:5]:
    print (best_positive)
   

In [None]:
for best_negative in sorted(
    feature_to_coef.items(), 
    key=lambda x: x[1])[:5]:
    print (best_negative)

In [None]:
df["reviews.rating"].value_counts()/len(df)   #percentage of ratings

In [None]:
reviews = df.copy()

In [None]:
reviews.groupby("asins")["name"].unique()

In [None]:
%matplotlib inline 

In [None]:
fig = plt.figure(figsize=(16,10))
ax1 = plt.subplot(211)
#ax2 = plt.subplot(212, sharex = ax1)
reviews["asins"].value_counts().plot(kind="bar", ax=ax1, title="ASIN Frequency")

In [None]:
corr_matrix = reviews.corr()
corr_matrix
# Here we can analyze reviews.ratings with asins

In [None]:
counts = reviews["asins"].value_counts().to_frame()
counts.head()

In [None]:
avg_rating = reviews.groupby("asins")["reviews.rating"].mean().to_frame()
avg_rating.head()

In [None]:
table = counts.join(avg_rating)
table.head(30)

In [None]:
plt.scatter("asins", "reviews.rating", data=table)
table.corr()
