# This notebook applies NLP-based sentiment analysis to the text of the reviews in our Amazon reviews dataset.

## We will explain each step and its results.

We import some of the main libraries we will work with:

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

Now, let us load the dataset and inspect it:

In [None]:
# Read the raw reviews and show a small preview rather than the whole frame.
amazon_reviews = pd.read_csv("Amazon Reviews 1.csv")
amazon_reviews.head()

In [None]:
amazon_reviews.info()

In [None]:
# DataFrame.dropna returns a new frame instead of modifying the caller, so the
# result must be assigned back for the incomplete rows to actually be removed.
amazon_reviews = amazon_reviews.dropna(
    subset=["reviews.text", "reviews.title", "reviews.rating"]
)

amazon_reviews.describe()

## The data is unevenly distributed across the ratings; we will address this by subsampling the over-represented ratings:

In [None]:
def random_select(df, n, rating, column, random_state=42):
    """Return ``n`` randomly sampled rows of ``df`` whose ``column`` equals ``rating``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to sample from.
    n : int
        Number of rows to draw.
    rating
        Value that ``df[column]`` must equal for a row to be eligible.
    column : str
        Name of the column that holds the rating.
    random_state : int, optional
        Seed forwarded to ``DataFrame.sample``. The default (42) reproduces the
        previously hard-coded seed, so existing calls give the same rows.

    Returns
    -------
    pandas.DataFrame
        The sampled rows, keeping their original index labels.
    """
    matching_rows = df[df[column] == rating]
    return matching_rows.sample(n=n, random_state=random_state)

# Draw a fixed-size random subsample for each of the 3-, 4- and 5-star ratings.
df3, df4, df5 = (
    random_select(amazon_reviews, sample_size, stars, 'reviews.rating')
    for stars, sample_size in ((3, 400), (4, 350), (5, 1000))
)

In [None]:
df3.describe()

In [None]:
df4.describe()

In [None]:
df5.describe()

In [None]:
# Keep every 1- and 2-star review; these ratings are not subsampled.
low_rating_mask = amazon_reviews['reviews.rating'].isin([1, 2])
df12 = amazon_reviews[low_rating_mask]

In [None]:
df12.describe()

Now, we observe that the columns we will work with are id (or name, they are correlated) and reviews.text. We will create a new database with these two and add a column based on the reviews.rating :

- 5 : good (2)
- 3, 4 : neutral (1)
- 1, 2 : bad (0)

### Now let us create the final database we will actually work with:

In [None]:
# Stack the 1-2 star rows and the sampled 3-, 4- and 5-star rows into one frame;
# ignore_index=True gives the combined frame a fresh 0..n-1 index.
amazon = pd.concat([df12, df3, df4, df5], ignore_index=True)
amazon.describe()

## We now need to create the label:

In [None]:

def label(r):
    """Map a star rating to a sentiment class.

    5 -> 2 (good); 3 or 4 -> 1 (neutral); any other value -> 0 (bad).
    """
    if r == 5.0:
        return 2
    return 1 if r in (3.0, 4.0) else 0

# Derive the sentiment class (0, 1 or 2) of every review from its star rating.
amazon['label'] = amazon['reviews.rating'].apply(label)


# This will be the database for our Sentiment Analysis:

In [None]:
# Keep only the identifier, the review text, the rating and the derived label.
amazon = amazon[['id', 'reviews.text', 'reviews.rating', 'label']]
amazon.describe()

## Let us observe how our database behaves:

In [None]:
# The labels are the integers 0, 1 and 2, so use unit-wide bins centred on them.
min_bin, max_bin = -0.5, 2.5
bins = np.arange(min_bin, max_bin + 1, 1)
data = amazon['label']
counts, edges = np.histogram(data, bins=bins)

# Bars are narrower than the bins and offset so that each sits centred in its bin.
bar_width = 0.75
bar_lefts = edges[:-1] + 0.5 * (1 - bar_width)
plt.bar(bar_lefts, counts, width=bar_width, align='edge', color='#9b00d9')

plt.xlabel('Value')
plt.ylabel('Frequency')
plt.title('Histogram with spacing between bars')
plt.show()

## Now, let us split our dataset into the three categories we have defined

In [None]:
# One sub-frame per sentiment class: 2 = good, 1 = neutral, 0 = bad.
label_column = amazon['label']
good = amazon[label_column == 2]
neutral = amazon[label_column == 1]
bad = amazon[label_column == 0]

### In order to better visualise these, we can plot some wordclouds

In [None]:
!pip install wordcloud
from wordcloud import WordCloud

In [None]:
# Join every "good" review into one string and render its word cloud.
goods = list(map(str, good['reviews.text'].tolist()))
goods_sentence = " ".join(goods)
plt.figure(figsize=(20,20))
good_cloud = WordCloud().generate(goods_sentence)
plt.imshow(good_cloud)

In [None]:
# Join every "neutral" review into one string and render its word cloud.
neutrals = list(map(str, neutral['reviews.text'].tolist()))
neutrals_sentence = " ".join(neutrals)
plt.figure(figsize=(20,20))
neutral_cloud = WordCloud().generate(neutrals_sentence)
plt.imshow(neutral_cloud)

In [None]:
# Join every "bad" review into one string and render its word cloud.
bads = list(map(str, bad['reviews.text'].tolist()))
bads_sentence = " ".join(bads)
plt.figure(figsize=(20,20))
bad_cloud = WordCloud().generate(bads_sentence)
plt.imshow(bad_cloud)

# Now it's time for some data cleaning!

## Punctuation

In [None]:
import string
string.punctuation

In [None]:
# For good
# A deletion table passed to str.translate removes every character found in
# string.punctuation in one pass, replacing the manual character loop.
punctuation_table = str.maketrans('', '', string.punctuation)
goods = [review.translate(punctuation_table) for review in goods]

In [None]:
# For neutral
# A deletion table passed to str.translate removes every character found in
# string.punctuation in one pass, replacing the manual character loop.
punctuation_table = str.maketrans('', '', string.punctuation)
neutrals = [review.translate(punctuation_table) for review in neutrals]

In [None]:
# For bad
# A deletion table passed to str.translate removes every character found in
# string.punctuation in one pass, replacing the manual character loop.
punctuation_table = str.maketrans('', '', string.punctuation)
bads = [review.translate(punctuation_table) for review in bads]

## Stopwords

In [None]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stopwords.words('english')

In [None]:
# For good
# Disabled: the code below sits inside a triple-quoted string, so it is never
# executed. Stop words are removed inside data_cleaning() instead.
"""
g = []
for s in goods:
    sentence = [word for word in s.split() if word not in stopwords.words('english')]
    s = ' '.join(sentence)
    g.append(s)

goods = g
"""

In [None]:
# For neutral
# Disabled: the code below sits inside a triple-quoted string, so it is never
# executed. Stop words are removed inside data_cleaning() instead.
# NOTE: if this is ever re-enabled, `g=n.append(s)` rebinds g to None because
# list.append returns None.
"""
n = []
for s in neutrals:
    sentence = [word for word in s.split() if word not in stopwords.words('english')]
    s = ' '.join(sentence)
    g=n.append(s)

neutrals = n
"""

In [None]:
# For bad
# Disabled: the code below sits inside a triple-quoted string, so it is never
# executed. Stop words are removed inside data_cleaning() instead.
"""
b = []
for s in bads:
    sentence = [word for word in s.split() if word not in stopwords.words('english')]
    s = ' '.join(sentence)
    b.append(s)

bads = b
"""

## Now, let us put it all together into a pipeline:

In [None]:
_ENGLISH_STOPWORDS = None  # filled on first use by data_cleaning()


def data_cleaning(text):
    """Strip punctuation and English stop words from ``text``.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    list of str
        The whitespace-separated tokens of ``text`` after every character in
        ``string.punctuation`` has been removed, excluding tokens whose
        lower-cased form is in NLTK's English stop-word list. Tokens keep
        their original casing.
    """
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        # Load the stop-word list once and keep it as a set. The previous code
        # evaluated stopwords.words('english') again for every single token.
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))

    no_punctuation = text.translate(str.maketrans('', '', string.punctuation))
    return [
        word for word in no_punctuation.split()
        if word.lower() not in _ENGLISH_STOPWORDS
    ]

# Now we can perform tokenization!

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# amazon_clean = amazon['reviews.text'].apply(data_cleaning)

In [None]:
# print(amazon_clean[11])

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
vectorizer = CountVectorizer(analyzer = data_cleaning, dtype = np.uint8)

In [None]:
amazon_countvectorizer = vectorizer.fit_transform(amazon['reviews.text'])

In [None]:
# Densify only the first few rows for a preview; converting the whole sparse
# count matrix to a dense array just to print it is wasteful.
print(amazon_countvectorizer[:5].toarray())

In [None]:
amazon_countvectorizer.shape

In [None]:
X = pd.DataFrame(amazon_countvectorizer.toarray())

In [None]:
X

# We will now use Naive Bayes in order to continue:

In [None]:
y = amazon['label']

In [None]:
X.shape

In [None]:
y.shape

In [None]:
from sklearn.model_selection import train_test_split
# Fix the seed so the split (and every score computed from it) is reproducible,
# and stratify on y so train and test keep the same label proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial Naive Bayes classifier on the training split.
NB_classifier = MultinomialNB()
NB_classifier.fit(X_train, y_train)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Predicting the Test set results
y_predict_test = NB_classifier.predict(X_test)
cm = confusion_matrix(y_test, y_predict_test)
# fmt='d' prints the cell counts as plain integers; seaborn's default '.2g'
# annotation format renders larger counts in scientific notation.
ax = sns.heatmap(cm, annot=True, fmt='d', cmap="PiYG")
# confusion_matrix(y_true, y_pred) puts true labels on rows, predictions on columns.
ax.set(xlabel='Predicted label', ylabel='True label');

In [None]:
print(classification_report(y_test, y_predict_test))

# More Methods:

In [None]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# confusion_matrix expects (y_true, y_pred). The arguments used to be swapped,
# which transposed this matrix relative to the Naive Bayes one.
cm = confusion_matrix(y_test, y_pred)
# fmt='d' prints the counts as plain integers instead of scientific notation.
ax = sns.heatmap(cm, annot=True, fmt='d', cmap="coolwarm")
ax.set(xlabel='Predicted label', ylabel='True label')

print(classification_report(y_test, y_pred))

# At this point we can observe that the accuracy is worryingly low: the model works best on the extreme labels but has great trouble detecting the neutral ones. We will therefore train on the whole dataset to see whether the accuracy improves - at least for the extreme labels.

In [None]:
amazon_reviews = pd.read_csv("Amazon Reviews 1.csv")
# DataFrame.dropna returns a new frame instead of modifying the caller, so the
# result must be assigned back for the incomplete rows to actually be removed.
amazon_reviews = amazon_reviews.dropna(
    subset=["reviews.text", "reviews.title", "reviews.rating"]
)
amazon_reviews.shape

In [None]:
# Take an explicit copy: later cells assign columns on `amazon`, and assigning
# into a slice of amazon_reviews triggers pandas' SettingWithCopyWarning.
amazon = amazon_reviews[['id', 'reviews.text', 'reviews.rating']].copy()
amazon.describe()

## In case there are NaN entries in the review text, we replace them with empty strings:

In [None]:
amazon['reviews.text'] = amazon['reviews.text'].fillna('')

In [None]:
def label(r):
    """Map a star rating to a sentiment class.

    5 -> 2 (good); 3 or 4 -> 1 (neutral); any other value -> 0 (bad).
    """
    if r == 5.0:
        return 2
    return 1 if r in (3.0, 4.0) else 0

# NOTE(review): label() returns 0 for every rating that is not 3, 4 or 5, so a
# row whose rating is NaN would be labelled "bad" here -- confirm such rows
# have been removed before this point.
amazon['label'] = amazon['reviews.rating'].apply(label)
amazon.describe()

In [None]:
# The labels are the integers 0, 1 and 2, so use unit-wide bins centred on them.
min_bin, max_bin = -0.5, 2.5
bins = np.arange(min_bin, max_bin + 1, 1)
data = amazon['label']
counts, edges = np.histogram(data, bins=bins)

# Bars are narrower than the bins and offset so that each sits centred in its bin.
bar_width = 0.75
bar_lefts = edges[:-1] + 0.5 * (1 - bar_width)
plt.bar(bar_lefts, counts, width=bar_width, align='edge', color='#9b00d9')

plt.xlabel('Value')
plt.ylabel('Frequency')
plt.title('Histogram with spacing between bars')
plt.show()

In [None]:
vectorizer = CountVectorizer(analyzer = data_cleaning, dtype = np.uint8)
amazon_countvectorizer = vectorizer.fit_transform(amazon['reviews.text'])
# Densify only the first few rows for a preview; converting the whole sparse
# count matrix to a dense array just to print it is wasteful.
print(amazon_countvectorizer[:5].toarray())

In [None]:
# NOTE(review): toarray() expands the count matrix of the whole dataset into a
# dense array before it is wrapped in a DataFrame; this may need a lot of
# memory -- TODO confirm it fits on the target machine.
X = pd.DataFrame(amazon_countvectorizer.toarray())
X

In [None]:
y = amazon['label']

In [None]:
from sklearn.model_selection import train_test_split
# Fix the seed so the split (and every score computed from it) is reproducible,
# and stratify on y so train and test keep the same label proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial Naive Bayes classifier on the training split.
NB_classifier = MultinomialNB()
NB_classifier.fit(X_train, y_train)

In [None]:
# Predicting the Test set results
y_predict_test = NB_classifier.predict(X_test)
cm = confusion_matrix(y_test, y_predict_test)
# fmt='d' prints the cell counts as plain integers; seaborn's default '.2g'
# annotation format renders larger counts in scientific notation.
ax = sns.heatmap(cm, annot=True, fmt='d', cmap="PiYG")
# confusion_matrix(y_true, y_pred) puts true labels on rows, predictions on columns.
ax.set(xlabel='Predicted label', ylabel='True label');

In [None]:
print(classification_report(y_test, y_predict_test))

## For more methods:

In [None]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# confusion_matrix expects (y_true, y_pred). The arguments used to be swapped,
# which transposed this matrix relative to the Naive Bayes one.
cm = confusion_matrix(y_test, y_pred)
# fmt='d' prints the counts as plain integers instead of scientific notation.
ax = sns.heatmap(cm, annot=True, fmt='d', cmap="coolwarm")
ax.set(xlabel='Predicted label', ylabel='True label')

print(classification_report(y_test, y_pred))