<a href="https://colab.research.google.com/github/HoseinBakhshian/Amazon-DataSet-Sentiment-analysis-LogisticRegression/blob/main/LogisticRegression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
# Attach Google Drive to the Colab VM; the dataset is read from under this mount point.
drive.mount(mountpoint='/content/drive')

In [None]:
import numpy as np
import pandas as pd
import sklearn as skl

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import re
import nltk
# Fetch every NLTK resource the preprocessing step needs.
# 'punkt_tab' is the tokenizer data that recent NLTK releases load for
# word_tokenize; 'punkt' is kept for older releases that still use it.
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet'):
    nltk.download(nltk_resource)
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
#load the tab-separated review dump from Google Drive; malformed rows are skipped, not raised
data = pd.read_csv(
    '/content/drive/MyDrive/amazon_reviews.txt',
    sep='\t',
    on_bad_lines='skip',
)

#only the review text and its star rating are used from here on
kept_columns = ['review_body', 'star_rating']
data = data[kept_columns]

In [None]:
#derive a sentiment label from the star rating:
#  below 3 -> "Negetive", above 3 -> "Positive", exactly 3 -> "Neutral"
#rows matching none of the conditions keep the empty-string label
#(the label spelling "Negetive" is kept as-is; the saved report output uses it)
rating = data['star_rating']
data['Sentiment'] = np.select(
    [rating < 3, rating > 3, rating == 3],
    ["Negetive", "Positive", "Neutral"],
    default='',
)

In [None]:
#bar chart of how many reviews fall into each sentiment class
sns.countplot(data=data, x='Sentiment')

In [None]:
#define a preprocessing method
def text_Preprocessing(text):
  """Normalize, stop-word-filter and lemmatize a collection of review strings.

  Each review is lower-cased, stripped of digits and e-mail-like tokens,
  reduced to ASCII letters separated by single spaces, tokenized with
  ``word_tokenize``, filtered against NLTK's English stop-word list and
  lemmatized with ``WordNetLemmatizer`` (default part of speech).

  Args:
    text: an iterable of ``str`` reviews (e.g. a pandas Series). A single
      ``str`` is treated as one review instead of being iterated
      character by character.

  Returns:
    list[str]: one cleaned, space-joined string per input review, in the
    same order as the input.
  """
  # A bare string would otherwise be iterated one character at a time.
  if isinstance(text, str):
    text = [text]

  stop_words = set(stopwords.words('english'))   #set gives O(1) membership tests
  lemmatizer = WordNetLemmatizer()

  lem_reviews = []
  for raw_review in text:
    review = raw_review.lower()                        #convert text to lower case
    review = re.sub(r'\d+', '', review)                #remove digits
    review = re.sub(r'\S+@\S+', '', review)            #remove e-mail-like tokens
    # NOTE(review): this pattern only drops one word character followed by
    # whitespace at the very start of the review. If stripping punctuation
    # (r'[^\w\s]') was intended, confirm before changing it; the pattern is
    # kept unchanged here so results stay the same.
    review = re.sub(r'^\w\s', '', review)
    review = re.sub(r'[^A-Za-z]+', ' ', review)        #collapse every non-letter run into one space
    review = review.strip()                            #remove leading/trailing spaces

    # Tokenize once and reuse the tokens for both stop-word filtering and
    # lemmatization instead of joining and re-tokenizing the review.
    tokens = [word for word in word_tokenize(review) if word not in stop_words]
    lem_reviews.append(" ".join(map(lemmatizer.lemmatize, tokens)))
  return lem_reviews

In [None]:
#convert review_body to str; missing reviews become '' first, because
#astype(str) alone would turn NaN into the literal text "nan" and feed it to the model as a word
data.review_body = data.review_body.fillna('').astype(str)

#select review_body column and apply preprocessing method to each row
b=data['review_body']
data['review_body']=text_Preprocessing(b)

In [None]:
# Bag-of-words features: CountVectorizer maps each cleaned review to a sparse
# vector of per-word occurrence counts. The vocabulary is learned from every
# review in `data`, i.e. before the train/validation/test split below.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(data['review_body'])
Y = data['Sentiment']

In [None]:
#hold out 20% of the rows for testing, then take 20% of the remainder for validation
#(both splits use random_state=42 so they are reproducible)
X_main, X_test, Y_main, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)
X_train, X_val, Y_train, Y_val = train_test_split(
    X_main, Y_main, test_size=0.2, random_state=42
)

In [None]:
#fit a logistic-regression classifier with default hyper-parameters on the training split
regression = LogisticRegression()
regression.fit(X=X_train, y=Y_train)

In [None]:
#predict the validation data
regression_pred = regression.predict(X_val)
#accuracy_score is documented as (y_true, y_pred): ground truth first, predictions second
regression_acc = accuracy_score(Y_val, regression_pred)
print("validation accuracy: {:.2f}%".format(regression_acc*100))

validation accuracy: 85.84%


In [None]:
#tabulate the labels currently held in regression_pred
#(the validation-set predictions when the cells are run in order)
df = pd.DataFrame(data=regression_pred)
df

In [None]:
#predict the test data
regression_pred = regression.predict(X_test)
#accuracy_score is documented as (y_true, y_pred): ground truth first, predictions second
regression_acc = accuracy_score(Y_test, regression_pred)
print("test accuracy: {:.2f}%".format(regression_acc*100))

test accuracy: 85.76%


In [None]:
#tabulate the labels currently held in regression_pred
#(the test-set predictions when the cells are run in order)
df = pd.DataFrame(data=regression_pred)
df

In [None]:
# per-class precision, recall and f1 on the test split, printed as a text table
report = classification_report(y_true=Y_test, y_pred=regression_pred)
print(report)

              precision    recall  f1-score   support

    Negetive       0.74      0.67      0.71     27554
     Neutral       0.42      0.12      0.18     16062
    Positive       0.89      0.97      0.93    148425

    accuracy                           0.86    192041
   macro avg       0.68      0.59      0.61    192041
weighted avg       0.83      0.86      0.83    192041

