# Unzipping the files

In [None]:
!unzip "/content/drive/MyDrive/Colab Notebooks/txt_reviews.zip" -d "/content/Text"

# Importing required libraries

In [None]:
# Data handling and plotting
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Standard library helpers: regex cleaning, directory listing, timestamp conversion
import re
import os
from datetime import datetime
# NLTK resources used by the text-cleaning function
import nltk
nltk.download("stopwords")  # corpus behind stopwords.words("english")
nltk.download('wordnet')    # data used by WordNetLemmatizer
nltk.download('omw-1.4')    # presumably required by the lemmatizer on this NLTK version - confirm
from nltk.tokenize import word_tokenize  # NOTE(review): not used in the cells visible here
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
# tqdm.pandas() provides the Series.progress_apply used in the cleaning cells
from tqdm.notebook import tqdm
tqdm.pandas()

# Listing the review text files

In [None]:
# os.listdir returns entries in arbitrary, filesystem-dependent order.
# Sorting keeps the row order of everything built from this list reproducible.
file_names = sorted(os.listdir('/content/Text/txt_reviews'))

print('Total number of files in the dataset:', len(file_names))

Total number of files in the dataset: 568454


# Reading the text files and appending their fields to lists

In [None]:
def _field_value(line):
    """Return the value part of a ``Key: value`` line.

    The line is split on the FIRST ':' only, so a value that itself contains
    ':' is kept whole instead of being cut at its first colon. Newline
    characters at either end are removed; other whitespace is left untouched.
    Raises IndexError if the line contains no ':' at all.
    """
    return line.split(":", 1)[1].strip("\n")


ProductId = []
UserId = []
ProfileName = []
HelpfulnessNumerator = []
HelpfulnessDenominator = []
Score = []
Time = []
ReviewSummary = []
ReviewText = []

# Destination list for each of the first nine lines of a review file, in file order.
columns_in_file_order = [
    ProductId,
    UserId,
    ProfileName,
    HelpfulnessNumerator,
    HelpfulnessDenominator,
    Score,
    Time,
    ReviewSummary,
    ReviewText,
]

for file in file_names:
    with open(os.path.join("/content/Text/txt_reviews", file), "r") as f:
        lines = f.readlines()
    # Index explicitly (rather than zip) so a file with fewer than nine lines
    # still fails loudly instead of silently producing lists of unequal length.
    for line_no, column in enumerate(columns_in_file_order):
        column.append(_field_value(lines[line_no]))


# Creating the DataFrame

In [None]:
# Assemble the per-field lists into one table; each list position is one review file.
columns_by_name = {
    "ProductId": ProductId,
    "UserId": UserId,
    "ProfileName": ProfileName,
    "HelpfulnessNumerator": HelpfulnessNumerator,
    "HelpfulnessDenominator": HelpfulnessDenominator,
    "Score": Score,
    "Time": Time,
    "ReviewSummary": ReviewSummary,
    "ReviewText": ReviewText,
}
review_text = pd.DataFrame(columns_by_name)
review_text.head()

Unnamed: 0,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,ReviewSummary,ReviewText
0,B000F2RIQC,A1QRQSAPZJDWOG,Christie A. Phillips,0,0,5,1292889600,Amazing!,This is the absolute best seasoning for chick...
1,B003C5NN8Q,A1KNNOM3EB1BIW,Poodlemom,0,0,5,1325376000,Poodle Party,My two poodles love the Dogswell Mellow Mut f...
2,B000255OIG,A1R87YSXT4SDS4,Robert J. Ruddy,10,10,5,1139270400,Freeze dried liver treats,My dog has never stopped loving these treats ...
3,B000X61Y60,A2XGVBSTZXCMZ9,K. Connolly,0,0,5,1346371200,"Great taste, less sodium",The flavor of this natural sea salt is more n...
4,B006G7XV7A,A2XL93OUH15DUG,Victor Wen,0,1,2,1285545600,Not very good,It's not very tasty. Chewing this beef jerky ...


# Data cleaning steps

In [None]:
# These fields were read from the text files as strings; convert them to numbers.
numeric_columns = ["HelpfulnessNumerator", "HelpfulnessDenominator", "Score", "Time"]
review_text[numeric_columns] = review_text[numeric_columns].apply(pd.to_numeric)

In [None]:
# Helpfulness = percentage of votes that found the review helpful.
# A zero denominator is masked to NaN before dividing, so both 0/0 and n/0
# become NaN (instead of NaN and inf respectively) and are then filled with 0.
helpful_votes_cast = review_text["HelpfulnessDenominator"]
review_text["Helpfulness"] = (
    (review_text["HelpfulnessNumerator"] / helpful_votes_cast.where(helpful_votes_cast != 0) * 100)
    .fillna(0)
    .astype(float)
)

In [None]:
# Time holds Unix epoch seconds. Convert it to a calendar date (midnight timestamps).
# The conversion is done in UTC and vectorised: datetime.fromtimestamp would use the
# machine's local timezone, which can shift the date by a day on a non-UTC host.
review_text["Time"] = pd.to_datetime(review_text["Time"], unit="s").dt.normalize()

In [None]:
# Check column dtypes and non-null counts after the conversions above.
review_text.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 568454 entries, 0 to 568453
Data columns (total 10 columns):
 #   Column                  Non-Null Count   Dtype         
---  ------                  --------------   -----         
 0   ProductId               568454 non-null  object        
 1   UserId                  568454 non-null  object        
 2   ProfileName             568454 non-null  object        
 3   HelpfulnessNumerator    568454 non-null  int64         
 4   HelpfulnessDenominator  568454 non-null  int64         
 5   Score                   568454 non-null  int64         
 6   Time                    568454 non-null  datetime64[ns]
 7   ReviewSummary           568454 non-null  object        
 8   ReviewText              568454 non-null  object        
 9   Helpfulness             568454 non-null  float64       
dtypes: datetime64[ns](1), float64(1), int64(3), object(5)
memory usage: 43.4+ MB


In [None]:
# Count rows that repeat an earlier row in every column.
review_text.duplicated().sum()

287

In [None]:
# Remove fully duplicated rows (first occurrence is kept); review_text is modified in place.
review_text.drop_duplicates(inplace=True)

# Saving into CSV file

In [None]:
# Snapshot of the de-duplicated table with the original 1-5 scores.
# The DataFrame index is written as the first (unnamed) column.
review_text.to_csv("text1.csv")

# Mapping Score to positive (1) and negative (0) sentiment; neutral reviews (Score 3) are dropped

In [None]:
# Ratings below 3 become 0 (negative) and ratings above 3 become 1 (positive).
# Both masks are taken from the original ratings before anything is overwritten.
is_negative = review_text["Score"] < 3
is_positive = review_text["Score"] > 3
review_text.loc[is_negative, "Score"] = 0
review_text.loc[is_positive, "Score"] = 1

In [None]:
# Keep only rows whose Score is not 3.
review_text = review_text.loc[review_text["Score"].ne(3)]

In [None]:
# Persist the labelled table to the Drive folder that the modelling cells load it from.
# Writing only to the working directory left the later read_csv pointing at a file
# this notebook never produced. The index is kept as the first column, as before.
review_text.to_csv("/content/drive/MyDrive/Colab Notebooks/text.csv")

In [None]:
# Shared instances used by preprocess1: stemmer for flag == "stem", lemmatizer otherwise.
stemmer=PorterStemmer()
lemmatizer=WordNetLemmatizer()

# Reading the CSV file

In [None]:
# Load the labelled review table from Drive.
labelled_reviews_csv = "/content/drive/MyDrive/Colab Notebooks/text.csv"
df = pd.read_csv(labelled_reviews_csv)

# Defining the function for all data cleaning steps under preprocess

In [None]:
_STOPWORDS_CACHE = {}


def _english_stopwords():
    """Return the English stop words as a frozenset, loading them from NLTK only once."""
    if "english" not in _STOPWORDS_CACHE:
        _STOPWORDS_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOPWORDS_CACHE["english"]


def preprocess1(text,flag):
    """Clean one piece of review text and return it as a space-joined string.

    Steps: replace every non-letter character with a space, lower-case,
    split on whitespace, drop English stop words, then normalise each token.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (e.g. NaN/None for an empty field
        after a CSV round trip) is treated as an empty string.
    flag : str
        "stem" applies the module-level PorterStemmer; any other value
        applies the module-level WordNetLemmatizer.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces ("" if nothing is left).
    """
    if not isinstance(text, str):
        # Missing values would otherwise make re.sub raise TypeError.
        text = ""
    sentence = re.sub("[^a-zA-Z]", " ", text).lower()
    # Fetch the stop words once; the original re-read the list for every token.
    stop_words = _english_stopwords()
    clean_tokens = [t for t in sentence.split() if t not in stop_words]
    if flag == "stem":
        clean_tokens = [stemmer.stem(word) for word in clean_tokens]
    else:
        clean_tokens = [lemmatizer.lemmatize(word) for word in clean_tokens]
    return " ".join(clean_tokens)

# Defining the input features and the target

In [None]:
# Features: the two free-text columns. Target: the binary sentiment label.
feature_columns = ["ReviewText", "ReviewSummary"]
X = df[feature_columns]
y = df["Score"]

In [None]:
# Repeats the tqdm setup from the import cell; tqdm.pandas() provides the
# Series.progress_apply used by the cleaning cells below.
from tqdm.notebook import tqdm
tqdm.pandas()

# Splitting the Data into train and test

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=50,
)

# Applying the data cleaning steps for the text data

In [None]:
# Clean the training review bodies (lemmatisation variant of preprocess1).
X_train["ReviewText"] = X_train["ReviewText"].progress_apply(
    lambda review: preprocess1(review, "lemma")
)


  0%|          | 0/420442 [00:00<?, ?it/s]

In [None]:
# Clean the training review summaries (lemmatisation variant of preprocess1).
X_train["ReviewSummary"] = X_train["ReviewSummary"].progress_apply(
    lambda summary: preprocess1(summary, "lemma")
)

  0%|          | 0/420442 [00:00<?, ?it/s]

In [None]:
# Clean the test review bodies with the same settings as the training data.
X_test["ReviewText"] = X_test["ReviewText"].progress_apply(
    lambda review: preprocess1(review, "lemma")
)


  0%|          | 0/105111 [00:00<?, ?it/s]

In [None]:
# Clean the test review summaries with the same settings as the training data.
X_test["ReviewSummary"] = X_test["ReviewSummary"].progress_apply(
    lambda summary: preprocess1(summary, "lemma")
)

  0%|          | 0/105111 [00:00<?, ?it/s]

In [None]:
# Save the cleaned splits. The frames are named X_train / X_test in this notebook;
# the previously referenced X_train1 / X_test1 are never defined and raised NameError.
X_train.to_csv("X_train_clean.csv")
X_test.to_csv("X_test_clean.csv")

# Extracting the numerical features from the text data

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# TF-IDF features for the review body: the vocabulary is learned from the
# training rows only and then applied unchanged to the test rows.
review_column = "ReviewText"
vocab = TfidfVectorizer()
X_train_trans1 = vocab.fit_transform(X_train[review_column])
X_test_trans1 = vocab.transform(X_test[review_column])

In [None]:
# TF-IDF features for the summary. A separate vectorizer is used so the one fitted
# on ReviewText keeps its vocabulary; refitting the shared `vocab` here silently
# replaced it and made the ReviewText cell unsafe to re-run afterwards.
summary_vocab = TfidfVectorizer()
X_train_trans2 = summary_vocab.fit_transform(X_train["ReviewSummary"])
X_test_trans2 = summary_vocab.transform(X_test["ReviewSummary"])

# Concatenating the document-term matrices (DTMs) using SciPy

In [None]:
import scipy

In [None]:
from scipy.sparse import hstack

# Place the review-text and summary features side by side (same rows, more columns).
# hstack returns COO by default; CSR is requested once here so every estimator
# fitted below does not have to convert the matrix again.
X_train_trans = hstack((X_train_trans1, X_train_trans2), format="csr")

<420442x25973 sparse matrix of type '<class 'numpy.float64'>'
	with 1181544 stored elements in Compressed Sparse Row format>

In [None]:
# Same column layout as the training matrix; CSR avoids a format conversion on every predict call.
X_test_trans = hstack((X_test_trans1, X_test_trans2), format="csr")

# LogisticRegression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# With the default iteration limit the solver stopped before converging
# ("TOTAL NO. of ITERATIONS REACHED LIMIT"), so the limit is raised.
log = LogisticRegression(max_iter=1000)
log.fit(X_train_trans, y_train)
y_test_pred = log.predict(X_test_trans)
metrics.accuracy_score(y_test, y_test_pred)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.9525454043820343

# GradientBoostingClassifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import metrics

# Seeded (same seed as the train/test split) so the reported accuracy is reproducible.
GB = GradientBoostingClassifier(random_state=50)
GB.fit(X_train_trans, y_train)
y_test_pred = GB.predict(X_test_trans)
metrics.accuracy_score(y_test, y_test_pred)

0.8805358145198885

# RandomForestClassifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

# Random forests bootstrap rows and sample features at random; seed them
# (same seed as the train/test split) so the reported accuracy is reproducible.
Ran = RandomForestClassifier(random_state=50)
Ran.fit(X_train_trans, y_train)
y_test_pred = Ran.predict(X_test_trans)
metrics.accuracy_score(y_test, y_test_pred)

0.936685979583488

# AdaBoostClassifier

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn import metrics

# Seeded (same seed as the train/test split) so the reported accuracy is reproducible.
Ada = AdaBoostClassifier(random_state=50)
Ada.fit(X_train_trans, y_train)
y_test_pred = Ada.predict(X_test_trans)
metrics.accuracy_score(y_test, y_test_pred)

0.8830189038254798

# KNeighborsClassifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

# k-nearest-neighbours with default settings, scored on the held-out rows.
Knn = KNeighborsClassifier().fit(X_train_trans, y_train)
y_test_pred = Knn.predict(X_test_trans)
metrics.accuracy_score(y_test, y_test_pred)

0.908553814538916

# DecisionTreeClassifier

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

# Tie-breaking between equally good splits is random; seed the tree
# (same seed as the train/test split) so the reported accuracy is reproducible.
dt = DecisionTreeClassifier(random_state=50)
dt.fit(X_train_trans, y_train)
y_test_pred = dt.predict(X_test_trans)
metrics.accuracy_score(y_test, y_test_pred)

0.9202557296572195

# Conclusion
- Comparing the test accuracy scores above, LogisticRegression scores highest (about 0.953), so it is the best-fitting model among those tried for sentiment prediction.