In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import re
import matplotlib.pyplot as plt

In [2]:
df=pd.read_csv("reviews.csv")
df

Unnamed: 0.1,Unnamed: 0,review,rating
0,0,Good product.. I thought its not comfort to me...,5
1,1,Good quality and quick delivery. Material is s...,5
2,2,Nice product from Butterfly used a good qualit...,4
3,3,It's Very useful and Time saver. But it's ver...,5
4,4,I was ordering this for my motherShe loved it ...,5
...,...,...,...
6031,9975,Well and good... Handle like arrangement may h...,4
6032,9976,Wow awesome product 😍😍READ MORE,1
6033,9977,Good.... Nothing mixer for batter then...READ ...,3
6034,9978,It goodREAD MORE,3


In [3]:
df.drop("Unnamed: 0",axis=1,inplace=True)

# Text preprocessing

In [4]:
# importing neccesary libraries
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("omw-1.4")
    
lemmatizer=WordNetLemmatizer()

[nltk_data] Downloading package stopwords to C:\Users\mounika
[nltk_data]     katla\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\mounika
[nltk_data]     katla\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\mounika
[nltk_data]     katla\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [5]:
def preprocessing(text):
    
    
    # removing READ MORE word
    sentance=re.sub("READ MORE","",text) 
    
    # removing urls
    sentance=re.sub("https?://\S+|www\.S\+","",sentance) 
    
    # removing html tags
    sentace=re.sub(r"<.*?>","",sentance) 
    
    # removing special characters and numbers
    sentance=re.sub("[^A-Za-z]"," ",sentance) 
    
    # convert the sentance into uniform case
    sentance=sentance.lower()
    
    # convert the sentance into word token
    tokens=sentance.split()
    
    # removing stopwords
    clean_tokens=[t for t in tokens if not t in stopwords.words("english")]
    
    # lemmatize the words
    clean_tokens=[lemmatizer.lemmatize(word) for word in clean_tokens]
    
    # return the lemmatize the word in sentence format
    return " ".join(clean_tokens)

In [6]:
df["lemsentances"]=df["review"].apply(preprocessing)

In [7]:
df

Unnamed: 0,review,rating,lemsentances
0,Good product.. I thought its not comfort to me...,5,good product thought comfort really comfort cook
1,Good quality and quick delivery. Material is s...,5,good quality quick delivery material stronger ...
2,Nice product from Butterfly used a good qualit...,4,nice product butterfly used good quality plast...
3,It's Very useful and Time saver. But it's ver...,5,useful time saver sharp blade beawar cleaning ...
4,I was ordering this for my motherShe loved it ...,5,ordering mothershe loved lot work fluently cut...
...,...,...,...
6031,Well and good... Handle like arrangement may h...,4,well good handle like arrangement may help bet...
6032,Wow awesome product 😍😍READ MORE,1,wow awesome product
6033,Good.... Nothing mixer for batter then...READ ...,3,good nothing mixer batter
6034,It goodREAD MORE,3,good


## spliting the data into train and test

In [8]:
X=df[["lemsentances"]]
y=df[["rating"]]

In [9]:
import sklearn 
from sklearn.model_selection import train_test_split

In [10]:
xTrain,xTest,yTrain,yTest=train_test_split(X,y,
                                          test_size=0.25)

# creating a model for text data using CountVectorizer

## CountVectorizer for Bag of Unigrams

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

countvectorizer=CountVectorizer()

In [12]:
countXTrain=countvectorizer.fit_transform(xTrain["lemsentances"])

In [13]:
countXTrain

<4527x2123 sparse matrix of type '<class 'numpy.int64'>'
	with 19369 stored elements in Compressed Sparse Row format>

In [14]:
countXTest=countvectorizer.transform(xTest["lemsentances"])

In [15]:
countXTest

<1509x2123 sparse matrix of type '<class 'numpy.int64'>'
	with 6077 stored elements in Compressed Sparse Row format>

### creating a Logistic Regression Model

In [16]:
from sklearn.linear_model import LogisticRegression
LR=LogisticRegression()

In [None]:
LR.fit(countXTrain,yTrain)

  y = column_or_1d(y, warn=True)


In [None]:
YTrainpred=LR.predict(countXTrain)
YTestpred=LR.predict(countXTest)

In [None]:
from sklearn.metrics import accuracy_score as acs

In [None]:
acs(yTrain,YTrainpred),acs(yTest,YTestpred)

## CountVectorizer for Bag of Uni-Bi grams

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

countvectorizer=CountVectorizer(ngram_range=(1,2))

In [None]:
countXTrain=countvectorizer.fit_transform(xTrain["lemsentances"])

In [None]:
countXTrain

In [None]:
countXTest=countvectorizer.transform(xTest["lemsentances"])

In [None]:
countXTest

### creating a Logistic Regression Model

In [None]:
from sklearn.linear_model import LogisticRegression
LR=LogisticRegression()

In [None]:
LR.fit(countXTrain,yTrain)

In [None]:
YTrainpred=LR.predict(countXTrain)
YTestpred=LR.predict(countXTest)

In [None]:
acs(yTrain,YTrainpred),acs(yTest,YTestpred)

# creating a model for text data using TfidfVectorizer

## TfidfVectorizer for Bag of Unigrams

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidfVec=TfidfVectorizer()

tfidfXTrain=tfidfVec.fit_transform(xTrain["lemsentances"])

tfdifXTest=tfidfVec.transform(xTest["lemsentances"])


### creating a Logistic Regression Model

In [None]:
from sklearn.linear_model import LogisticRegression
LR=LogisticRegression()
LR.fit(tfidfXTrain,yTrain)


In [None]:
YTrainpred=LR.predict(tfidfXTrain)
YTestpred=LR.predict(tfdifXTest)

In [None]:
acs(yTrain,YTrainpred),acs(yTest,YTestpred)

## TfidfVectorizer for Bag of Uni-Bi grams

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidfVec=TfidfVectorizer(ngram_range=(1,2))

tfidfXTrain=tfidfVec.fit_transform(xTrain["lemsentances"])

tfdifXTest=tfidfVec.transform(xTest["lemsentances"])


### creating a Logistic Regression Model

In [None]:
from sklearn.linear_model import LogisticRegression
LR=LogisticRegression()
LR.fit(tfidfXTrain,yTrain)


In [None]:
YTrainpred=LR.predict(tfidfXTrain)
YTestpred=LR.predict(tfdifXTest)

In [None]:
acs(yTrain,YTrainpred),acs(yTest,YTestpred)

# Conclusion