In [96]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

In [97]:
#Read the csv file downloaded from Kaggle
df = pd.read_csv("Womens Clothing E-Commerce Reviews.csv")
df.head(15)

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses
5,5,1080,49,Not for the very petite,"I love tracy reese dresses, but this one is no...",2,0,4,General,Dresses,Dresses
6,6,858,39,Cagrcoal shimmer fun,I aded this in my basket at hte last mintue to...,5,1,1,General Petite,Tops,Knits
7,7,858,39,"Shimmer, surprisingly goes with lots","I ordered this in carbon for store pick up, an...",4,1,4,General Petite,Tops,Knits
8,8,1077,24,Flattering,I love this dress. i usually get an xs but it ...,5,1,0,General,Dresses,Dresses
9,9,1077,34,Such a fun dress!,"I'm 5""5' and 125 lbs. i ordered the s petite t...",5,1,0,General,Dresses,Dresses


In [98]:
#Drop Columns that has no high impact on the sentiment analysis process
df.drop(['Unnamed: 0','Age','Title','Division Name','Department Name','Class Name'],axis =1,inplace=True)
df.head()

Unnamed: 0,Clothing ID,Review Text,Rating,Recommended IND,Positive Feedback Count
0,767,Absolutely wonderful - silky and sexy and comf...,4,1,0
1,1080,Love this dress! it's sooo pretty. i happene...,5,1,4
2,1077,I had such high hopes for this dress and reall...,3,0,0
3,1049,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0
4,847,This shirt is very flattering to all due to th...,5,1,6


In [99]:
df.shape

(23486, 5)

In [100]:
#Check how many null rows we have in the entire dataset
df.isna().sum()

Clothing ID                  0
Review Text                845
Rating                       0
Recommended IND              0
Positive Feedback Count      0
dtype: int64

In [101]:
#Drop null values in the specified column
df.dropna(subset=['Review Text'], inplace = True)

In [102]:
df.isna().sum()

Clothing ID                0
Review Text                0
Rating                     0
Recommended IND            0
Positive Feedback Count    0
dtype: int64

In [None]:
#Apply preprocess data cleaning and transformation on sentences in the Review Text column

In [103]:
df['Review Text'] = df['Review Text'].str.lower()    

In [104]:
df['Review Text'] = df['Review Text'].replace(to_replace= r'\d+', value = '', regex=True)

In [105]:
df['Review Text']= df['Review Text'].replace(to_replace= r'[^\w\s]', value='', regex=True)

In [108]:
df['Review Text']= df['Review Text'].replace(to_replace= r'<[^>]+>', value='', regex=True)

In [109]:
df['Review Text']= df['Review Text'].replace(to_replace= r'http\S+', value='', regex=True)

In [110]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [111]:
sw = set(stopwords.words('english'))

In [112]:
sw

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [114]:
df['Review Text']= df['Review Text'].apply(lambda x: " ".join([word for word in word_tokenize(x) if word not in sw]))

In [116]:
L = WordNetLemmatizer()

In [117]:
df['Review Text']= df['Review Text'].apply(lambda x: " ".join([L.lemmatize(word) for word in word_tokenize(x)]))

In [119]:
df

Unnamed: 0,Clothing ID,Review Text,Rating,Recommended IND,Positive Feedback Count
0,767,absolutely wonderful silky sexy comfortable,4,1,0
1,1080,love dress sooo pretty happened find store im ...,5,1,4
2,1077,high hope dress really wanted work initially o...,3,0,0
3,1049,love love love jumpsuit fun flirty fabulous ev...,5,1,0
4,847,shirt flattering due adjustable front tie perf...,5,1,6
...,...,...,...,...,...
23481,1104,happy snag dress great price easy slip flatter...,5,1,0
23482,862,reminds maternity clothes soft stretchy shiny ...,3,1,0
23483,1104,fit well top see never would worked im glad ab...,3,0,1
23484,1084,bought dress wedding summer cute unfortunately...,3,1,2


In [None]:
#Split the data as dependent and independet  with test size = 20%

In [120]:
x = df['Review Text']
y = df['Recommended IND']

In [121]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2)

In [None]:
#Count Vectorization on top of "x" data which is the column that contain the review text as the machine accept only numeric values

In [123]:
vec = CountVectorizer()
x_train = vec.fit_transform(x_train).toarray()
x_test = vec.transform(x_test)

In [None]:
#Define the model with fitting

In [124]:
model = MultinomialNB()

In [125]:
model.fit(x_train,y_train)

In [127]:
y_pred = model.predict(x_test)

In [133]:
x_train.shape

(18112, 14984)

In [134]:
y_train.shape

(18112,)

In [135]:
x_test.shape

(4529, 14984)

In [136]:
y_test.shape

(4529,)

In [137]:
y_pred

array([1, 1, 0, ..., 1, 1, 1], dtype=int64)

In [None]:
#Calculating the model mean accuracy 

In [139]:
model.score(x_test,y_test)

0.8812099801280636

In [145]:
model.predict(vec.transform(['This product was faraway from expectations, i will not try it again']))

array([0], dtype=int64)

In [147]:
model.predict_proba(vec.transform(['This product was faraway from expectations, i will not try it again']))

array([[0.56823352, 0.43176648]])