# Dress reviews

In [14]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

First I load in the data set, select only the reviews for dresses, and drop all the rows with missing or NaN values.

In [22]:
df = pd.read_csv('dress_reviews.csv')
df = df.loc[(df['Class Name'] == 'Dresses')]
df = df.dropna()
df.head()

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
5,5,1080,49,Not for the very petite,"I love tracy reese dresses, but this one is no...",2,0,4,General,Dresses,Dresses
8,8,1077,24,Flattering,I love this dress. i usually get an xs but it ...,5,1,0,General,Dresses,Dresses
9,9,1077,34,Such a fun dress!,"I'm 5""5' and 125 lbs. i ordered the s petite t...",5,1,0,General,Dresses,Dresses
10,10,1077,53,Dress looks like it's made of cheap material,Dress runs small esp where the zipper area run...,3,0,14,General,Dresses,Dresses


In [27]:
df['PosNeg'] = 'x'
df.loc[(df['Rating'] <= 3), 'PosNeg'] = 'Negative'
df.loc[(df['Rating'] >= 4), 'PosNeg'] = 'Positive'
df

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name,PosNeg
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses,Negative
5,5,1080,49,Not for the very petite,"I love tracy reese dresses, but this one is no...",2,0,4,General,Dresses,Dresses,Negative
8,8,1077,24,Flattering,I love this dress. i usually get an xs but it ...,5,1,0,General,Dresses,Dresses,Positive
9,9,1077,34,Such a fun dress!,"I'm 5""5' and 125 lbs. i ordered the s petite t...",5,1,0,General,Dresses,Dresses,Positive
10,10,1077,53,Dress looks like it's made of cheap material,Dress runs small esp where the zipper area run...,3,0,14,General,Dresses,Dresses,Negative
...,...,...,...,...,...,...,...,...,...,...,...,...
23478,23478,1104,32,Unflattering,I was surprised at the positive reviews for th...,1,0,0,General Petite,Dresses,Dresses,Negative
23481,23481,1104,34,Great dress for many occasions,I was very happy to snag this dress at such a ...,5,1,0,General Petite,Dresses,Dresses,Positive
23483,23483,1104,31,"Cute, but see through","This fit well, but the top was very see throug...",3,0,1,General Petite,Dresses,Dresses,Negative
23484,23484,1084,28,"Very cute dress, perfect for summer parties an...",I bought this dress for a wedding i have this ...,3,1,2,General,Dresses,Dresses,Negative


The dresses have 75% positive reviews so it's a decent webshop.

In [32]:
df['PosNeg'].value_counts(normalize=True)

Positive    0.753119
Negative    0.246881
Name: PosNeg, dtype: float64

First, let's start with the code to generate a document-feature matrix

In [28]:
text = df['Review Text'].values.astype('U') #Taking the text from the df. We need to convert it to Unicode
vect = CountVectorizer(stop_words='english') #Create the CV object, with English stop words
vect = vect.fit(text) #We fit the model with the words from the review text
docu_feat = vect.transform(text) # make a matrix

Now, we will use the Naïve Bayes classifier from `sklearn`.

In [30]:
from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB() #create the model
X = docu_feat #the document-feature matrix is the X matrix
y = df['PosNeg'] #creating the y vector

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) #split the data and store it

nb = nb.fit(X_train, y_train) #fit the model X=features, y=character

#Evaluate the model
y_test_p = nb.predict(X_test)
nb.score(X_test, y_test)

0.8542183622828784

The accuracy is 59% if we try to predict the rating itself (1-5) which is not great.
When we predict categorical values (Positive/Negative) we get 85% accuracy which is good.

In [45]:
incorrect = pd.DataFrame(y_test_p != y_test)
incorrect = incorrect[incorrect.PosNeg.astype(str).str.contains('True')]
print(incorrect)

       PosNeg
2956     True
1207     True
9765     True
8413     True
15259    True
...       ...
13688    True
16840    True
6017     True
3305     True
18198    True

[235 rows x 1 columns]


In [46]:
# indices = [i for i in range(len(y_test)) if y_test[i] != y_pred[i]]
# wrong_predictions = df.iloc[indices,:]

KeyError: 0

In [34]:
cm = confusion_matrix(y_test, y_test_p)
cm = pd.DataFrame(cm, index=['Positive', 'Negative'], columns=['Positive', 'Negative'])
cm

Unnamed: 0,Positive,Negative
Positive,238,168
Negative,67,1139
