# women Clothing Review 
This is a Women’s Clothing E-Commerce dataset revolving around the reviews written by customers. 


In [1]:
import pandas as pd

In [2]:
reviews = pd.read_csv('Womens Clothing E-Commerce Reviews.csv')
reviews.head()

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses


In [3]:
print(reviews.shape)

(23486, 11)


In [4]:
#checking for missing values and dropping them
reviews.isna().sum()

Unnamed: 0                    0
Clothing ID                   0
Age                           0
Title                      3810
Review Text                 845
Rating                        0
Recommended IND               0
Positive Feedback Count       0
Division Name                14
Department Name              14
Class Name                   14
dtype: int64

In [5]:
reviews.dropna(inplace=True)
reviews.isna().sum()

Unnamed: 0                 0
Clothing ID                0
Age                        0
Title                      0
Review Text                0
Rating                     0
Recommended IND            0
Positive Feedback Count    0
Division Name              0
Department Name            0
Class Name                 0
dtype: int64

In [6]:
reviews.reset_index(inplace=True)
x=reviews['Review Text']
y=reviews['Recommended IND']

In [7]:
x.head()

0    I had such high hopes for this dress and reall...
1    I love, love, love this jumpsuit. it's fun, fl...
2    This shirt is very flattering to all due to th...
3    I love tracy reese dresses, but this one is no...
4    I aded this in my basket at hte last mintue to...
Name: Review Text, dtype: object

In [8]:
y.head()

0    0
1    1
2    1
3    0
4    1
Name: Recommended IND, dtype: int64

### Text Preprocessing 
* removing punctuation
* lower casing

### Text Normalization--Think of this step as converting human readble language into a form that is machine readable.
* Tokenization
* stopword removal
* Stemming

In [9]:
import nltk
import re
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to C:\Users\Krishna
[nltk_data]     Vardhan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
ps=PorterStemmer()
corpus=[] #storing the text after preprocesing in corpus 
for i in range (0,len(x)):
    review = re.sub('[^a-zA-Z]', ' ',x[i])
    review = review.lower() 
    review = review.split()
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

['high hope dress realli want work initi order petit small usual size found outrag small small fact could zip reorder petit medium ok overal top half comfort fit nice bottom half tight layer sever somewhat cheap net layer imo major design flaw net layer sewn directli zipper c',
 'love love love jumpsuit fun flirti fabul everi time wear get noth great compliment',
 'shirt flatter due adjust front tie perfect length wear leg sleeveless pair well cardigan love shirt',
 'love traci rees dress one petit feet tall usual wear p brand dress pretti packag lot dress skirt long full overwhelm small frame stranger alter shorten narrow skirt would take away embellish garment love color idea style work return dress',
 'ade basket hte last mintu see would look like person store pick went teh darkler color pale hte color realli gorgeou turn mathc everythi tri prefectli littl baggi hte xs hte msallet size bummer petit decid jkeep though said matvehd everyth ejan pant skirt waa tri kept oop',
 'order ca

### Feature Extraction -- we cannot work with text directly when using machine learning algoriths.Instead, we need to convert the text to numbers 
* TF-IDF stands for Term Frequency-Inverse Document Frequency. 
* Term Frequency: This summarizes how often a given word appears within a document.
* Inverse Document Frequency: This downscales words that appear a lot across documents.


In [12]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
X = cv.fit_transform(corpus)

In [13]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf = TfidfTransformer()

In [14]:
x1 = tfidf.fit_transform(X)

In [15]:
x1.shape

(19662, 8897)

In [16]:
# splitting the data 
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x1, y, test_size = 0.2, random_state = 0)

### fitting pipeline of models 

In [17]:
# Imporing all required lobraries
from sklearn.model_selection import cross_val_score

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
from sklearn.model_selection import KFold 

In [18]:
# validation  options and evaluation metric
num_folds = 10
#num_instances = len(X_train)
seed = 7
scoring = 'accuracy'


### Model Pipeline

In [19]:
# Spot Check Algorithms
models = []
models.append(('LR', LogisticRegression()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('NB', BernoulliNB()))
models.append(('SVM', SVC()))
models.append(('RF', RandomForestClassifier()))


In [25]:
# Finding best model
results = []
names = []
for name, model in models:
    kfold = KFold(n_splits=num_folds, random_state=seed)
    cv_results = cross_val_score(model, x_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)


LR: 0.882637 (0.007320)
KNN: 0.842139 (0.007232)
NB: 0.872719 (0.006152)
SVM: 0.817407 (0.008503)
RF: 0.849704 (0.007349)


Logistic Regression has high mean value 

In [24]:
import warnings 
warnings.filterwarnings('ignore')

In [32]:
#fitting the model 
lr = LogisticRegression()
lr.fit(x_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [33]:
#estimate accuracy on train data 
pred = lr.predict(x_train)
print(accuracy_score(y_train,pred))
print(confusion_matrix(y_train,pred))

0.904507597431496
[[ 1669  1203]
 [  299 12558]]


In [34]:
#estimate accuracy on test data 
pred1 = lr.predict(x_test)
print(accuracy_score(y_test,pred1))
print(confusion_matrix(y_test,pred1))

0.8853292651919654
[[ 365  338]
 [ 113 3117]]


### we can see that train data has 90 % accuracy and test data has 88% accuracy , since train and test data accuracy are more or less similar we conclude that the obtained model is generalised model.