# Text Analysis for Women's E-Commerce Clothing Reviews

## Libraries and Settings

In [1]:
import os
import tqdm
import pandas as pd

import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem import PorterStemmer

%config Completer.use_jedi = False

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ivannardini/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Variables

In [30]:
# Input (interim) and output (processed) folders, both under ../data.
INTERIM_DIR, PROCESSED_DIR = (
    os.path.join(os.pardir, 'data', stage) for stage in ('interim', 'processed')
)
# Language name for text processing.
LANG = 'english'
# Fixed seed value.
RANDOM_STATE = 8

## Helpers

In [3]:
def load_data(path, filename):
    """Read the CSV file ``filename`` located in ``path`` into a DataFrame.

    The file is parsed with ``pd.read_csv`` using its default settings.
    """
    return pd.read_csv(os.path.join(path, filename))
    
def remove_sw(words_list, lang="english"):
    """Return the tokens of ``words_list`` that are not stop words.

    Parameters
    ----------
    words_list : iterable of str
        Tokens to filter. Matching is exact, so casing matters.
    lang : str, default "english"
        Name of the NLTK stop-word list to filter against.

    Returns
    -------
    list of str
        The surviving tokens, in their original order.
    """
    # A set gives constant-time membership checks; a list would be scanned
    # from the start for every single token.
    stop_words = set(stopwords.words(lang))
    return [word for word in words_list if word not in stop_words]

def stemmer(words_list):
    """Return the Porter stem of every token in ``words_list``, in order."""
    stem = PorterStemmer().stem
    return list(map(stem, words_list))

def save_data(df, path, filename):
    """Write ``df`` as a CSV file named ``filename`` inside ``path``.

    ``to_csv`` is called with its defaults, so the DataFrame index is
    written as the first (unnamed) column. Returns ``None``.
    """
    df.to_csv(os.path.join(path, filename))

## Load data

In [20]:
# Read the three interim splits in the order train, test, val.
train, test, val = (
    load_data(INTERIM_DIR, split_file)
    for split_file in ('train.csv', 'test.csv', 'val.csv')
)

In [21]:
# Preview the first rows of the training split right after loading.
train.head()

Unnamed: 0,clothing_id,review_text,recommended_ind
0,867,I have been admiring this piece for awhile and...,1
1,1081,This dress looks great on me. it gives a slend...,1
2,862,I love this! i agree with previous post that s...,1
3,1081,Not sure why this dress was once backordered? ...,0
4,1020,"Unlike the other reviewers, i did not have any...",1


In [22]:
# NOTE(review): this cell repeats the previous one verbatim — possibly meant
# to preview `test` or `val` instead; confirm or remove.
train.head()

Unnamed: 0,clothing_id,review_text,recommended_ind
0,867,I have been admiring this piece for awhile and...,1
1,1081,This dress looks great on me. it gives a slend...,1
2,862,I love this! i agree with previous post that s...,1
3,1081,Not sure why this dress was once backordered? ...,0
4,1020,"Unlike the other reviewers, i did not have any...",1


## Data Preparation

In [23]:
# Each step stores its result in a new column so every stage can be inspected.
# NOTE(review): the same six steps are repeated for `test` and `val`;
# a shared helper function would remove the duplication.

# lowercase the raw review text
train['review_lower'] = train['review_text'].str.lower()
# remove punctuation; the translation table is built once, not once per row.
# Characters are deleted rather than replaced by a space, so words separated
# only by punctuation are joined (e.g. "cute...but" -> "cutebut").
train['review_nopct'] = train['review_lower'].str.translate(str.maketrans('', '', string.punctuation))
# remove digits
train['review_nodg'] = train['review_nopct'].str.translate(str.maketrans('', '', string.digits))
# split into word tokens
# NOTE(review): word_tokenize relies on NLTK tokenizer data, but this notebook
# only downloads 'stopwords' — confirm the tokenizer data is installed.
train['review_word_tokens'] = train['review_nodg'].apply(word_tokenize)
# remove stopwords
train['review_no_sw'] = train['review_word_tokens'].apply(remove_sw)
# stemming
train['review_stem'] = train['review_no_sw'].apply(stemmer)

In [24]:
# Preview the training split with the preprocessing columns added.
train.head()

Unnamed: 0,clothing_id,review_text,recommended_ind,review_lower,review_nopct,review_nodg,review_word_tokens,review_no_sw,review_stem
0,867,I have been admiring this piece for awhile and...,1,i have been admiring this piece for awhile and...,i have been admiring this piece for awhile and...,i have been admiring this piece for awhile and...,"[i, have, been, admiring, this, piece, for, aw...","[admiring, piece, awhile, finally, decided, pu...","[admir, piec, awhil, final, decid, purchas, ha..."
1,1081,This dress looks great on me. it gives a slend...,1,this dress looks great on me. it gives a slend...,this dress looks great on me it gives a slende...,this dress looks great on me it gives a slende...,"[this, dress, looks, great, on, me, it, gives,...","[dress, looks, great, gives, slender, appearan...","[dress, look, great, give, slender, appear, hi..."
2,862,I love this! i agree with previous post that s...,1,i love this! i agree with previous post that s...,i love this i agree with previous post that sa...,i love this i agree with previous post that sa...,"[i, love, this, i, agree, with, previous, post...","[love, agree, previous, post, say, much, bette...","[love, agre, previou, post, say, much, better,..."
3,1081,Not sure why this dress was once backordered? ...,0,not sure why this dress was once backordered? ...,not sure why this dress was once backordered i...,not sure why this dress was once backordered i...,"[not, sure, why, this, dress, was, once, backo...","[sure, dress, backordered, big, chested, basic...","[sure, dress, backord, big, chest, basic, fall..."
4,1020,"Unlike the other reviewers, i did not have any...",1,"unlike the other reviewers, i did not have any...",unlike the other reviewers i did not have any ...,unlike the other reviewers i did not have any ...,"[unlike, the, other, reviewers, i, did, not, h...","[unlike, reviewers, problem, sizing, fit, leng...","[unlik, review, problem, size, fit, length, sk..."


In [25]:
# Same preprocessing steps as for the training split, applied to `test`.
# lowercase the raw review text
test['review_lower'] = test['review_text'].str.lower()
# remove punctuation; the translation table is built once, not once per row.
# NOTE(review): deleting punctuation joins words separated only by it
# (e.g. "cute...but" -> "cutebut"); consider mapping punctuation to a space.
test['review_nopct'] = test['review_lower'].str.translate(str.maketrans('', '', string.punctuation))
# remove digits
test['review_nodg'] = test['review_nopct'].str.translate(str.maketrans('', '', string.digits))
# split into word tokens
test['review_word_tokens'] = test['review_nodg'].apply(word_tokenize)
# remove stopwords
test['review_no_sw'] = test['review_word_tokens'].apply(remove_sw)
# stemming
test['review_stem'] = test['review_no_sw'].apply(stemmer)

In [26]:
# Preview the test split with the preprocessing columns added.
test.head()

Unnamed: 0,clothing_id,review_text,recommended_ind,review_lower,review_nopct,review_nodg,review_word_tokens,review_no_sw,review_stem
0,927,This looks so cute...but don't be fooled. it i...,0,this looks so cute...but don't be fooled. it i...,this looks so cutebut dont be fooled it is so ...,this looks so cutebut dont be fooled it is so ...,"[this, looks, so, cutebut, dont, be, fooled, i...","[looks, cutebut, dont, fooled, poofy, look, re...","[look, cutebut, dont, fool, poofi, look, real,..."
1,820,This is the most unflattering peplum top i hav...,0,this is the most unflattering peplum top i hav...,this is the most unflattering peplum top i hav...,this is the most unflattering peplum top i hav...,"[this, is, the, most, unflattering, peplum, to...","[unflattering, peplum, top, ever, tried, fit, ...","[unflatt, peplum, top, ever, tri, fit, perfect..."
2,936,"Let me start this review by saying, 75% of my ...",0,"let me start this review by saying, 75% of my ...",let me start this review by saying 75 of my cl...,let me start this review by saying of my clot...,"[let, me, start, this, review, by, saying, of,...","[let, start, review, saying, clothing, retaile...","[let, start, review, say, cloth, retail, love,..."
3,862,Love the petroleum blue\nunique twist design\n...,1,love the petroleum blue\nunique twist design\n...,love the petroleum blue\nunique twist design\n...,love the petroleum blue\nunique twist design\n...,"[love, the, petroleum, blue, unique, twist, de...","[love, petroleum, blue, unique, twist, design,...","[love, petroleum, blue, uniqu, twist, design, ..."
4,1061,I ordered a small and the top part was huge ye...,0,i ordered a small and the top part was huge ye...,i ordered a small and the top part was huge ye...,i ordered a small and the top part was huge ye...,"[i, ordered, a, small, and, the, top, part, wa...","[ordered, small, top, part, huge, yet, bottom,...","[order, small, top, part, huge, yet, bottom, l..."


In [27]:
# Same preprocessing steps as for the training split, applied to `val`.
# lowercase the raw review text
val['review_lower'] = val['review_text'].str.lower()
# remove punctuation; the translation table is built once, not once per row
val['review_nopct'] = val['review_lower'].str.translate(str.maketrans('', '', string.punctuation))
# remove digits
val['review_nodg'] = val['review_nopct'].str.translate(str.maketrans('', '', string.digits))
# split into word tokens
val['review_word_tokens'] = val['review_nodg'].apply(word_tokenize)
# remove stopwords
val['review_no_sw'] = val['review_word_tokens'].apply(remove_sw)
# stemming
val['review_stem'] = val['review_no_sw'].apply(stemmer)

In [28]:
# Preview the validation split with the preprocessing columns added.
val.head()

Unnamed: 0,clothing_id,review_text,recommended_ind,review_lower,review_nopct,review_nodg,review_word_tokens,review_no_sw,review_stem
0,1066,"I tried these on in the store, and they are su...",1,"i tried these on in the store, and they are su...",i tried these on in the store and they are sup...,i tried these on in the store and they are sup...,"[i, tried, these, on, in, the, store, and, the...","[tried, store, super, cute, run, small, typica...","[tri, store, super, cute, run, small, typic, w..."
1,1055,I took the advise of a reviewer and sized down...,0,i took the advise of a reviewer and sized down...,i took the advise of a reviewer and sized down...,i took the advise of a reviewer and sized down...,"[i, took, the, advise, of, a, reviewer, and, s...","[took, advise, reviewer, sized, xs, could, tak...","[took, advis, review, size, xs, could, take, w..."
2,1066,I ordered these in my usual pilcro petite size...,0,i ordered these in my usual pilcro petite size...,i ordered these in my usual pilcro petite size...,i ordered these in my usual pilcro petite size...,"[i, ordered, these, in, my, usual, pilcro, pet...","[ordered, usual, pilcro, petite, size, impossi...","[order, usual, pilcro, petit, size, imposs, bu..."
3,863,"Great top! i'm 5'7"", 135lbs and normally wear ...",1,"great top! i'm 5'7"", 135lbs and normally wear ...",great top im 57 135lbs and normally wear eithe...,great top im lbs and normally wear either an ...,"[great, top, im, lbs, and, normally, wear, eit...","[great, top, im, lbs, normally, wear, either, ...","[great, top, im, lb, normal, wear, either, xs,..."
4,872,"I have more of an athletic figure, and the sle...",0,"i have more of an athletic figure, and the sle...",i have more of an athletic figure and the slee...,i have more of an athletic figure and the slee...,"[i, have, more, of, an, athletic, figure, and,...","[athletic, figure, sleeves, tend, come, closer...","[athlet, figur, sleev, tend, come, closer, bod..."


## Store processed data

In [31]:
# Splits and the file names they are written under, paired by position.
dfs = [train, test, val]
fnames = ['train.csv', 'test.csv', 'val.csv']

# makedirs(..., exist_ok=True) keeps this cell re-runnable: os.mkdir raises
# FileExistsError once the folder exists and cannot create missing parents.
os.makedirs(PROCESSED_DIR, exist_ok=True)
# NOTE(review): the token columns hold Python lists, which CSV stores as text;
# confirm that downstream readers parse them back into lists.
for df, fname in zip(dfs, fnames):
    save_data(df=df, path=PROCESSED_DIR, filename=fname)

# Comments