# Text Analysis for Women's E-Commerce Clothing Reviews

## Libraries and Settings

In [1]:
import os
import tqdm
import pandas as pd
from pathlib import Path

import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

%config Completer.use_jedi = False

[nltk_data] Downloading package punkt to /Users/inardini/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/inardini/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/inardini/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Variables

In [2]:
# Name of the column that holds the raw review text.
TEXT_VAR = 'review_text'
# Language name; presumably meant for the NLTK stop-word lookup -- it is not
# referenced by the cells visible here (TODO confirm).
LANG = 'english'
# Presumably a random seed; not referenced by the cells visible here.
RANDOM_STATE = 8
# Input (interim) and output (processed) data folders, resolved relative to
# the parent of the current working directory.
INTERIM_DIR, PROCESSED_DIR = (
    os.path.join(os.pardir, 'data', stage) for stage in ('interim', 'processed')
)

## Helpers

In [3]:
def load_data(path, filename):
    """Read the CSV file ``filename`` located in ``path`` into a DataFrame.

    Args:
        path: Directory containing the file.
        filename: Name of the CSV file inside ``path``.

    Returns:
        The ``pandas.DataFrame`` produced by ``pd.read_csv`` with default
        options.
    """
    return pd.read_csv(os.path.join(path, filename))

def remove_pct(s):
    """Return ``s`` with every character of ``string.punctuation`` deleted.

    Characters are removed, not replaced by a space, so words separated only
    by punctuation are fused together (``"cute...but"`` -> ``"cutebut"``).
    """
    # Mapping a character to None in a translation table deletes it.
    drop_table = str.maketrans(dict.fromkeys(string.punctuation))
    return s.translate(drop_table)

def remove_dg(s):
    """Return ``s`` with the ASCII digits ``0``-``9`` deleted.

    Only the characters in ``string.digits`` are removed; surrounding
    whitespace is left untouched.
    """
    # str.translate drops any code point that maps to None.
    digit_ordinals = {ord(digit): None for digit in string.digits}
    return s.translate(digit_ordinals)
    
def remove_sw(words_list, lang="english"):
    """Drop stop words from a list of tokens.

    Args:
        words_list: Iterable of string tokens. Matching is exact, so tokens
            are only removed if they equal a stop word as stored in the NLTK
            list.
        lang: Name of the NLTK stop-word list to use. Defaults to
            ``"english"``, which was previously hard-coded.

    Returns:
        A new list with the tokens that are not stop words, in their original
        order.
    """
    # A frozenset gives O(1) membership tests instead of scanning the whole
    # stop-word list once per token.
    stop_words = frozenset(stopwords.words(lang))
    return [word for word in words_list if word not in stop_words]

def stemmer(words_list):
    """Return a list with the Porter stem of each token in ``words_list``.

    A fresh ``PorterStemmer`` is created on every call; token order is kept.
    """
    stem = PorterStemmer().stem
    return list(map(stem, words_list))

def lemmer(words_list):
    """Return a list with the WordNet lemma of each token in ``words_list``.

    Each token is lemmatized with ``WordNetLemmatizer.lemmatize`` using its
    default arguments; token order is kept.
    """
    lemmatize = WordNetLemmatizer().lemmatize
    return list(map(lemmatize, words_list))
    
def get_data_prepared(df, text_var):
    """Clean and tokenize the review text of ``df``.

    The text in column ``text_var`` is lower-cased, stripped of punctuation
    and digits, tokenized, filtered for stop words, stemmed and finally
    lemmatized. The resulting token lists are stored in a new
    ``review_text_processed`` column.

    Args:
        df: DataFrame containing at least the columns ``clothing_id``,
            ``recommended_ind`` and ``text_var``. It is not modified; the
            function works on a copy.
        text_var: Name of the column holding the raw review text.

    Returns:
        A DataFrame with the columns ``clothing_id``, ``text_var``,
        ``review_text_processed`` and ``recommended_ind``.
    """
    df = df.copy()

    # Missing reviews are read by pandas as NaN (a float), which has no
    # ``.lower()``; treat them as empty text so they yield an empty token list
    # instead of raising.
    processed = df[text_var].fillna('').astype(str).str.lower()

    # Order matters: punctuation and digits are stripped before tokenizing,
    # and stop words are removed before stemming so they still match the
    # stop-word list.
    pipeline = (
        remove_pct,     # remove punctuation
        remove_dg,      # remove digits
        word_tokenize,  # split into tokens
        remove_sw,      # remove stop words
        stemmer,        # stemming
        lemmer,         # lemmatization
    )
    for step in pipeline:
        processed = processed.apply(step)
    df['review_text_processed'] = processed

    # Select the raw-text column through ``text_var`` rather than the
    # hard-coded name 'review_text', so the parameter is honoured end to end.
    return df[['clothing_id', text_var, 'review_text_processed', 'recommended_ind']]

def save_data(df, path, filename):
    """Write ``df`` as CSV to ``filename`` inside directory ``path``.

    The DataFrame index is not written. The directory must already exist.
    """
    df.to_csv(os.path.join(path, filename), index=False)

## Load data

In [4]:
# Load the three interim splits; every file is named '<split>_interim.csv'.
train, test, val = (
    load_data(INTERIM_DIR, f'{split}_interim.csv')
    for split in ('train', 'test', 'val')
)

In [5]:
# Preview the first rows of the training split.
train.head()

Unnamed: 0,clothing_id,review_text,recommended_ind
0,867,I have been admiring this piece for awhile and...,1
1,1081,This dress looks great on me. it gives a slend...,1
2,862,I love this! i agree with previous post that s...,1
3,1081,Not sure why this dress was once backordered? ...,0
4,1020,"Unlike the other reviewers, i did not have any...",1


In [6]:
# NOTE(review): this repeats the preceding cell's train.head(); possibly a
# different split was meant to be previewed here -- confirm.
train.head()

Unnamed: 0,clothing_id,review_text,recommended_ind
0,867,I have been admiring this piece for awhile and...,1
1,1081,This dress looks great on me. it gives a slend...,1
2,862,I love this! i agree with previous post that s...,1
3,1081,Not sure why this dress was once backordered? ...,0
4,1020,"Unlike the other reviewers, i did not have any...",1


## Data Preparation

In [7]:
# Run the text-cleaning pipeline on the training split.
train_prepared = get_data_prepared(train, TEXT_VAR)

In [8]:
# Preview the processed training split, including the token-list column.
train_prepared.head()

Unnamed: 0,clothing_id,review_text,review_text_processed,recommended_ind
0,867,I have been admiring this piece for awhile and...,"[admir, piec, awhil, final, decid, purchas, ha...",1
1,1081,This dress looks great on me. it gives a slend...,"[dress, look, great, give, slender, appear, hi...",1
2,862,I love this! i agree with previous post that s...,"[love, agre, previou, post, say, much, better,...",1
3,1081,Not sure why this dress was once backordered? ...,"[sure, dress, backord, big, chest, basic, fall...",0
4,1020,"Unlike the other reviewers, i did not have any...","[unlik, review, problem, size, fit, length, sk...",1


In [9]:
# Run the text-cleaning pipeline on the test split.
test_prepared = get_data_prepared(test, TEXT_VAR)

In [10]:
# Preview the processed test split, including the token-list column.
test_prepared.head()

Unnamed: 0,clothing_id,review_text,review_text_processed,recommended_ind
0,927,This looks so cute...but don't be fooled. it i...,"[look, cutebut, dont, fool, poofi, look, real,...",0
1,820,This is the most unflattering peplum top i hav...,"[unflatt, peplum, top, ever, tri, fit, perfect...",0
2,936,"Let me start this review by saying, 75% of my ...","[let, start, review, say, cloth, retail, love,...",0
3,862,Love the petroleum blue\nunique twist design\n...,"[love, petroleum, blue, uniqu, twist, design, ...",1
4,1061,I ordered a small and the top part was huge ye...,"[order, small, top, part, huge, yet, bottom, l...",0


In [11]:
# Run the text-cleaning pipeline on the validation split.
val_prepared = get_data_prepared(val, TEXT_VAR)

In [12]:
# Preview the processed validation split, including the token-list column.
val_prepared.head()

Unnamed: 0,clothing_id,review_text,review_text_processed,recommended_ind
0,1066,"I tried these on in the store, and they are su...","[tri, store, super, cute, run, small, typic, w...",1
1,1055,I took the advise of a reviewer and sized down...,"[took, advis, review, size, x, could, take, wi...",0
2,1066,I ordered these in my usual pilcro petite size...,"[order, usual, pilcro, petit, size, imposs, bu...",0
3,863,"Great top! i'm 5'7"", 135lbs and normally wear ...","[great, top, im, lb, normal, wear, either, x, ...",1
4,872,"I have more of an athletic figure, and the sle...","[athlet, figur, sleev, tend, come, closer, bod...",0


## Store processed data

In [13]:
# Processed splits and the file name each one is written to (paired by position).
dfs = [train_prepared, test_prepared, val_prepared]
fnames = ['train_processed.csv', 'test_processed.csv', 'val_processed.csv']

# Create the output folder if needed. parents=True also creates a missing
# '../data' folder (os.mkdir would raise), and exist_ok=True removes the
# check-then-create race of testing p.exists() first.
p = Path(PROCESSED_DIR)
p.mkdir(parents=True, exist_ok=True)
for df, fname in zip(dfs, fnames):
    save_data(df=df, path=PROCESSED_DIR, filename=fname)

# Comments