# Text Analysis for Women's E-Commerce Clothing Reviews

## Libraries and Settings

In [1]:
import os
import numpy as np
import pandas as pd
from pathlib import Path

# Feature engineering
import string
import nltk
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer

%config Completer.use_jedi = False

## Variables

In [2]:
# Input directory: the pre-processed train/test/val CSVs are read from here.
PROCESSED_DIR = os.path.join(os.pardir, 'data', 'processed')
# Output directory: the feature-engineered tables are written here.
FEATURED_DIR = os.path.join(os.pardir, 'data', 'featured')

# Presumably a random seed; it is not referenced by the cells shown here.
RANDOM_STATE = 8

# Label column; it is moved to the last position of every ABT.
TARGET = 'recommended_ind'

# Raw / intermediate text columns removed from the ABT once the numeric
# features have been derived from them.
# NOTE(review): the loaded frames also carry an 'Unnamed: 0' column (visible
# in the `train.head()` output) that is not listed here, so it ends up in the
# ABT as a feature -- confirm whether that is intended.
VARIABLES_DROP = [
    'review_text',
    'review_lower',
    'review_nopct',
    'review_nodg',
    'review_word_tokens',
    'review_no_sw',
    'review_stem',
    'review_lem',
]

## Helpers

In [3]:
def load_data(path, filename):
    """Read the CSV file ``filename`` located in directory ``path``.

    Args:
        path: Directory containing the file.
        filename: Name of the CSV file inside ``path``.

    Returns:
        A DataFrame produced by ``pandas.read_csv`` with default options.
    """
    return pd.read_csv(os.path.join(path, filename))

def get_count_words(s):
    """Return the number of single-space-delimited tokens in ``s``.

    ``s`` is converted with ``str()`` first. The split is on a literal space,
    so consecutive spaces produce empty tokens that are counted as well, and
    an empty string counts as one token (the result is always >= 1).
    """
    text = str(s)
    # N separators always delimit N + 1 tokens.
    return text.count(" ") + 1

def get_count_char(s):
    """Return the number of characters in ``s`` excluding space characters.

    ``s`` is converted with ``str()`` first. Only the literal space ``" "``
    is excluded; tabs, newlines and punctuation are counted.
    """
    text = str(s)
    return len(text) - text.count(" ")

def get_count_sents(s):
    """Return the number of period-delimited segments in ``s``.

    ``s`` is converted with ``str()`` first. This is a rough sentence count:
    only ``.`` acts as a delimiter, and empty segments are counted, so text
    ending in a period yields one more than its number of sentences and the
    result is always >= 1.
    """
    text = str(s)
    # N periods always delimit N + 1 segments.
    return text.count(".") + 1

def get_count_exc_marks(s):
    """Return the number of ``!`` characters in ``s``.

    ``s`` is converted with ``str()`` first, matching the word/character/
    sentence counters, so a missing value (e.g. a float NaN cell) yields 0
    instead of raising ``AttributeError``.
    """
    return str(s).count('!')

def get_count_question_marks(s):
    """Return the number of ``?`` characters in ``s``.

    ``s`` is converted with ``str()`` first, matching the word/character/
    sentence counters, so a missing value (e.g. a float NaN cell) yields 0
    instead of raising ``AttributeError``.
    """
    return str(s).count('?')
    
def get_count_pct(s):
    """Return the number of punctuation characters in ``s``.

    The counted set is ``string.punctuation`` without ``!`` and ``?``; those
    two have dedicated counters. ``s`` is converted with ``str()`` first,
    matching the word/character/sentence counters, so non-string input
    (e.g. a float NaN cell) no longer raises ``TypeError``.
    """
    punctuation = '"#$%&\'()*+,-./:;<=>@[\\]^_`{|}~'
    # Sum over a generator instead of materialising a list just to measure it.
    return sum(1 for ch in str(s) if ch in punctuation)

def get_count_cap(s):
    """Return the number of uppercase characters in ``s``.

    ``s`` is converted with ``str()`` first, matching the word/character/
    sentence counters, so non-string input (e.g. a float NaN cell, whose
    string form ``'nan'`` has no capitals) no longer raises ``TypeError``.
    """
    return sum(1 for ch in str(s) if ch.isupper())

def get_polarity(s):
    """Return the TextBlob sentiment polarity of the text ``s``."""
    return TextBlob(s).sentiment.polarity

def get_subjectivity(s):
    """Return the TextBlob sentiment subjectivity of the text ``s``."""
    return TextBlob(s).sentiment.subjectivity

def get_text_features(df):
    """Return a copy of ``df`` extended with surface-level text statistics.

    All statistics are derived from the ``review_text`` column. Added columns:
    ``word_count``, ``char_count``, ``sentence_count``, ``capitals_count``,
    ``punc_count``, ``exc_marks_count``, ``question_marks_count``,
    ``avg_word_len`` (characters per word), ``avg_sentence_len`` (words per
    sentence) and ``avg_cap_len`` (capital letters per word).

    Args:
        df: DataFrame containing a ``review_text`` column.

    Returns:
        A new DataFrame; ``df`` itself is not modified.
    """
    df_copy = df.copy()
    reviews = df_copy['review_text']

    # Raw counts, one helper per statistic.
    df_copy['word_count'] = reviews.apply(get_count_words)
    df_copy['char_count'] = reviews.apply(get_count_char)
    df_copy['sentence_count'] = reviews.apply(get_count_sents)
    df_copy['capitals_count'] = reviews.apply(get_count_cap)
    df_copy['punc_count'] = reviews.apply(get_count_pct)
    df_copy['exc_marks_count'] = reviews.apply(get_count_exc_marks)
    df_copy['question_marks_count'] = reviews.apply(get_count_question_marks)

    # Ratios. The denominators come from get_count_words / get_count_sents,
    # which never return less than 1, so no division by zero can occur.
    df_copy['avg_word_len'] = df_copy['char_count'] / df_copy['word_count']
    df_copy['avg_sentence_len'] = df_copy['word_count'] / df_copy['sentence_count']
    # Vectorized column division instead of a Python-level row-wise apply:
    # same float result, consistent with the two ratios above.
    df_copy['avg_cap_len'] = df_copy['capitals_count'] / df_copy['word_count']

    return df_copy

def get_nlp_features(df):
    """Return a copy of ``df`` with TextBlob sentiment columns added.

    Adds ``polarity`` and ``subjectivity``, both computed from the
    ``review_text`` column.

    Args:
        df: DataFrame containing a ``review_text`` column of strings.

    Returns:
        A new DataFrame; ``df`` itself is not modified.
    """
    df_copy = df.copy()

    # Build one TextBlob per review and read both sentiment fields from it,
    # rather than parsing and scoring every review twice.
    sentiments = [TextBlob(text).sentiment for text in df_copy['review_text']]
    df_copy['polarity'] = [sentiment.polarity for sentiment in sentiments]
    df_copy['subjectivity'] = [sentiment.subjectivity for sentiment in sentiments]

    return df_copy

def get_abt_df(df, tfidf, features, target, drop_cols):
    """Combine engineered features and TF-IDF weights into one table.

    Args:
        df: DataFrame with the engineered features and the ``target`` column.
        tfidf: Matrix-like object exposing ``toarray()`` with one row per row
            of ``df``, in the same order.
        features: Column names for the TF-IDF matrix, one per matrix column.
        target: Name of the label column; it is placed last in the result.
        drop_cols: Columns of ``df`` to leave out of the result.

    Returns:
        A new DataFrame with the remaining ``df`` columns, then the TF-IDF
        columns, then ``target``. Rows keep the index of ``df``.

    Raises:
        ValueError: If ``tfidf`` and ``df`` have a different number of rows.
    """
    # NOTE: this densifies the matrix, so memory grows with
    # rows x vocabulary size.
    tfidf_plain = tfidf.toarray()
    if tfidf_plain.shape[0] != len(df):
        raise ValueError(
            "tfidf has {} rows but df has {}".format(tfidf_plain.shape[0], len(df))
        )

    base = df.drop(columns=drop_cols)  # returns a new frame; df is untouched
    tfidf_df = pd.DataFrame(tfidf_plain, columns=features)

    # Matrix rows correspond to df rows by position, not by index label.
    # Joining on a reset index keeps them aligned even when df carries a
    # non-default index (e.g. after filtering or shuffling).
    abt_df = pd.merge(
        base.reset_index(drop=True), tfidf_df, left_index=True, right_index=True
    )
    abt_df.index = base.index

    # Move the target to the last position.
    cols = [col for col in abt_df.columns if col != target] + [target]
    return abt_df[cols]

def save_data(df, path, filename):
    """Write ``df`` as a CSV file named ``filename`` in directory ``path``.

    The DataFrame index is not written. The directory must already exist.
    """
    df.to_csv(os.path.join(path, filename), index=False)

## Load Data

In [4]:
# Load the three pre-processed splits; each lives in PROCESSED_DIR as
# '<split>_processed.csv'.
train, test, val = (
    load_data(PROCESSED_DIR, split + '_processed.csv')
    for split in ('train', 'test', 'val')
)

In [5]:
# Preview the first rows of the training split to check the loaded columns.
train.head()

Unnamed: 0,clothing_id,review_text,recommended_ind,review_lower,review_nopct,review_nodg,review_word_tokens,review_no_sw,review_stem,review_lem
0,867,I have been admiring this piece for awhile and...,1,i have been admiring this piece for awhile and...,i have been admiring this piece for awhile and...,i have been admiring this piece for awhile and...,"['i', 'have', 'been', 'admiring', 'this', 'pie...","['admiring', 'piece', 'awhile', 'finally', 'de...","['admir', 'piec', 'awhil', 'final', 'decid', '...","['admir', 'piec', 'awhil', 'final', 'decid', '..."
1,1081,This dress looks great on me. it gives a slend...,1,this dress looks great on me. it gives a slend...,this dress looks great on me it gives a slende...,this dress looks great on me it gives a slende...,"['this', 'dress', 'looks', 'great', 'on', 'me'...","['dress', 'looks', 'great', 'gives', 'slender'...","['dress', 'look', 'great', 'give', 'slender', ...","['dress', 'look', 'great', 'give', 'slender', ..."
2,862,I love this! i agree with previous post that s...,1,i love this! i agree with previous post that s...,i love this i agree with previous post that sa...,i love this i agree with previous post that sa...,"['i', 'love', 'this', 'i', 'agree', 'with', 'p...","['love', 'agree', 'previous', 'post', 'say', '...","['love', 'agre', 'previou', 'post', 'say', 'mu...","['love', 'agre', 'previou', 'post', 'say', 'mu..."
3,1081,Not sure why this dress was once backordered? ...,0,not sure why this dress was once backordered? ...,not sure why this dress was once backordered i...,not sure why this dress was once backordered i...,"['not', 'sure', 'why', 'this', 'dress', 'was',...","['sure', 'dress', 'backordered', 'big', 'chest...","['sure', 'dress', 'backord', 'big', 'chest', '...","['sure', 'dress', 'backord', 'big', 'chest', '..."
4,1020,"Unlike the other reviewers, i did not have any...",1,"unlike the other reviewers, i did not have any...",unlike the other reviewers i did not have any ...,unlike the other reviewers i did not have any ...,"['unlike', 'the', 'other', 'reviewers', 'i', '...","['unlike', 'reviewers', 'problem', 'sizing', '...","['unlik', 'review', 'problem', 'size', 'fit', ...","['unlik', 'review', 'problem', 'size', 'fit', ..."


## Feature Engineering

### Text Features

In [6]:
# Add the count/ratio text statistics to the training split.
train_text_feats = get_text_features(train)

In [7]:
# Preview the training split with the new text-statistic columns appended.
train_text_feats.head()

Unnamed: 0,clothing_id,review_text,recommended_ind,review_lower,review_nopct,review_nodg,review_word_tokens,review_no_sw,review_stem,review_lem,word_count,char_count,sentence_count,capitals_count,punc_count,exc_marks_count,question_marks_count,avg_word_len,avg_sentence_len,avg_cap_len
0,867,I have been admiring this piece for awhile and...,1,i have been admiring this piece for awhile and...,i have been admiring this piece for awhile and...,i have been admiring this piece for awhile and...,"['i', 'have', 'been', 'admiring', 'this', 'pie...","['admiring', 'piece', 'awhile', 'finally', 'de...","['admir', 'piec', 'awhil', 'final', 'decid', '...","['admir', 'piec', 'awhil', 'final', 'decid', '...",87,371,5,1,13,2,0,4.264368,17.4,0.011494
1,1081,This dress looks great on me. it gives a slend...,1,this dress looks great on me. it gives a slend...,this dress looks great on me it gives a slende...,this dress looks great on me it gives a slende...,"['this', 'dress', 'looks', 'great', 'on', 'me'...","['dress', 'looks', 'great', 'gives', 'slender'...","['dress', 'look', 'great', 'give', 'slender', ...","['dress', 'look', 'great', 'give', 'slender', ...",22,95,3,1,3,0,0,4.318182,7.333333,0.045455
2,862,I love this! i agree with previous post that s...,1,i love this! i agree with previous post that s...,i love this i agree with previous post that sa...,i love this i agree with previous post that sa...,"['i', 'love', 'this', 'i', 'agree', 'with', 'p...","['love', 'agree', 'previous', 'post', 'say', '...","['love', 'agre', 'previou', 'post', 'say', 'mu...","['love', 'agre', 'previou', 'post', 'say', 'mu...",75,284,7,1,8,1,0,3.786667,10.714286,0.013333
3,1081,Not sure why this dress was once backordered? ...,0,not sure why this dress was once backordered? ...,not sure why this dress was once backordered i...,not sure why this dress was once backordered i...,"['not', 'sure', 'why', 'this', 'dress', 'was',...","['sure', 'dress', 'backordered', 'big', 'chest...","['sure', 'dress', 'backord', 'big', 'chest', '...","['sure', 'dress', 'backord', 'big', 'chest', '...",39,182,5,1,4,0,1,4.666667,7.8,0.025641
4,1020,"Unlike the other reviewers, i did not have any...",1,"unlike the other reviewers, i did not have any...",unlike the other reviewers i did not have any ...,unlike the other reviewers i did not have any ...,"['unlike', 'the', 'other', 'reviewers', 'i', '...","['unlike', 'reviewers', 'problem', 'sizing', '...","['unlik', 'review', 'problem', 'size', 'fit', ...","['unlik', 'review', 'problem', 'size', 'fit', ...",66,256,5,1,10,0,2,3.878788,13.2,0.015152


In [8]:
# Apply the same text-statistic extraction to the test and validation splits.
test_text_feats = get_text_features(test)
val_text_feats = get_text_features(val)

### More NLP-based features

**TODO: Add part-of-speech (POS) tagging features**

In [9]:
# Add sentiment polarity and subjectivity on top of the text statistics.
train_nlp_feats = get_nlp_features(train_text_feats)

In [10]:
# Preview the training split with the sentiment columns appended.
train_nlp_feats.head()

Unnamed: 0,clothing_id,review_text,recommended_ind,review_lower,review_nopct,review_nodg,review_word_tokens,review_no_sw,review_stem,review_lem,...,sentence_count,capitals_count,punc_count,exc_marks_count,question_marks_count,avg_word_len,avg_sentence_len,avg_cap_len,polarity,subjectivity
0,867,I have been admiring this piece for awhile and...,1,i have been admiring this piece for awhile and...,i have been admiring this piece for awhile and...,i have been admiring this piece for awhile and...,"['i', 'have', 'been', 'admiring', 'this', 'pie...","['admiring', 'piece', 'awhile', 'finally', 'de...","['admir', 'piec', 'awhil', 'final', 'decid', '...","['admir', 'piec', 'awhil', 'final', 'decid', '...",...,5,1,13,2,0,4.264368,17.4,0.011494,0.463272,0.659877
1,1081,This dress looks great on me. it gives a slend...,1,this dress looks great on me. it gives a slend...,this dress looks great on me it gives a slende...,this dress looks great on me it gives a slende...,"['this', 'dress', 'looks', 'great', 'on', 'me'...","['dress', 'looks', 'great', 'gives', 'slender'...","['dress', 'look', 'great', 'give', 'slender', ...","['dress', 'look', 'great', 'give', 'slender', ...",...,3,1,3,0,0,4.318182,7.333333,0.045455,0.544444,0.794444
2,862,I love this! i agree with previous post that s...,1,i love this! i agree with previous post that s...,i love this i agree with previous post that sa...,i love this i agree with previous post that sa...,"['i', 'love', 'this', 'i', 'agree', 'with', 'p...","['love', 'agree', 'previous', 'post', 'say', '...","['love', 'agre', 'previou', 'post', 'say', 'mu...","['love', 'agre', 'previou', 'post', 'say', 'mu...",...,7,1,8,1,0,3.786667,10.714286,0.013333,0.299444,0.581806
3,1081,Not sure why this dress was once backordered? ...,0,not sure why this dress was once backordered? ...,not sure why this dress was once backordered i...,not sure why this dress was once backordered i...,"['not', 'sure', 'why', 'this', 'dress', 'was',...","['sure', 'dress', 'backordered', 'big', 'chest...","['sure', 'dress', 'backord', 'big', 'chest', '...","['sure', 'dress', 'backord', 'big', 'chest', '...",...,5,1,4,0,1,4.666667,7.8,0.025641,-0.057937,0.411111
4,1020,"Unlike the other reviewers, i did not have any...",1,"unlike the other reviewers, i did not have any...",unlike the other reviewers i did not have any ...,unlike the other reviewers i did not have any ...,"['unlike', 'the', 'other', 'reviewers', 'i', '...","['unlike', 'reviewers', 'problem', 'sizing', '...","['unlik', 'review', 'problem', 'size', 'fit', ...","['unlik', 'review', 'problem', 'size', 'fit', ...",...,5,1,10,0,2,3.878788,13.2,0.015152,0.420833,0.591667


In [11]:
# Apply the same sentiment extraction to the test and validation splits.
test_nlp_feats = get_nlp_features(test_text_feats)
val_nlp_feats = get_nlp_features(val_text_feats)

### TF-IDF Feature


In [12]:
# Learn the vocabulary and IDF weights from the training split only, and
# vectorize it in the same pass. fit_transform is equivalent to fit followed
# by transform but avoids tokenizing the training corpus twice.
# NOTE(review): 'review_lem' appears to hold list-like strings (see the
# `train.head()` output); the vectorizer tokenizes them as plain text.
tf_idf_vectorizer = TfidfVectorizer()
tf_idf_train_matrix = tf_idf_vectorizer.fit_transform(train_nlp_feats['review_lem'])

In [13]:
# Show the shape and sparsity of the training TF-IDF matrix.
tf_idf_train_matrix

<22248x11173 sparse matrix of type '<class 'numpy.float64'>'
	with 578756 stored elements in Compressed Sparse Row format>

In [14]:
# Transform only (no re-fitting): test and validation reuse the vocabulary
# and IDF weights of the already-fitted vectorizer.
tf_idf_test_matrix = tf_idf_vectorizer.transform(test_nlp_feats['review_lem'])
tf_idf_val_matrix = tf_idf_vectorizer.transform(val_nlp_feats['review_lem'])

### TODO: Add Topic Modelling

### TODO: Word Embeddings

## Analytical Base Tables (ABTs)

In [15]:
# Vocabulary terms, in TF-IDF matrix column order. `get_feature_names` was
# deprecated in scikit-learn 1.0 and later removed, so prefer
# `get_feature_names_out` and fall back only on older releases.
if hasattr(tf_idf_vectorizer, 'get_feature_names_out'):
    FEATURES = list(tf_idf_vectorizer.get_feature_names_out())
else:
    FEATURES = tf_idf_vectorizer.get_feature_names()

# Build one analytical base table per split from the same feature list.
abt_train = get_abt_df(train_nlp_feats, tf_idf_train_matrix, FEATURES, TARGET, VARIABLES_DROP)
abt_test = get_abt_df(test_nlp_feats, tf_idf_test_matrix, FEATURES, TARGET, VARIABLES_DROP)
abt_val = get_abt_df(val_nlp_feats, tf_idf_val_matrix, FEATURES, TARGET, VARIABLES_DROP)

In [16]:
# Preview the training ABT: engineered features, TF-IDF terms, target last.
abt_train.head()

Unnamed: 0,clothing_id,word_count,char_count,sentence_count,capitals_count,punc_count,exc_marks_count,question_marks_count,avg_word_len,avg_sentence_len,...,ziphoodi,ziploc,zipper,zipperi,zombi,zone,zooland,zoom,zuma,recommended_ind
0,867,87,371,5,1,13,2,0,4.264368,17.4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,1081,22,95,3,1,3,0,0,4.318182,7.333333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,862,75,284,7,1,8,1,0,3.786667,10.714286,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,1081,39,182,5,1,4,0,1,4.666667,7.8,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,1020,66,256,5,1,10,0,2,3.878788,13.2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [17]:
# Preview the test ABT.
abt_test.head()

Unnamed: 0,clothing_id,word_count,char_count,sentence_count,capitals_count,punc_count,exc_marks_count,question_marks_count,avg_word_len,avg_sentence_len,...,ziphoodi,ziploc,zipper,zipperi,zombi,zone,zooland,zoom,zuma,recommended_ind
0,927,38,152,9,1,9,0,0,4.0,4.222222,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,820,93,408,8,1,15,1,0,4.387097,11.625,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,936,96,407,6,1,15,0,0,4.239583,16.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,862,27,153,1,1,2,0,0,5.666667,27.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,1061,31,116,5,1,5,0,0,3.741935,6.2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [18]:
# Preview the validation ABT.
abt_val.head()

Unnamed: 0,clothing_id,word_count,char_count,sentence_count,capitals_count,punc_count,exc_marks_count,question_marks_count,avg_word_len,avg_sentence_len,...,ziphoodi,ziploc,zipper,zipperi,zombi,zone,zooland,zoom,zuma,recommended_ind
0,1066,54,197,6,1,16,0,0,3.648148,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,1055,63,260,6,1,10,1,0,4.126984,10.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,1066,60,272,4,1,8,1,0,4.533333,15.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,863,67,269,5,1,12,1,0,4.014925,13.4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,872,48,197,4,1,8,1,0,4.104167,12.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


## Store Featured data

In [19]:
abts = [abt_train, abt_test, abt_val]
fnames = ['train_abt.csv', 'test_abt.csv', 'val_abt.csv']

# Create the output directory, including missing parents. exist_ok avoids
# the check-then-create race of a separate exists() test.
p = Path(FEATURED_DIR)
p.mkdir(parents=True, exist_ok=True)

# Write each ABT to its own CSV file in FEATURED_DIR.
for df, fname in zip(abts, fnames):
    save_data(df=df, path=FEATURED_DIR, filename=fname)

## Comments