# Text Analysis for Women's E-Commerce Clothing Reviews

## Libraries and Settings

In [1]:
import os
import numpy as np
import pandas as pd
from pathlib import Path

# Feature engineering
import string
import nltk
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler

%config Completer.use_jedi = False

## Variables

In [38]:
# Data locations, both expressed relative to the parent of the working directory.
PROCESSED_DIR, FEATURED_DIR = (
    os.path.join(os.pardir, 'data', subdir) for subdir in ('processed', 'featured')
)

# Column names used throughout the notebook.
TARGET = 'recommended_ind'
TEXT_VARIABLES = ['review_text', 'review_text_processed']

# Seed value kept alongside the other notebook-wide settings.
RANDOM_STATE = 8

## Helpers

In [68]:
def load_data(path, filename):
    """Read a CSV file from a directory into a DataFrame.

    Args:
        path: Directory containing the file.
        filename: Name of the CSV file inside ``path``.

    Returns:
        The DataFrame produced by ``pd.read_csv`` with its default options.
    """
    return pd.read_csv(os.path.join(path, filename))

def get_count_words(s):
    """Count the space-separated tokens in ``s``.

    The value is converted with ``str`` first. Tokens are delimited by single
    space characters only, so consecutive spaces produce empty tokens that are
    counted too, and an empty string counts as one token.
    """
    text = str(s)
    # Splitting on " " always yields one more piece than there are spaces.
    return text.count(" ") + 1

def get_count_char(s):
    """Count the characters of ``s`` that are not space characters.

    The value is converted with ``str`` first. Only the plain space ``" "`` is
    excluded; other whitespace (tabs, newlines) is counted.
    """
    text = str(s)
    # Total length minus the separators equals the summed token lengths.
    return len(text) - text.count(" ")

def get_count_sents(s):
    """Count the period-delimited segments of ``s``.

    The value is converted with ``str`` first. Every ``"."`` starts a new
    segment, including empty ones, so text ending in a period reports one more
    segment than it has sentences and text without a period reports 1.
    """
    text = str(s)
    return text.count(".") + 1

def get_count_exc_marks(s):
    """Count the exclamation marks in ``s``.

    The value is converted with ``str`` first, like the word/char/sentence
    helpers, so non-string cells (for example ``NaN`` floats from missing
    reviews) are counted on their string form instead of raising.

    Args:
        s: Text to inspect; any object accepted by ``str``.

    Returns:
        Number of ``'!'`` characters.
    """
    return str(s).count('!')

def get_count_question_marks(s):
    """Count the question marks in ``s``.

    The value is converted with ``str`` first, like the word/char/sentence
    helpers, so non-string cells (for example ``NaN`` floats from missing
    reviews) are counted on their string form instead of raising.

    Args:
        s: Text to inspect; any object accepted by ``str``.

    Returns:
        Number of ``'?'`` characters.
    """
    return str(s).count('?')
    
def get_count_pct(s):
    """Count punctuation characters in ``s``, excluding ``!`` and ``?``.

    Exclamation and question marks are left out of the punctuation set here;
    they are counted by their own helpers. The value is converted with ``str``
    first, like the word/char/sentence helpers, so non-string cells (for
    example ``NaN`` floats) do not raise.

    Args:
        s: Text to inspect; any object accepted by ``str``.

    Returns:
        Number of characters that belong to the punctuation set.
    """
    punctuation = '"#$%&\'()*+,-./:;<=>@[\\]^_`{|}~'
    return sum(1 for ch in str(s) if ch in punctuation)

def get_count_cap(s):
    """Count the uppercase characters in ``s``.

    This counts individual characters for which ``str.isupper`` is true, not
    capitalised words. The value is converted with ``str`` first, like the
    word/char/sentence helpers, so non-string cells (for example ``NaN``
    floats) are counted on their string form instead of raising.

    Args:
        s: Text to inspect; any object accepted by ``str``.

    Returns:
        Number of uppercase characters.
    """
    return sum(1 for ch in str(s) if ch.isupper())

def get_polarity(s):
    """Return the ``polarity`` component of TextBlob's sentiment for ``s``."""
    return TextBlob(s).sentiment.polarity

def get_subjectivity(s):
    """Return the ``subjectivity`` component of TextBlob's sentiment for ``s``."""
    return TextBlob(s).sentiment.subjectivity

def get_text_features(df, text_var):
    """Add surface-level text statistics computed from one text column.

    The input frame is not modified; a copy with ten extra columns is
    returned: ``word_count``, ``char_count``, ``sentence_count``,
    ``capitals_count``, ``punc_count``, ``exc_marks_count``,
    ``question_marks_count``, ``avg_word_len``, ``avg_sentence_len`` and
    ``avg_cap_len``.

    Args:
        df: Source DataFrame.
        text_var: Name of the column in ``df`` holding the text to measure.

    Returns:
        A copy of ``df`` with the feature columns appended in the order above.
    """
    df_copy = df.copy()
    text = df_copy[text_var]

    # Raw counts, one helper per statistic.
    df_copy['word_count'] = text.apply(get_count_words)
    df_copy['char_count'] = text.apply(get_count_char)
    df_copy['sentence_count'] = text.apply(get_count_sents)
    df_copy['capitals_count'] = text.apply(get_count_cap)
    df_copy['punc_count'] = text.apply(get_count_pct)
    df_copy['exc_marks_count'] = text.apply(get_count_exc_marks)
    df_copy['question_marks_count'] = text.apply(get_count_question_marks)

    # Ratios derived from the counts. All three use column-wise division so
    # they behave the same way (including on an empty frame) and avoid a
    # Python-level call per row.
    df_copy['avg_word_len'] = df_copy['char_count'] / df_copy['word_count']
    df_copy['avg_sentence_len'] = df_copy['word_count'] / df_copy['sentence_count']
    df_copy['avg_cap_len'] = df_copy['capitals_count'] / df_copy['word_count']

    return df_copy

def get_nlp_features(df, text_var):
    """Add TextBlob sentiment features computed from one text column.

    The input frame is not modified; a copy with two extra columns,
    ``polarity`` and ``subjectivity``, is returned.

    Args:
        df: Source DataFrame.
        text_var: Name of the column in ``df`` holding the text to analyse.

    Returns:
        A copy of ``df`` with ``polarity`` and ``subjectivity`` appended.
    """
    df_copy = df.copy()

    # Analyse each text once and read both sentiment components from the same
    # result, instead of building a separate TextBlob per feature.
    sentiments = [TextBlob(text).sentiment for text in df_copy[text_var]]

    df_copy['polarity'] = pd.Series(
        [sentiment.polarity for sentiment in sentiments],
        index=df_copy.index,
        dtype=float,
    )
    df_copy['subjectivity'] = pd.Series(
        [sentiment.subjectivity for sentiment in sentiments],
        index=df_copy.index,
        dtype=float,
    )
    return df_copy

def get_tfidf_df(df, text_cols, tfidf_matrix, cols):
    """Replace the text columns of ``df`` with dense TF-IDF feature columns.

    Rows are paired by position: row ``i`` of ``tfidf_matrix`` is attached to
    row ``i`` of ``df`` whatever labels ``df``'s index carries, and the
    returned frame keeps ``df``'s index. Column names that occur on both sides
    receive pandas' default merge suffixes.

    Note that ``tfidf_matrix.toarray()`` materialises the full dense matrix in
    memory.

    Args:
        df: Source DataFrame; it is not modified.
        text_cols: Column label(s) to drop from ``df``.
        tfidf_matrix: Matrix exposing ``toarray()``, one row per row of ``df``.
        cols: Column names for the TF-IDF features, one per matrix column.

    Returns:
        DataFrame with the non-text columns of ``df`` followed by the TF-IDF
        columns.

    Raises:
        ValueError: If ``tfidf_matrix`` and ``df`` have different row counts.
    """
    features = df.drop(text_cols, axis=1)
    tfidf_plain = tfidf_matrix.toarray()
    if tfidf_plain.shape[0] != len(features):
        raise ValueError(
            f"tfidf_matrix has {tfidf_plain.shape[0]} rows but df has {len(features)}"
        )

    tfidf = pd.DataFrame(tfidf_plain, columns=cols)
    # Merge on a positional index so a filtered/shuffled ``df`` index cannot
    # misalign the rows, then restore the caller's index labels.
    tfidf_df = pd.merge(
        features.reset_index(drop=True),
        tfidf,
        how="left",
        left_index=True,
        right_index=True,
    )
    tfidf_df.index = features.index
    return tfidf_df

def save_data(x_df, y_df, path, filename):
    """Join features and target on their index and write them to a CSV file.

    Args:
        x_df: Feature DataFrame (left side of the index join).
        y_df: Target data joined onto ``x_df`` by index.
        path: Directory to write into.
        filename: Name of the CSV file to create inside ``path``.
    """
    combined = x_df.merge(y_df, how="left", left_index=True, right_index=True)
    # The index is not written to the file.
    combined.to_csv(os.path.join(path, filename), index=False)

## Load Data

In [48]:
# Read the three pre-processed splits from PROCESSED_DIR, in train/test/val order.
train, test, val = (
    load_data(PROCESSED_DIR, name)
    for name in ('train_processed.csv', 'test_processed.csv', 'val_processed.csv')
)

In [49]:
# Preview the first rows of the loaded training split.
train.head()

Unnamed: 0,clothing_id,review_text,review_text_processed,recommended_ind
0,867,I have been admiring this piece for awhile and...,"['admir', 'piec', 'awhil', 'final', 'decid', '...",1
1,1081,This dress looks great on me. it gives a slend...,"['dress', 'look', 'great', 'give', 'slender', ...",1
2,862,I love this! i agree with previous post that s...,"['love', 'agre', 'previou', 'post', 'say', 'mu...",1
3,1081,Not sure why this dress was once backordered? ...,"['sure', 'dress', 'backord', 'big', 'chest', '...",0
4,1020,"Unlike the other reviewers, i did not have any...","['unlik', 'review', 'problem', 'size', 'fit', ...",1


## Feature Engineering

In [50]:
# Split every dataset into features (all columns except TARGET) and the TARGET
# column itself. Index.difference decides the order of the feature columns.
X_train, y_train = train.loc[:, train.columns.difference([TARGET])], train[TARGET]
X_test, y_test = test.loc[:, test.columns.difference([TARGET])], test[TARGET]
X_val, y_val = val.loc[:, val.columns.difference([TARGET])], val[TARGET]

### Text Features

In [51]:
# Count-based features are derived from the raw review text (TEXT_VARIABLES[0]).
X_train_text_feats, X_test_text_feats, X_val_text_feats = (
    get_text_features(split, TEXT_VARIABLES[0])
    for split in (X_train, X_test, X_val)
)

In [52]:
# Preview the training split with the count-based text features appended.
X_train_text_feats.head()

Unnamed: 0,clothing_id,review_text,review_text_processed,word_count,char_count,sentence_count,capitals_count,punc_count,exc_marks_count,question_marks_count,avg_word_len,avg_sentence_len,avg_cap_len
0,867,I have been admiring this piece for awhile and...,"['admir', 'piec', 'awhil', 'final', 'decid', '...",87,371,5,1,13,2,0,4.264368,17.4,0.011494
1,1081,This dress looks great on me. it gives a slend...,"['dress', 'look', 'great', 'give', 'slender', ...",22,95,3,1,3,0,0,4.318182,7.333333,0.045455
2,862,I love this! i agree with previous post that s...,"['love', 'agre', 'previou', 'post', 'say', 'mu...",75,284,7,1,8,1,0,3.786667,10.714286,0.013333
3,1081,Not sure why this dress was once backordered? ...,"['sure', 'dress', 'backord', 'big', 'chest', '...",39,182,5,1,4,0,1,4.666667,7.8,0.025641
4,1020,"Unlike the other reviewers, i did not have any...","['unlik', 'review', 'problem', 'size', 'fit', ...",66,256,5,1,10,0,2,3.878788,13.2,0.015152


### More NLP-based features

**TODO: Add Part-of-Speech (POS) tag features**

In [53]:
# Sentiment features are also computed on the raw review text (TEXT_VARIABLES[0]),
# on top of the frames that already carry the count-based features.
X_train_nlp_feats, X_test_nlp_feats, X_val_nlp_feats = (
    get_nlp_features(split, TEXT_VARIABLES[0])
    for split in (X_train_text_feats, X_test_text_feats, X_val_text_feats)
)

In [54]:
# Preview the training split with polarity and subjectivity appended.
X_train_nlp_feats.head()

Unnamed: 0,clothing_id,review_text,review_text_processed,word_count,char_count,sentence_count,capitals_count,punc_count,exc_marks_count,question_marks_count,avg_word_len,avg_sentence_len,avg_cap_len,polarity,subjectivity
0,867,I have been admiring this piece for awhile and...,"['admir', 'piec', 'awhil', 'final', 'decid', '...",87,371,5,1,13,2,0,4.264368,17.4,0.011494,0.463272,0.659877
1,1081,This dress looks great on me. it gives a slend...,"['dress', 'look', 'great', 'give', 'slender', ...",22,95,3,1,3,0,0,4.318182,7.333333,0.045455,0.544444,0.794444
2,862,I love this! i agree with previous post that s...,"['love', 'agre', 'previou', 'post', 'say', 'mu...",75,284,7,1,8,1,0,3.786667,10.714286,0.013333,0.299444,0.581806
3,1081,Not sure why this dress was once backordered? ...,"['sure', 'dress', 'backord', 'big', 'chest', '...",39,182,5,1,4,0,1,4.666667,7.8,0.025641,-0.057937,0.411111
4,1020,"Unlike the other reviewers, i did not have any...","['unlik', 'review', 'problem', 'size', 'fit', ...",66,256,5,1,10,0,2,3.878788,13.2,0.015152,0.420833,0.591667


### TF-IDF Feature


In [55]:
# Fit the vectorizer on the training split's processed text column
# (TEXT_VARIABLES[1]) only; fit() returns the fitted vectorizer.
tf_idf_vectorizer = TfidfVectorizer().fit(X_train_nlp_feats[TEXT_VARIABLES[1]])

In [56]:
# Project every split onto the vocabulary fitted on the training split.
tf_idf_train_matrix, tf_idf_test_matrix, tf_idf_val_matrix = (
    tf_idf_vectorizer.transform(split[TEXT_VARIABLES[1]])
    for split in (X_train_nlp_feats, X_test_nlp_feats, X_val_nlp_feats)
)

In [57]:
# Show the repr of the training TF-IDF matrix (shape, dtype, stored elements).
tf_idf_train_matrix

<22248x11173 sparse matrix of type '<class 'numpy.float64'>'
	with 578756 stored elements in Compressed Sparse Row format>

### TODO: Add Topic Modelling

### TODO: Word Embeddings

In [58]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(); fall back to the old name on installs
# that predate the new method.
try:
    tfidf_feature_names = tf_idf_vectorizer.get_feature_names_out()
except AttributeError:
    tfidf_feature_names = tf_idf_vectorizer.get_feature_names()

tfidf_train_df = get_tfidf_df(X_train_nlp_feats, TEXT_VARIABLES, tf_idf_train_matrix, tfidf_feature_names)

In [59]:
# Preview the training frame after the text columns were replaced by TF-IDF columns.
tfidf_train_df.head()

Unnamed: 0,clothing_id,word_count,char_count,sentence_count,capitals_count,punc_count,exc_marks_count,question_marks_count,avg_word_len,avg_sentence_len,...,zipbutton,ziphoodi,ziploc,zipper,zipperi,zombi,zone,zooland,zoom,zuma
0,867,87,371,5,1,13,2,0,4.264368,17.4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1081,22,95,3,1,3,0,0,4.318182,7.333333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,862,75,284,7,1,8,1,0,3.786667,10.714286,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1081,39,182,5,1,4,0,1,4.666667,7.8,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1020,66,256,5,1,10,0,2,3.878788,13.2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [60]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(); fall back to the old name on installs
# that predate the new method. The names are fetched once and shared by both
# splits.
try:
    tfidf_feature_names = tf_idf_vectorizer.get_feature_names_out()
except AttributeError:
    tfidf_feature_names = tf_idf_vectorizer.get_feature_names()

tfidf_test_df = get_tfidf_df(X_test_nlp_feats, TEXT_VARIABLES, tf_idf_test_matrix, tfidf_feature_names)
tfidf_val_df = get_tfidf_df(X_val_nlp_feats, TEXT_VARIABLES, tf_idf_val_matrix, tfidf_feature_names)

### Scale variables

In [63]:
# Fit the scaler on the training frame only and scale that frame in the same
# step; the array result is wrapped back into a DataFrame with the same column names.
scaler = MinMaxScaler()
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(tfidf_train_df),
    columns=tfidf_train_df.columns,
)

In [64]:
# Preview the scaled training features.
X_train_scaled.head()

Unnamed: 0,clothing_id,word_count,char_count,sentence_count,capitals_count,punc_count,exc_marks_count,question_marks_count,avg_word_len,avg_sentence_len,...,zipbutton,ziphoodi,ziploc,zipper,zipperi,zombi,zone,zooland,zoom,zuma
0,0.719269,0.758929,0.868421,0.102564,1.0,0.216667,0.04878,0.0,0.252395,0.160784,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.89701,0.178571,0.208134,0.051282,1.0,0.05,0.0,0.0,0.261364,0.062092,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.715116,0.651786,0.660287,0.153846,1.0,0.133333,0.02439,0.0,0.172778,0.095238,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.89701,0.330357,0.416268,0.102564,1.0,0.066667,0.0,0.166667,0.319444,0.066667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.846346,0.571429,0.593301,0.102564,1.0,0.166667,0.0,0.333333,0.188131,0.119608,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [65]:
# Apply the scaler fitted on the training frame to the test frame and restore
# the column names.
X_test_scaled = pd.DataFrame(
    scaler.transform(tfidf_test_df),
    columns=tfidf_test_df.columns,
)

In [66]:
# Apply the scaler fitted on the training frame to the validation frame and
# restore the column names.
X_val_scaled = pd.DataFrame(
    scaler.transform(tfidf_val_df),
    columns=tfidf_val_df.columns,
)

## Store Featured data

In [70]:
# Features, targets and output file names, listed in matching train/test/val order.
X_dfs = [X_train_scaled, X_test_scaled, X_val_scaled]
y_dfs = [y_train, y_test, y_val]
df_names = ['train_abt.csv', 'test_abt.csv', 'val_abt.csv']

# Create the output directory if needed. mkdir(parents=True, exist_ok=True)
# also creates missing parent directories and does not fail when the directory
# already exists, unlike a separate exists() check followed by os.mkdir.
p = Path(FEATURED_DIR)
p.mkdir(parents=True, exist_ok=True)

for x_df, y_df, df_name in zip(X_dfs, y_dfs, df_names):
    save_data(x_df, y_df, FEATURED_DIR, df_name)

## Comments