# Text Analysis for Women's E-Commerce Clothing Reviews

## Libraries and Settings

In [None]:
import os
import numpy as np
import pandas as pd
from pathlib import Path

# Feature engineering
import string
import nltk
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer

%config Completer.use_jedi = False

## Variables

In [None]:
# Input/output directories, resolved relative to the parent of the working directory.
PROCESSED_DIR = os.path.join(os.pardir, 'data', 'processed')  # train/test/val CSVs are read from here
FEATURED_DIR = os.path.join(os.pardir, 'data', 'featured')    # the feature tables are written here

# Seed value; not referenced by any cell visible in this notebook.
RANDOM_STATE = 8

# Name of the label column; get_abt_df moves it to the last position.
TARGET = 'recommended_ind'

# Text columns removed from the frame when the final tables are assembled.
VARIABLES_DROP = [
    'review_text',
    'review_lower',
    'review_nopct',
    'review_nodg',
    'review_word_tokens',
    'review_no_sw',
    'review_stem',
    'review_lem',
]

## Helpers

In [None]:
def load_data(path, filename):
    """Read the CSV file ``filename`` located in directory ``path``.

    Args:
        path: Directory containing the file.
        filename: Name of the CSV file inside ``path``.

    Returns:
        The file contents as a pandas DataFrame (default ``read_csv`` options).
    """
    return pd.read_csv(os.path.join(path, filename))

def get_count_words(s):
    """Return the number of single-space-separated tokens in ``s``.

    ``s`` is converted with ``str`` first. Only the space character is a
    separator, so consecutive spaces yield empty tokens that are counted,
    other whitespace (tabs, newlines) does not split, and the result is
    always at least 1 (an empty string counts as one token).
    """
    text = str(s)
    # Splitting on " " always produces one more piece than there are spaces.
    return text.count(" ") + 1

def get_count_char(s):
    """Return the number of characters in ``s`` excluding space characters.

    ``s`` is converted with ``str`` first. Only the plain space is excluded;
    tabs, newlines and punctuation are all counted.
    """
    text = str(s)
    # Total length minus the spaces equals the summed length of the tokens.
    return len(text) - text.count(" ")

def get_count_sents(s):
    """Return the number of period-delimited segments in ``s``.

    ``s`` is converted with ``str`` first. The result is the number of "."
    characters plus one, so a trailing period adds an (empty) segment and
    text without any period counts as one segment.
    """
    text = str(s)
    return text.count(".") + 1

def get_count_exc_marks(s):
    """Return the number of "!" characters in ``s``.

    ``s`` is converted with ``str`` first, like the other count helpers, so
    a non-string value (e.g. the float NaN pandas uses for a missing review)
    yields a count instead of raising ``AttributeError``.
    """
    return str(s).count('!')

def get_count_question_marks(s):
    """Return the number of "?" characters in ``s``.

    ``s`` is converted with ``str`` first, like the other count helpers, so
    a non-string value (e.g. the float NaN pandas uses for a missing review)
    yields a count instead of raising ``AttributeError``.
    """
    return str(s).count('?')
    
# Punctuation characters counted by get_count_pct. "!" and "?" are not in this
# set; they have their own dedicated counters.
_PCT_CHARS = frozenset('"#$%&\'()*+,-./:;<=>@[\\]^_`{|}~')


def get_count_pct(s):
    """Return the number of punctuation characters in ``s``.

    Counts every character found in ``_PCT_CHARS``. ``s`` is converted with
    ``str`` first, like the other count helpers, so a non-string value (e.g.
    a float NaN for a missing review) yields a count instead of raising
    ``TypeError``.
    """
    # A generator avoids building a throwaway list just to measure its length.
    return sum(1 for ch in str(s) if ch in _PCT_CHARS)

def get_count_cap(s):
    """Return the number of uppercase characters in ``s``.

    ``s`` is converted with ``str`` first, like the other count helpers, so
    a non-string value (e.g. a float NaN for a missing review) yields a
    count instead of raising ``TypeError``.
    """
    return sum(1 for ch in str(s) if ch.isupper())

def get_polarity(s):
    """Return the TextBlob sentiment polarity score of ``s``.

    ``s`` is converted with ``str`` first, like the count helpers, because
    TextBlob only accepts strings and would otherwise reject a non-string
    value such as the float NaN of a missing review.
    """
    return TextBlob(str(s)).sentiment.polarity

def get_subjectivity(s):
    """Return the TextBlob sentiment subjectivity score of ``s``.

    ``s`` is converted with ``str`` first, like the count helpers, because
    TextBlob only accepts strings and would otherwise reject a non-string
    value such as the float NaN of a missing review.
    """
    return TextBlob(str(s)).sentiment.subjectivity

def get_text_features(df):
    """Return a copy of ``df`` extended with surface-level text statistics.

    All statistics are computed from the ``review_text`` column. The added
    columns, in order, are: ``word_count``, ``char_count``,
    ``sentence_count``, ``capitals_count``, ``punc_count``,
    ``exc_marks_count``, ``question_marks_count``, ``avg_word_len``,
    ``avg_sentence_len`` and ``avg_cap_len``.

    Args:
        df: DataFrame containing a ``review_text`` column.

    Returns:
        A new DataFrame; ``df`` itself is not modified.
    """
    df_copy = df.copy()
    reviews = df_copy['review_text']

    # Raw counts, one helper per column.
    df_copy['word_count'] = reviews.apply(get_count_words)
    df_copy['char_count'] = reviews.apply(get_count_char)
    df_copy['sentence_count'] = reviews.apply(get_count_sents)
    df_copy['capitals_count'] = reviews.apply(get_count_cap)
    df_copy['punc_count'] = reviews.apply(get_count_pct)
    df_copy['exc_marks_count'] = reviews.apply(get_count_exc_marks)
    df_copy['question_marks_count'] = reviews.apply(get_count_question_marks)

    # Ratios derived from the counts above.
    df_copy['avg_word_len'] = df_copy['char_count'] / df_copy['word_count']
    df_copy['avg_sentence_len'] = df_copy['word_count'] / df_copy['sentence_count']
    # Column-wise division like the two ratios above: same values as a
    # row-by-row float division, but far faster than apply(axis=1), valid on
    # an empty frame, and a zero denominator gives inf/NaN rather than
    # raising ZeroDivisionError.
    df_copy['avg_cap_len'] = df_copy['capitals_count'] / df_copy['word_count']

    return df_copy

def get_nlp_features(df):
    """Return a copy of ``df`` with TextBlob sentiment columns added.

    Adds ``polarity`` and ``subjectivity``, both computed from the
    ``review_text`` column.

    Args:
        df: DataFrame containing a ``review_text`` column.

    Returns:
        A new DataFrame; ``df`` itself is not modified.
    """
    df_copy = df.copy()

    # Analyse each review once and read both scores from the same result,
    # instead of building a separate TextBlob per score. str() mirrors the
    # coercion used by the count helpers, since TextBlob rejects non-strings.
    sentiments = [TextBlob(str(text)).sentiment for text in df_copy['review_text']]
    df_copy['polarity'] = [sent.polarity for sent in sentiments]
    df_copy['subjectivity'] = [sent.subjectivity for sent in sentiments]

    return df_copy

def get_abt_df(df, tfidf, features, target, drop_cols):
    """Combine a feature frame with its TF-IDF matrix into one table.

    Args:
        df: DataFrame holding the engineered features and the target column.
        tfidf: Matrix object exposing ``toarray()`` with one row per row of
            ``df``, in the same order.
        features: Column names for the TF-IDF matrix, one per matrix column.
        target: Name of the target column; it is moved to the last position.
        drop_cols: Columns of ``df`` to leave out of the result.

    Returns:
        A new DataFrame with the remaining ``df`` columns, then the TF-IDF
        columns, then ``target``, carrying the index of ``df``.

    Raises:
        ValueError: If ``tfidf`` and ``df`` have different numbers of rows.
    """
    tfidf_plain = tfidf.toarray()
    if tfidf_plain.shape[0] != len(df):
        raise ValueError(
            "tfidf has %d rows but df has %d" % (tfidf_plain.shape[0], len(df))
        )
    tfidf_df = pd.DataFrame(tfidf_plain, columns=features)

    # Rows are paired by position: the matrix rows follow the order of df,
    # not its index labels. Resetting the index before the merge keeps every
    # row even when df does not carry the default 0..n-1 index.
    # drop() already returns a new frame, so df is left untouched.
    base = df.drop(columns=drop_cols).reset_index(drop=True)
    abt_df = pd.merge(base, tfidf_df, left_index=True, right_index=True)
    abt_df.index = df.index

    # Put the target column last.
    cols = [col for col in abt_df if col != target] + [target]
    return abt_df[cols]

def save_data(df, path, filename):
    """Write ``df`` as CSV to ``filename`` inside directory ``path``.

    The index is not written. The directory must already exist.
    """
    df.to_csv(os.path.join(path, filename), index=False)

## Load Data

In [None]:
# Load the train, test and validation splits from PROCESSED_DIR.
train = load_data(PROCESSED_DIR, 'train.csv')
test = load_data(PROCESSED_DIR, 'test.csv')
val = load_data(PROCESSED_DIR, 'val.csv')

In [None]:
# Preview the first rows of the training split.
train.head()

## Feature Engineering

### Text Features

In [None]:
# Add the count/ratio text statistics to the training split.
train_text_feats = get_text_features(train)

In [None]:
# Preview the training split with the new text-statistic columns.
train_text_feats.head()

In [None]:
# Apply the same text statistics to the test and validation splits.
test_text_feats = get_text_features(test)
val_text_feats = get_text_features(val)

### More NLP-based features

**TODO: Add part-of-speech (POS) tagging features**

In [None]:
# Add the polarity and subjectivity columns on top of the text statistics.
train_nlp_feats = get_nlp_features(train_text_feats)

In [None]:
# Preview the training split with the sentiment columns added.
train_nlp_feats.head()

In [None]:
# Apply the same sentiment features to the test and validation splits.
test_nlp_feats = get_nlp_features(test_text_feats)
val_nlp_feats = get_nlp_features(val_text_feats)

### TF-IDF Feature


In [None]:
# Fit the vectorizer on the training split only; the other splits are only
# transformed with it afterwards. fit_transform learns the vocabulary/IDF
# weights and returns the training matrix in one call, replacing a separate
# fit() followed by transform() on the same column.
tf_idf_vectorizer = TfidfVectorizer()
tf_idf_train_matrix = tf_idf_vectorizer.fit_transform(train_nlp_feats['review_lem'])

In [None]:
# Display the training TF-IDF matrix as the cell output.
tf_idf_train_matrix

In [None]:
# Transform the test and validation splits with the vectorizer fitted on the
# training split (no re-fitting here).
tf_idf_test_matrix = tf_idf_vectorizer.transform(test_nlp_feats['review_lem'])
tf_idf_val_matrix = tf_idf_vectorizer.transform(val_nlp_feats['review_lem'])

### TODO: Add Topic Modelling

### TODO: Word Embeddings

## ABTs

In [None]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older releases that
# do not provide it.
if hasattr(tf_idf_vectorizer, 'get_feature_names_out'):
    FEATURES = list(tf_idf_vectorizer.get_feature_names_out())
else:
    FEATURES = tf_idf_vectorizer.get_feature_names()

# Build one table per split: engineered features + TF-IDF columns + target.
abt_train = get_abt_df(train_nlp_feats, tf_idf_train_matrix, FEATURES, TARGET, VARIABLES_DROP)
abt_test = get_abt_df(test_nlp_feats, tf_idf_test_matrix, FEATURES, TARGET, VARIABLES_DROP)
abt_val = get_abt_df(val_nlp_feats, tf_idf_val_matrix, FEATURES, TARGET, VARIABLES_DROP)

In [None]:
# Preview the assembled training table.
abt_train.head()

In [None]:
# Preview the assembled test table.
abt_test.head()

In [None]:
# Preview the assembled validation table.
abt_val.head()

## Store Featured data

In [None]:
abts = [abt_train, abt_test, abt_val]
fnames = ['train.csv', 'test.csv', 'val.csv']

# Create the output directory if needed. parents=True also creates missing
# parent directories, and exist_ok=True removes the gap between checking for
# the directory and creating it.
p = Path(FEATURED_DIR)
p.mkdir(parents=True, exist_ok=True)

# Write each table to FEATURED_DIR under its split name.
for df, fname in zip(abts, fnames):
    save_data(df=df, path=FEATURED_DIR, filename=fname)

## Comments