In [16]:
import spacy
import pandas as pd
import glob
import os
import numpy as np

import seaborn as sns
sns.set_style('darkgrid')

import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
#from nltk.corpus import twitter_samples, stopwords
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from nltk import FreqDist, classify, NaiveBayesClassifier
nltk.download('punkt')

from wordcloud import WordCloud


import re, string, random

%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib

#setting pandas display options
pd.set_option('display.max_columns', None)  # or 1000
pd.set_option('display.max_rows', None)  # or 1000
pd.set_option('display.max_colwidth', -1)  # or 199

BASE_DIR = "/Users/karinalopez/Desktop/ds_projects/nlp/data/"

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/karinalopez/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  pd.set_option('display.max_colwidth', -1)  # or 199


In [5]:
os.chdir(BASE_DIR + 'intermediate/')
df = pd.read_csv('athleisure_ethics_dataset_message_text_preprocessed.csv')


In [7]:
df.columns

Index(['Unnamed: 0', 'brand_name', 'parent_ company', 'notes', 'message_text',
       'message_page_name', 'message_link', 'message_collected_data',
       'good_on_you_date_sourced', 'pricepoint', 'overall_ethic_rating',
       'good_on_you _link', 'Twitter Followers', 'FB Page Likes (total)',
       'IG Followers', 'pinterest', 'social_media_numbers date sourced',
       '# social media accounts on website (youtube, facebook, twitter, instagram, pinterest, tiktok, linkedin, strava, vimeo, tumbler, spotify, snapchat or weibo)',
       'notes (optional)',
       'get a number of different accounts posted; might need to hover to get the real number',
       'message_text_sentence_count', 'message_text_preprocessed', 'tokenized',
       'message_text_words_count', 'token_words_count'],
      dtype='object')

In [32]:
text_df = df[['brand_name', 'message_text', 'overall_ethic_rating']]

# split text by periods into new rows

In [33]:
# Getting s as pandas series which has split on full stop and new sentence a new line
s = text_df["message_text"].str.split('.').apply(pd.Series,1).stack()
s.index = s.index.droplevel(-1) # to line up with df's index
s.name = 'message_text' # needs a name to join


In [34]:
# There are blank or emplty cell values after above process. Removing them
s.replace('', np.nan, inplace = True)
s.dropna(inplace = True) 
 

In [35]:
del text_df['message_text']
text_df = text_df.join(s)
text_df.head(10)


Unnamed: 0,brand_name,overall_ethic_rating,message_text
0,girlfriend collective,great,When we started Girlfriend Collective our first goal was to be as transparent as possible
0,girlfriend collective,great,So many companies tout transparency but only offer flashy headlines instead of substance
0,girlfriend collective,great,"We chose every part of our process, from our raw materials to our facilities to our partners, with care"
0,girlfriend collective,great,"We also discovered quickly that high end fit and feel is not a matter of cost, it’s a matter of time"
0,girlfriend collective,great,We take the time to make sure every single one of our designs is so beautiful that you won’t cycle through it the next time you look through your closet
0,girlfriend collective,great,"Beyond that, we wanted to find a community of people who cared about where their clothes come from as much as how they look"
0,girlfriend collective,great,We're lucky to have found you
0,girlfriend collective,great,"Take a look around, we're glad you're here"
0,girlfriend collective,great,"Is everything eco-friendly? We're like the earth's number one fan, so being eco-friendly is at the top of our priorities, as is giving you as much information as possible"
0,girlfriend collective,great,Here’s a breakdown of each of our products and how it stacks up


In [37]:
os.chdir(BASE_DIR + 'intermediate/')
#text_df.to_csv('sentence_level_text_data.csv', index = False)

# Quick and dirty preprocessing

In [43]:
os.chdir(BASE_DIR + 'intermediate/')
#df = pd.read_csv('sentence_level_text_data.csv')
df = pd.read_csv('sentence_level_text_data.csv',
                 lineterminator = '\n')


In [51]:
# remove trailing white space
df['message_text'] = df['message_text'].str.strip()

# remove rows that contain only numbers
df = df[~df['message_text'].str.isnumeric()]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['message_text'] = df['message_text'].str.strip()


In [None]:
os.chdir(BASE_DIR + 'intermediate/')
#df.to_csv('athleisure_ethics_dataset_message_text_preprocessed_2.csv', index = False)


# ran preprocessing row data script here

# map new values to each rating

In [97]:
os.chdir(BASE_DIR + 'intermediate/')
df = pd.read_csv('athleisure_ethics_dataset_message_text_preprocessed_3.csv')



In [94]:
#df['overall_ethic_rating'] = df['overall_ethic_rating'].str.replace("'", '')



In [95]:
df['overall_ethic_rating'].value_counts()

not good enough    4050
good               2285
its a start        1469
we avoid           961 
great              630 
Name: overall_ethic_rating, dtype: int64

In [98]:
rating_dict = {'we avoid': 'not ethical', 'not good enough': 'not ethical', "it's a start": 'ethical', 'good': 'ethical', 'great': 'ethical'}
df['overall_ethic_rating'].value_counts()


not good enough    4050
good               2285
it's a start       1469
we avoid           961 
great              630 
Name: overall_ethic_rating, dtype: int64

In [99]:
df['likely_rating'] = df['overall_ethic_rating'].replace(rating_dict)


In [100]:
df['likely_rating'].value_counts()

not ethical    5011
ethical        4384
Name: likely_rating, dtype: int64

In [112]:
# Check which ones are null
null_df = df[df.isna().any(axis=1)]
#display(null_df)


# Build a dirty model

In [122]:
#Now lets split the data
from sklearn.model_selection import train_test_split

data = df[['title_preprocessed','likely_rating']]

In [123]:
# Drop rows with nulls in 'title_preprocessed','likely_rating'
data = df.dropna() 



In [124]:
print(data.shape)

(7091, 6)


In [125]:
train, test = train_test_split(data, shuffle = True, stratify = data.likely_rating, train_size = 50/data.shape[0], random_state = 50)


In [126]:
test, _ = train_test_split(test, shuffle = True, 
                           stratify = test.likely_rating, train_size = 7000/test.shape[0], random_state = 50)



In [128]:
train.shape, test.shape







((50, 6), (7000, 6))

In [137]:
# Adverserial Validation
from scipy import sparse
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import StratifiedKFold


def adversarial_validation(X, Y, n_splits = 10):
    
    # Combine both datasets
    sparse_merge = sparse.vstack((X, Y))
    
    # Label the datasets
    y = np.array([0 for _ in range(X.shape[0])] + [1 for _ in range(Y.shape[0])])
    
    # Do 10 Fold CV 
    kfold = StratifiedKFold(n_splits = n_splits, shuffle = True)

    lr_auc = np.array([])
    rf_auc = np.array([])
    for train_idx, test_idx in kfold.split(sparse_merge, y):
    
        #Run Log Reg
        x_train, y_train = sparse_merge[train_idx], y[train_idx]
        x_test, y_test = sparse_merge[test_idx], y[test_idx]
        
        log_reg = SGDClassifier(loss = 'log')
        log_reg.fit(x_train, y_train)
        y_test_prob = log_reg.predict_proba(x_test)[:,1]
        lr_auc = np.append(lr_auc, roc_auc_score(y_test, y_test_prob))
        # Run RF
        rf = RandomForestClassifier(n_estimators = 100, n_jobs = -1)
        rf.fit(x_train, y_train)
        y_test_prob = rf.predict_proba(x_test)[:,1]
        rf_auc = np.append(rf_auc, roc_auc_score(y_test, y_test_prob))

    
    # Display results
    print('Logisitic Regression AUC : {:.3f}'.format(lr_auc.mean()))
    print('Random Forest AUC : {:.3f}'.format(rf_auc.mean()))
    
    

In [138]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.utils import shuffle 

bow = CountVectorizer()
x_train = bow.fit_transform(train.title_preprocessed.values)
x_test = bow.transform(test.title_preprocessed.values)

adversarial_validation(x_train, x_test[:50])


Logisitic Regression AUC : 0.374
Random Forest AUC : 0.454


In [None]:
# Check how balanced datasets are

https://towardsdatascience.com/text-classification-with-extremely-small-datasets-333d322caee2