In [1]:
import time
import fasttext
from io import StringIO
import csv
import pandas as pd
from scipy import stats
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

import statsmodels.api as sm

from sklearn.metrics import roc_auc_score, roc_curve, accuracy_score, classification_report, confusion_matrix
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE

<h2>FastText IMDB Model</h2>

In [14]:
# Read the whole fastText-formatted IMDB corpus into memory.
with open("imdb_data/fasttext_imdb_corpus.txt", "r", encoding="utf-8") as f:
    content = f.read()

# Turn the corpus into a two-column dataframe (sentiment label, review text).
# Keeping the full corpus in one text file and splitting it later (in this
# notebook) allows different random splits to be made if desired.
#
# Any literal "|||" already present in the text is removed first, then "|||" is
# inserted right after each label so it can serve as the column separator.
content = (
    content.replace("|||", "")
    .replace("__label__positive", "__label__positive|||")
    .replace("__label__negative", "__label__negative|||")
)
df = pd.read_csv(
    StringIO(content),
    # raw string: "\|" and "\s" are regex escapes, not valid Python string
    # escapes, so a plain literal triggers an invalid-escape warning
    delimiter=r"\|\|\|\s*",
    header=None,
    names=["sentiment", "review"],
    quoting=csv.QUOTE_NONE,  # named constant for the literal 3 used before
    engine="python",  # a regex separator requires the python parsing engine
)

In [15]:
# separate the review text (model input) from the sentiment label (target)
X = df['review']
y = df['sentiment']

# stratified 70/20/10 partition: hold out 30% of the rows first, then hand
# roughly one third of that hold-out to tuning and keep the rest for testing
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)
X_test, X_tune, y_test, y_tune = train_test_split(
    X_temp, y_temp, test_size=0.333, random_state=42, stratify=y_temp
)

# rebuild one label+review dataframe per split (label column first)
train = y_train.to_frame().join(X_train)  # training set
test = y_test.to_frame().join(X_test)     # testing set
tune = y_tune.to_frame().join(X_tune)     # validation set

# parallel lists iterated over later: setnames[k] is the name of sets[k]
setnames = ['train', 'test', 'tune']
sets = [train, test, tune]

In [16]:
# Write every split to disk for fastText.
# The loop variable is deliberately NOT called `set`: at notebook top level that
# would rebind the builtin `set` for the rest of the kernel session.
for set_name, split_df in zip(setnames, sets):
    temp_path = f"imdb_data/fasttext_{set_name}_temp.txt"

    # dump the split as space-separated "<label> <review>" rows; the escapechar
    # is itself a space, which leaves extra space padding inside the rows
    split_df.to_csv(
        temp_path,
        index=False,
        sep=" ",
        header=False,
        escapechar=" ",
        quoting=csv.QUOTE_NONE,
        encoding="utf-8",  # explicit, to match the read below
    )

    with open(temp_path, "r", encoding="utf-8") as f:
        lines = f.readlines()  # read the raw rows back

    # clean space padding (one non-overlapping pass, as before)
    cleaned_lines = [line.replace("  ", " ") for line in lines]

    # the tuning split is stored with a ".valid" extension, the others as ".txt"
    extension = "valid" if set_name == 'tune' else "txt"
    with open(f"imdb_data/fasttext_{set_name}.{extension}", "w", encoding="utf-8") as f:
        f.writelines(cleaned_lines)

In [17]:
# train the supervised IMDB classifier on the training split; passing a
# validation file asks fastText to autotune against the tuning split
imdb_model = fasttext.train_supervised(
    input='imdb_data/fasttext_train.txt',
    autotuneValidationFile='imdb_data/fasttext_tune.valid',
)

In [18]:
# evaluate on the held-out test split; the returned tuple is shown as the cell
# output (per the fastText docs: number of examples, precision, recall)
imdb_model.test(path="imdb_data/fasttext_test.txt")

(10005, 0.9014492753623189, 0.9014492753623189)

<h2>FastText Yelp Polarity Model</h2>

In [8]:
# Read the whole fastText-formatted Yelp corpus into memory.
with open("yelp_data/fasttext_yelp_corpus.txt", "r", encoding="utf-8") as f:
    content = f.read()

# Turn the corpus into a two-column dataframe (sentiment label, review text).
# Keeping the full corpus in one text file and splitting it later (in this
# notebook) allows different random splits to be made if desired.
#
# Any literal "|||" already present in the text is removed first, then "|||" is
# inserted right after each label so it can serve as the column separator.
content = (
    content.replace("|||", "")
    .replace("__label__positive", "__label__positive|||")
    .replace("__label__negative", "__label__negative|||")
)
df = pd.read_csv(
    StringIO(content),
    # raw string: "\|" and "\s" are regex escapes, not valid Python string
    # escapes, so a plain literal triggers an invalid-escape warning
    delimiter=r"\|\|\|\s*",
    header=None,
    names=["sentiment", "review"],
    quoting=csv.QUOTE_NONE,  # named constant for the literal 3 used before
    engine="python",  # a regex separator requires the python parsing engine
)

In [9]:
# separate the review text (model input) from the sentiment label (target)
X = df['review']
y = df['sentiment']

# stratified 70/20/10 partition: hold out 30% of the rows first, then hand
# roughly one third of that hold-out to tuning and keep the rest for testing
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)
X_test, X_tune, y_test, y_tune = train_test_split(
    X_temp, y_temp, test_size=0.333, random_state=42, stratify=y_temp
)

# rebuild one label+review dataframe per split (label column first)
train = y_train.to_frame().join(X_train)  # training set
test = y_test.to_frame().join(X_test)     # testing set
tune = y_tune.to_frame().join(X_tune)     # validation set

# parallel lists iterated over later: setnames[k] is the name of sets[k]
setnames = ['train', 'test', 'tune']
sets = [train, test, tune]

In [10]:
# Write every split to disk for fastText.
# The loop variable is deliberately NOT called `set`: at notebook top level that
# would rebind the builtin `set` for the rest of the kernel session.
for set_name, split_df in zip(setnames, sets):
    temp_path = f"yelp_data/fasttext_{set_name}_temp.txt"

    # dump the split as space-separated "<label> <review>" rows; the escapechar
    # is itself a space, which leaves extra space padding inside the rows
    split_df.to_csv(
        temp_path,
        index=False,
        sep=" ",
        header=False,
        escapechar=" ",
        quoting=csv.QUOTE_NONE,
        encoding="utf-8",  # explicit, to match the read below
    )

    with open(temp_path, "r", encoding="utf-8") as f:
        lines = f.readlines()  # read the raw rows back

    # clean space padding (one non-overlapping pass, as before)
    cleaned_lines = [line.replace("  ", " ") for line in lines]

    # final cleaned split is stored as "fasttext_<set name>.txt"
    # (train, test and tune all use the ".txt" extension here)
    with open(f"yelp_data/fasttext_{set_name}.txt", "w", encoding="utf-8") as f:
        f.writelines(cleaned_lines)

In [11]:
# train the supervised Yelp polarity classifier on the training split
# (no autotune validation file is passed for this model)
yelp_model = fasttext.train_supervised(
    input='yelp_data/fasttext_train.txt',
)

In [13]:
# evaluate on the held-out test split; the returned tuple is shown as the cell
# output (per the fastText docs: number of examples, precision, recall)
yelp_model.test(path="yelp_data/fasttext_test.txt")

(112056, 0.9309006211180124, 0.9309006211180124)