## Import Libraries and Load Data

In [1]:
# Import libraries
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import Lasso
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

# Set seed
SEED = 4031
np.random.seed(SEED)

In [2]:
# Data file locations and names

project_root_dir = "Data"
project_subdir_prefix = "split_"
train_data_filename = "train.tsv"
test_data_filename = "test.tsv"
test_y_data_filename = "test_y.tsv"


# The number of train/test data folders and the target RMSE for each
# train/test split in each folder

n_datasets = 5

In [3]:
# Get list of data subfolders, each with a separate training and test set.

os_walk = os.walk(project_root_dir)
data_subdir_list = [subdirs for root, subdirs, files in os_walk][0]
n_subdirs = len(data_subdir_list)

assert(n_subdirs == n_datasets)

In [4]:
# Lists for training and test datasets

train_datasets = []
test_datasets = []
test_y_datasets = []


# Loop over subfolders and read in training/test datasets and test weekly sales.
# Use a loop instead of using os.walk directly to avoid "fold10" immediately following "fold1".

for subdir_num in np.arange(n_subdirs) + 1:
    subdir_num_str = str(subdir_num)
    train_datasets.append(pd.read_csv(os.path.join(project_root_dir,
                                                   project_subdir_prefix + subdir_num_str,
                                                   train_data_filename), sep='\t', header=0, dtype=str))
    test_datasets.append(pd.read_csv(os.path.join(project_root_dir,
                                                   project_subdir_prefix + subdir_num_str,
                                                   test_data_filename), sep='\t', header=0, dtype=str))
    test_y_datasets.append(pd.read_csv(os.path.join(project_root_dir,
                                                   project_subdir_prefix + subdir_num_str,
                                                   test_y_data_filename), sep='\t', header=0, dtype=str))

## Data Preprocessing

### Remove punctuations

In [5]:
for i in range(len(train_datasets)):
    train_datasets[i]['review'] = train_datasets[i]['review'].str.replace('&lt;.*?&gt;', ' ', regex=True)
    test_datasets[i]['review'] = test_datasets[i]['review'].str.replace('&lt;.*?&gt;', ' ', regex=True)

### Tokeninzation

In [6]:
#pool all datasets into one large set
all_words = []
for i in range(len(train_datasets)):
    for j in range(len(train_datasets[i]['review'])):
        all_words.append(train_datasets[i]['review'][j])
    for j in range(len(test_datasets[i]['review'])):
        all_words.append(test_datasets[i]['review'][j])


In [7]:
len(all_words)

250000

In [8]:
#stop_words = ["i", "me", "my", "myself"]

In [77]:

vectorizer = CountVectorizer(
    preprocessor=lambda x: x.lower(),  # Convert to lowercase
    stop_words='english',             # Remove stop words
    ngram_range=(1, 4),               # Use 1- to 4-grams
    min_df=0.001,                        # Minimum term frequency
    max_df=0.5,                       # Maximum document frequency
    token_pattern=r'\b\w+\b'          # Use word tokenizer
)

dtm_train = vectorizer.fit_transform(all_words)

In [78]:
dtm_train

<250000x16719 sparse matrix of type '<class 'numpy.int64'>'
	with 24022745 stored elements in Compressed Sparse Row format>

### Perform Lasso Regression to find the most predicitive tokens

### Prepare all training data into one dataframe

In [79]:
all_train = pd.DataFrame()

for train_df in train_datasets:
    all_train = pd.concat([all_train, train_df], axis=0)

### Perform Lasso Regression

In [80]:
def find_best_tokens(num, c):
    for i in range(10):
        lasso_log_model = LogisticRegression(C=c, penalty='l1', solver='liblinear', max_iter=100000)  # very high max iter to ensure converge
        X_train = vectorizer.transform(all_train['review'])
        Y_train = all_train['sentiment']
        lasso_log_model.fit(X_train, Y_train)

        best_tokens = [[i, coef] for i, coef in enumerate(lasso_log_model.coef_[0]) if coef != 0]

        num_tokens = len(best_tokens)
        print(f'number of tokens: {num_tokens}')
        print(f'old c: {c}')

        if num_tokens in range(num, num+100):
            return best_tokens
        elif num_tokens > num+100:
            c = c-(num_tokens/num*0.1*c)
        elif num_tokens < num:
            c = c+(num/num_tokens*0.1*c)

        print(f'new c: {c}')
    print("Bad initial c value, try another value")
    raise Exception

In [83]:
#Find all tokens with non-zero coefficients after lasso
best_tokens = find_best_tokens(num=2000, c=0.0455)
print(f'Total tokens after regularization: {len(best_tokens)}')

copy1_best_tokens = best_tokens[:]
copy2_best_tokens = best_tokens[:]

#Sort tokens by coefficients, more extreme = more predictive
new_best_tokens = list(map(lambda x: [x[0], x[1]], copy1_best_tokens))
sorted_best_tokens = sorted(new_best_tokens, key=lambda x: x[1], reverse=True)


abs_best_tokens = list(map(lambda x: [x[0], abs(x[1])], copy2_best_tokens))
sorted_abs_best_tokens = sorted(abs_best_tokens, key=lambda x: x[1], reverse=True)

#Take the top 2000 tokens regardless of pos/neg
top_2000_tokens = sorted_abs_best_tokens[:2000]

#Take the top 50 from pos and neg for explainability
positive_tokens = sorted_best_tokens[0:50]
negative_tokens = sorted_best_tokens[-51:-1]

feature_names = vectorizer.get_feature_names_out()

#Make tokens for model
model_tokens = []

for token in top_2000_tokens:
    model_tokens.append(feature_names[token[0]])


positive_predictors = []
negative_predictors = []
#Make tokens for explainability:
for (pos, neg) in zip(positive_tokens, negative_tokens):
    positive_predictors.append(feature_names[pos[0]])
    negative_predictors.append(feature_names[neg[0]])

1519
0.0325
0.03677913100724161
1696
0.03677913100724161
0.04111629268262387
1869
0.04111629268262387
0.04551610998413526
2049
0.04551610998413526
Total tokens after regularization: 2049


## Create Customer Tokenizer from only the 2000 best tokens

In [84]:
def custom_tokenizer(text):
    tokens = text.split()
    filtered_tokens = [token for token in tokens if token in model_tokens]
    return filtered_tokens

custom_vectorizer = CountVectorizer(tokenizer=custom_tokenizer, binary=True)

In [85]:
custom_vectorizer = CountVectorizer(binary=True, vocabulary=model_tokens)

## Predict

In [86]:
accuracies = []
aucs = []

for i in range(len(train_datasets)):
    model = LogisticRegression(max_iter=10000)

    X_train = custom_vectorizer.fit_transform(train_datasets[i]['review'])
    Y_train = train_datasets[i]['sentiment']
    model.fit(X_train, Y_train)

    X_test = custom_vectorizer.fit_transform(test_datasets[i]['review'])
    Y_test = test_y_datasets[i]['sentiment']
    Y_pred = model.predict(X_test)
    
    accuracy = accuracy_score(Y_test, Y_pred)
    auc = roc_auc_score(Y_test, Y_pred)
    accuracies.append(accuracy)
    aucs.append(auc)

In [87]:
for i, (accuracy, auc) in enumerate(zip(accuracies, aucs)):
    #print(f"Accuracy of fold {i+1}: {accuracy}")
    print(f"AUC of fold {i+1}: {auc}")

AUC of fold 1: 0.8849712137216947
AUC of fold 2: 0.8819008615866393
AUC of fold 3: 0.8840851923729133
AUC of fold 4: 0.8849234143509852
AUC of fold 5: 0.8812036964142502


## Explore Tokens for Explainability

In [88]:
for i, token in enumerate(positive_predictors):
    print(f'Most predictive positive token #{i+1}: {token}')

Most predictive positive token #1: 7 10
Most predictive positive token #2: 8 10
Most predictive positive token #3: 10 10
Most predictive positive token #4: refreshing
Most predictive positive token #5: t disappointed
Most predictive positive token #6: 9 10
Most predictive positive token #7: superb
Most predictive positive token #8: excellent
Most predictive positive token #9: wonderfully
Most predictive positive token #10: funniest
Most predictive positive token #11: brilliantly
Most predictive positive token #12: gem
Most predictive positive token #13: amazing
Most predictive positive token #14: finest
Most predictive positive token #15: outstanding
Most predictive positive token #16: definitely worth
Most predictive positive token #17: hilarious
Most predictive positive token #18: pleasantly surprised
Most predictive positive token #19: highly recommended
Most predictive positive token #20: subtle
Most predictive positive token #21: fantastic
Most predictive positive token #22: incre

In [89]:
for i, token in enumerate(negative_predictors):
    print(f'Most predictive negative token #{i+1}: {token}')

Most predictive negative token #1: annoying
Most predictive negative token #2: badly
Most predictive negative token #3: ridiculous
Most predictive negative token #4: hoping
Most predictive negative token #5: t recommend
Most predictive negative token #6: t funny
Most predictive negative token #7: wooden
Most predictive negative token #8: bland
Most predictive negative token #9: wasted
Most predictive negative token #10: trite
Most predictive negative token #11: worse
Most predictive negative token #12: mess
Most predictive negative token #13: horrible
Most predictive negative token #14: boring
Most predictive negative token #15: stinker
Most predictive negative token #16: sorry
Most predictive negative token #17: lame
Most predictive negative token #18: pathetic
Most predictive negative token #19: lousy
Most predictive negative token #20: pretentious
Most predictive negative token #21: pointless
Most predictive negative token #22: mildly
Most predictive negative token #23: terrible
Mos