## Import Libraries and Load Data

In [101]:
# Import libraries
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import Lasso
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

# Set seed
SEED = 4031
np.random.seed(SEED)

In [10]:
# Data file locations and names

project_root_dir = "Data"
project_subdir_prefix = "split_"
train_data_filename = "train.tsv"
test_data_filename = "test.tsv"
test_y_data_filename = "test_y.tsv"


# The number of train/test data folders and the target RMSE for each
# train/test split in each folder

n_datasets = 5

In [11]:
# Get list of data subfolders, each with a separate training and test set.

os_walk = os.walk(project_root_dir)
data_subdir_list = [subdirs for root, subdirs, files in os_walk][0]
n_subdirs = len(data_subdir_list)

assert(n_subdirs == n_datasets)

In [12]:
# Lists for training and test datasets

train_datasets = []
test_datasets = []
test_y_datasets = []


# Loop over subfolders and read in training/test datasets and test weekly sales.
# Use a loop instead of using os.walk directly to avoid "fold10" immediately following "fold1".

for subdir_num in np.arange(n_subdirs) + 1:
    subdir_num_str = str(subdir_num)
    train_datasets.append(pd.read_csv(os.path.join(project_root_dir,
                                                   project_subdir_prefix + subdir_num_str,
                                                   train_data_filename), sep='\t', header=0, dtype=str))
    test_datasets.append(pd.read_csv(os.path.join(project_root_dir,
                                                   project_subdir_prefix + subdir_num_str,
                                                   test_data_filename), sep='\t', header=0, dtype=str))
    test_y_datasets.append(pd.read_csv(os.path.join(project_root_dir,
                                                   project_subdir_prefix + subdir_num_str,
                                                   test_y_data_filename), sep='\t', header=0, dtype=str))

## Data Preprocessing

### Remove punctuations

In [13]:
for i in range(len(train_datasets)):
    train_datasets[i]['review'] = train_datasets[i]['review'].str.replace('&lt;.*?&gt;', ' ', regex=True)
    test_datasets[i]['review'] = test_datasets[i]['review'].str.replace('&lt;.*?&gt;', ' ', regex=True)

### Tokeninzation

In [19]:
#pool all datasets into one large set
all_words = []
for i in range(len(train_datasets)):
    for j in range(len(train_datasets[i]['review'])):
        all_words.append(train_datasets[i]['review'][j])
    for j in range(len(test_datasets[i]['review'])):
        all_words.append(test_datasets[i]['review'][j])


In [21]:
len(all_words)

250000

In [22]:
#stop_words = ["i", "me", "my", "myself"]

In [24]:

vectorizer = CountVectorizer(
    preprocessor=lambda x: x.lower(),  # Convert to lowercase
    stop_words={'english'},             # Remove stop words
    ngram_range=(1, 4),               # Use 1- to 4-grams
    min_df=0.001,                        # Minimum term frequency
    max_df=0.5,                       # Maximum document frequency
    token_pattern=r'\b\w+\b'          # Use word tokenizer
)

dtm_train = vectorizer.fit_transform(all_words)

In [25]:
dtm_train

<250000x49489 sparse matrix of type '<class 'numpy.int64'>'
	with 64668585 stored elements in Compressed Sparse Row format>

### Perform Lasso Regression to find the most predicitive tokens

In [86]:
lasso_model = Lasso(alpha=0.00025, max_iter=100000)  # very high max iter to ensure converge
X_train = vectorizer.transform(train_datasets[0]['review'])
Y_train = train_datasets[0]['sentiment']
lasso_model.fit(X_train, Y_train)

In [92]:
best_tokens = [[i, coef] for i, coef in enumerate(lasso_model.coef_) if coef != 0]

In [93]:
len(best_tokens)

1551

In [94]:

abs_best_tokens = list(map(lambda x: [x[0], abs(x[1])], best_tokens))
sorted_abs_best_tokens = sorted(abs_best_tokens, key=lambda x: x[1], reverse=True)
top_2000_tokens = sorted_abs_best_tokens[:2000]
top_2000_tokens

[[317, 0.15339030967529765],
 [276, 0.1331001315143869],
 [12032, 0.1251585492015079],
 [27458, 0.1204344066261232],
 [269, 0.12043242666316074],
 [47010, 0.1134759429667018],
 [46543, 0.10816715873162418],
 [1222, 0.1049828179746218],
 [236, 0.10218255298625062],
 [45188, 0.09720453692685299],
 [25397, 0.09118578715908515],
 [13816, 0.08848222118090653],
 [247, 0.08567510763530746],
 [14133, 0.08544060059669922],
 [19629, 0.08502877418857074],
 [27359, 0.08371041052352068],
 [26581, 0.08316292317846757],
 [36714, 0.08311561088444129],
 [27552, 0.0823581956314936],
 [12688, 0.08098112484342535],
 [320, 0.08027190727621049],
 [12031, 0.07951385155082787],
 [16198, 0.07927624662890814],
 [330, 0.07806335539200374],
 [15720, 0.0774188282540241],
 [11494, 0.07719879111689233],
 [48630, 0.07700304961212005],
 [24596, 0.07684104785531211],
 [12710, 0.07439573049890666],
 [12618, 0.0742733626099999],
 [316, 0.07384970325209814],
 [30960, 0.0737957343377927],
 [382, 0.07229304219318801],
 [189

In [90]:
feature_names = vectorizer.get_feature_names_out()

In [91]:
model_tokens = []

for token in top_2000_tokens:
    model_tokens.append(feature_names[token[0]])

model_tokens


['7 10',
 '4 out of 10',
 'disappointment',
 'not recommend',
 '4 10',
 'well worth',
 'waste',
 'a must',
 '3 10',
 'unfunny',
 'mediocre',
 'excellent',
 '3 out of 10',
 'fails',
 'impressed',
 'not funny',
 'mst3k',
 'superb',
 'not worth',
 'dub',
 '7 out of',
 'disappointing',
 'gem',
 '8',
 'forgettable',
 'definitely worth',
 'worst',
 'love this',
 'dull',
 'dreadful',
 '7',
 'poorly',
 'a 7',
 'hooked',
 'solid',
 'hilarious',
 'loved this',
 'subtle',
 'awful',
 'favorite',
 'pretentious',
 'wasted',
 'and funny',
 'wonderfully',
 'avoid',
 'sorry',
 'redeeming',
 'poor',
 'surprised',
 'recommended',
 'lacks',
 'boring',
 'at best',
 'bravo',
 '10 10',
 'enjoyable',
 'worse',
 'save',
 'ridiculous',
 'very funny',
 'perfect',
 'enjoyed this',
 'weak',
 'unless',
 'mildly',
 'finest',
 'predictable',
 'on dvd',
 'the problem',
 'laughable',
 '2 10',
 'wonderful',
 'outstanding',
 'available',
 '9',
 'loved it',
 'unfortunately',
 'unconvincing',
 'insult to',
 'enjoyed',
 'hi

## Create Customer Tokenizer from only the 2000 best tokens

In [95]:
def custom_tokenizer(text):
    tokens = text.split()
    filtered_tokens = [token for token in tokens if token in model_tokens]
    return filtered_tokens

custom_vectorizer = CountVectorizer(tokenizer=custom_tokenizer, binary=True)

In [96]:
custom_vectorizer = CountVectorizer(binary=True, vocabulary=model_tokens)

## Predict

In [102]:
accuracies = []
aucs = []

for i in range(len(train_datasets)):
    model = LogisticRegression()

    X_train = custom_vectorizer.fit_transform(train_datasets[i]['review'])
    Y_train = train_datasets[i]['sentiment']
    model.fit(X_train, Y_train)

    X_test = custom_vectorizer.fit_transform(test_datasets[i]['review'])
    Y_test = test_y_datasets[i]['sentiment']
    Y_pred = model.predict(X_test)
    
    accuracy = accuracy_score(Y_test, Y_pred)
    auc = roc_auc_score(Y_test, Y_pred)
    accuracies.append(accuracy)
    aucs.append(auc)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [104]:
for i, (accuracy, auc) in enumerate(zip(accuracies, aucs)):
    print(f"Accuracy of fold {i+1}: {accuracy}")
    print(f"AUC of fold {i+1}: {auc}")

Accuracy of fold 1: 0.87348
AUC of fold 1: 0.8734919377639848
Accuracy of fold 2: 0.87908
AUC of fold 2: 0.8790547604715617
Accuracy of fold 3: 0.88048
AUC of fold 3: 0.8804879538576982
Accuracy of fold 4: 0.87952
AUC of fold 4: 0.8795245308956998
Accuracy of fold 5: 0.8804
AUC of fold 5: 0.8803484941873898
