## Import Libraries and Load Data

In [1]:
# Import libraries
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import Lasso
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

# Set seed
SEED = 4031
np.random.seed(SEED)

In [2]:
# Data file locations and names

project_root_dir = "Data"
project_subdir_prefix = "split_"
train_data_filename = "train.tsv"
test_data_filename = "test.tsv"
test_y_data_filename = "test_y.tsv"


# The number of train/test data folders and the target RMSE for each
# train/test split in each folder

n_datasets = 5

In [3]:
# Get list of data subfolders, each with a separate training and test set.

os_walk = os.walk(project_root_dir)
data_subdir_list = [subdirs for root, subdirs, files in os_walk][0]
n_subdirs = len(data_subdir_list)

assert(n_subdirs == n_datasets)

In [4]:
# Lists for training and test datasets

train_datasets = []
test_datasets = []
test_y_datasets = []


# Loop over subfolders and read in training/test datasets and test weekly sales.
# Use a loop instead of using os.walk directly to avoid "fold10" immediately following "fold1".

for subdir_num in np.arange(n_subdirs) + 1:
    subdir_num_str = str(subdir_num)
    train_datasets.append(pd.read_csv(os.path.join(project_root_dir,
                                                   project_subdir_prefix + subdir_num_str,
                                                   train_data_filename), sep='\t', header=0, dtype=str))
    test_datasets.append(pd.read_csv(os.path.join(project_root_dir,
                                                   project_subdir_prefix + subdir_num_str,
                                                   test_data_filename), sep='\t', header=0, dtype=str))
    test_y_datasets.append(pd.read_csv(os.path.join(project_root_dir,
                                                   project_subdir_prefix + subdir_num_str,
                                                   test_y_data_filename), sep='\t', header=0, dtype=str))

## Data Preprocessing

### Remove punctuations

In [5]:
for i in range(len(train_datasets)):
    train_datasets[i]['review'] = train_datasets[i]['review'].str.replace('&lt;.*?&gt;', ' ', regex=True)
    test_datasets[i]['review'] = test_datasets[i]['review'].str.replace('&lt;.*?&gt;', ' ', regex=True)

### Tokeninzation

In [14]:
#pool all datasets into one large set
all_words = []
for i in range(len(train_datasets)):
    for j in range(len(train_datasets[i]['review'])):
        all_words.append(train_datasets[i]['review'][j])
    for j in range(len(test_datasets[i]['review'])):
        all_words.append(test_datasets[i]['review'][j])


In [15]:
all_words

["Naturally in a film who's main themes are of mortality, nostalgia, and loss of innocence it is perhaps not surprising that it is rated more highly by older viewers than younger ones. However there is a craftsmanship and completeness to the film which anyone can enjoy. The pace is steady and constant, the characters full and engaging, the relationships and interactions natural showing that you do not need floods of tears to show emotion, screams to show fear, shouting to show dispute or violence to show anger. Naturally Joyce's short story lends the film a ready made structure as perfect as a polished diamond, but the small changes Huston makes such as the inclusion of the poem fit in neatly. It is truly a masterpiece of tact, subtlety and overwhelming beauty.",
 "Afraid of the Dark left me with the impression that several different screenplays were written, all too short for a feature length film, then spliced together clumsily into this Frankenstein's monster.<br /><br />At his best

### Stemming and Lemmatization

In [9]:

from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
import nltk
nltk.download('stopwords')
nltk.download('punkt')

stop_words = set(stopwords.words('english'))

stemmer = PorterStemmer()


def stemming_lemmatizing_tokenizer(text):
    tokens = nltk.word_tokenize(text)
    stemmed_tokens = [stemmer.stem(token) for token in tokens] 
    return stemmed_tokens


corpus = [stemming_lemmatizing_tokenizer(text) for text in all_words]


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\spect\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\spect\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\spect\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [32]:
stemmed_corpus = []
for i, sentence in enumerate(corpus):
    stemmed_sentence = ""
    for word in sentence:
        stemmed_sentence += word + ' '
    stemmed_corpus.append(stemmed_sentence)

In [33]:

vectorizer = CountVectorizer(
    preprocessor=lambda x: x.lower(),  # Convert to lowercase
    stop_words='english',             # Remove stop words
    ngram_range=(1, 4),               # Use 1- to 4-grams
    min_df=0.001,                        # Minimum term frequency
    max_df=0.5,                       # Maximum document frequency
    token_pattern=r'\b\w+\b'          # Use word tokenizer
)

dtm_train = vectorizer.fit_transform(stemmed_corpus)

In [34]:
dtm_train

<250000x18525 sparse matrix of type '<class 'numpy.int64'>'
	with 28079265 stored elements in Compressed Sparse Row format>

### Perform Lasso Regression to find the most predicitive tokens

### Prepare all training data into one dataframe

In [35]:
all_train = pd.DataFrame()

for train_df in train_datasets:
    all_train = pd.concat([all_train, train_df], axis=0)

### Find the tokens that occur in at least 125 reviews for later comparison

In [36]:
len(all_train)*0.1/100

125.0

In [37]:
vectorized_train = vectorizer.transform(all_train['review'])
token_occurrences = vectorized_train.sum(axis=0)
summed_token_occurrences = token_occurrences.tolist()[0]
feature_names = vectorizer.get_feature_names_out()
tokens_appearing_in_at_least_125_reviews = [feature_names[i] for i, count in enumerate(summed_token_occurrences) if count >= 125]

In [38]:
tokens_appearing_in_at_least_125_reviews

['0',
 '0 10',
 '00',
 '000',
 '000 000',
 '1',
 '1 000',
 '1 1',
 '1 10',
 '1 2',
 '1 3',
 '1 5',
 '1 br',
 '1 br br',
 '1 hour',
 '1 star',
 '10',
 '10 10',
 '10 br',
 '10 br br',
 '10 s',
 '10 year',
 '10 year old',
 '100',
 '1000',
 '101',
 '11',
 '12',
 '12 year',
 '12 year old',
 '13',
 '13 year',
 '13 year old',
 '13th',
 '14',
 '14 year',
 '14 year old',
 '15',
 '15 year',
 '15 year old',
 '150',
 '16',
 '17',
 '18',
 '18th',
 '19',
 '1920',
 '1930',
 '1930 s',
 '1931',
 '1932',
 '1933',
 '1934',
 '1935',
 '1936',
 '1937',
 '1938',
 '1939',
 '1940',
 '1940 s',
 '1941',
 '1942',
 '1943',
 '1944',
 '1945',
 '1946',
 '1947',
 '1948',
 '1949',
 '1950',
 '1950 s',
 '1951',
 '1952',
 '1953',
 '1954',
 '1955',
 '1956',
 '1957',
 '1958',
 '1959',
 '1960',
 '1960 s',
 '1962',
 '1963',
 '1964',
 '1965',
 '1966',
 '1967',
 '1968',
 '1969',
 '1970',
 '1970 s',
 '1971',
 '1972',
 '1973',
 '1974',
 '1975',
 '1976',
 '1977',
 '1978',
 '1979',
 '1980',
 '1980 s',
 '1981',
 '1982',
 '1983',
 '1

### Perform Lasso Regression

In [39]:
feature_names = vectorizer.get_feature_names_out()

def find_best_tokens(num, c):
    for i in range(10):
        lasso_log_model = LogisticRegression(C=c, penalty='l1', solver='liblinear', max_iter=100000)  # very high max iter to ensure converge
        X_train = vectorizer.transform(all_train['review'])
        Y_train = all_train['sentiment']
        lasso_log_model.fit(X_train, Y_train)

        best_tokens = [[i, coef] for i, coef in enumerate(lasso_log_model.coef_[0]) if coef != 0]
        
        thresholded_tokens = []
        for i, token in enumerate(best_tokens):
            token = feature_names[token[0]]
            if token in tokens_appearing_in_at_least_125_reviews:
                thresholded_tokens.append(best_tokens[i])

        num_tokens = len(thresholded_tokens)
        print(f'number of tokens: {num_tokens}')
        print(f'old c: {c}')

        if num_tokens in range(num, num+100):
            return best_tokens
        elif num_tokens > num+100:
            c = c-(num_tokens/num*0.1*c)
        elif num_tokens < num:
            c = c+(num/num_tokens*0.1*c)

        print(f'new c: {c}')
    print("Bad initial c value, try another value")
    raise Exception

In [40]:
#Find all tokens with non-zero coefficients after lasso
best_tokens = find_best_tokens(num=2000, c=0.06369)
print(f'Total tokens after regularization: {len(best_tokens)}')

copy1_best_tokens = best_tokens[:]
copy2_best_tokens = best_tokens[:]

#Sort tokens by coefficients, more extreme = more predictive
new_best_tokens = list(map(lambda x: [x[0], x[1]], copy1_best_tokens))
sorted_best_tokens = sorted(new_best_tokens, key=lambda x: x[1], reverse=True)


abs_best_tokens = list(map(lambda x: [x[0], abs(x[1])], copy2_best_tokens))
sorted_abs_best_tokens = sorted(abs_best_tokens, key=lambda x: x[1], reverse=True)

#Take the top 2000 tokens regardless of pos/neg
top_2000_tokens = sorted_abs_best_tokens[:2000]

#Take the top 50 from pos and neg for explainability
positive_tokens = sorted_best_tokens[0:50]
negative_tokens = sorted_best_tokens[-51:-1]

feature_names = vectorizer.get_feature_names_out()

#Make tokens for model
model_tokens = []

for token in top_2000_tokens:
    model_tokens.append(feature_names[token[0]])


positive_predictors = []
negative_predictors = []
#Make tokens for explainability:
for (pos, neg) in zip(positive_tokens, negative_tokens):
    positive_predictors.append(feature_names[pos[0]])
    negative_predictors.append(feature_names[neg[0]])

number of tokens: 1532
old c: 0.0455
new c: 0.05143994778067885
number of tokens: 1694
old c: 0.05143994778067885
new c: 0.05751314114321472
number of tokens: 1862
old c: 0.05751314114321472
new c: 0.06369070732401115
number of tokens: 2036
old c: 0.06369070732401115
Total tokens after regularization: 2046


## Create Customer Tokenizer from only the 2000 best tokens

In [41]:
def custom_tokenizer(text):
    tokens = text.split()
    filtered_tokens = [token for token in tokens if token in model_tokens]
    return filtered_tokens

custom_vectorizer = CountVectorizer(tokenizer=custom_tokenizer, binary=True)

In [42]:
custom_vectorizer = CountVectorizer(binary=True, vocabulary=model_tokens)

## Predict

In [43]:
accuracies = []
aucs = []

for i in range(len(train_datasets)):
    model = LogisticRegression(max_iter=10000)

    X_train = custom_vectorizer.fit_transform(train_datasets[i]['review'])
    Y_train = train_datasets[i]['sentiment']
    model.fit(X_train, Y_train)

    X_test = custom_vectorizer.fit_transform(test_datasets[i]['review'])
    Y_test = test_y_datasets[i]['sentiment']
    Y_pred = model.predict(X_test)
    
    accuracy = accuracy_score(Y_test, Y_pred)
    auc = roc_auc_score(Y_test, Y_pred)
    accuracies.append(accuracy)
    aucs.append(auc)

In [44]:
for i, (accuracy, auc) in enumerate(zip(accuracies, aucs)):
    #print(f"Accuracy of fold {i+1}: {accuracy}")
    print(f"AUC of fold {i+1}: {auc}")

AUC of fold 1: 0.849901850102072
AUC of fold 2: 0.84893056948186
AUC of fold 3: 0.8513669286197614
AUC of fold 4: 0.8494511036487062
AUC of fold 5: 0.8473420661696042


## Explore Tokens for Explainability

In [45]:
for i, token in enumerate(positive_predictors):
    print(f'Most predictive positive token #{i+1}: {token}')

Most predictive positive token #1: 7 10
Most predictive positive token #2: 8 10
Most predictive positive token #3: 10 10
Most predictive positive token #4: gem
Most predictive positive token #5: 9 10
Most predictive positive token #6: grade b
Most predictive positive token #7: bad thing
Most predictive positive token #8: funniest
Most predictive positive token #9: excellent
Most predictive positive token #10: superb
Most predictive positive token #11: amazing
Most predictive positive token #12: finest
Most predictive positive token #13: hilarious
Most predictive positive token #14: solid
Most predictive positive token #15: brilliant
Most predictive positive token #16: t miss
Most predictive positive token #17: look forward
Most predictive positive token #18: bravo
Most predictive positive token #19: perfect
Most predictive positive token #20: enjoyable
Most predictive positive token #21: timeless
Most predictive positive token #22: delight
Most predictive positive token #23: notch
Most

In [46]:
for i, token in enumerate(negative_predictors):
    print(f'Most predictive negative token #{i+1}: {token}')

Most predictive negative token #1: dreck
Most predictive negative token #2: cheap
Most predictive negative token #3: save
Most predictive negative token #4: weak
Most predictive negative token #5: rubbish
Most predictive negative token #6: whatsoever
Most predictive negative token #7: turkey
Most predictive negative token #8: shallow
Most predictive negative token #9: endless
Most predictive negative token #10: hype
Most predictive negative token #11: annoying
Most predictive negative token #12: feel need
Most predictive negative token #13: poor
Most predictive negative token #14: lifeless
Most predictive negative token #15: time money
Most predictive negative token #16: m afraid
Most predictive negative token #17: fast forward
Most predictive negative token #18: drivel
Most predictive negative token #19: insult
Most predictive negative token #20: ridiculous
Most predictive negative token #21: badly
Most predictive negative token #22: unless
Most predictive negative token #23: wooden
M