# CS 598 PSL Project 3: approach based on Campuswire post [628](https://campuswire.com/c/G06C55090/feed/628)

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
#from nltk import word_tokenize          
#from nltk.stem import PorterStemmer
#from nltk.stem import WordNetLemmatizer 

SEED = 4031
np.random.seed(SEED)

In [2]:
#from nltk import download
#download("punkt")
#download('wordnet')

In [3]:
# Pull in datasets
train_datasets = []
test_datasets = []
test_ys = []

num_folds = 5

for fold in range(num_folds):
    folder = f"Data/split_{fold+1}/"
    train_datasets.append(pd.read_csv(folder + "train.tsv", sep="\t"))
    test_datasets.append(pd.read_csv(folder + "test.tsv", sep="\t"))
    test_ys.append(pd.read_csv(folder + "test_y.tsv", sep="\t"))

In [4]:
# Custom stopword list
stopwords = ["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "their", "they", "his", \
             "her", "she", "he", "a", "an", "and", "is", "was", "are", "were", "him", "himself", "has", "have", "it", "its", \
             "the", "us"]

## Construct vocabulary

In [5]:
# Use all training data to construct vocabulary.

all_train = pd.concat(train_datasets, axis=0, ignore_index=True)
all_train.drop(columns=["id"], inplace=True)
all_train

#all_train = pd.DataFrame()

#for train_df in train_datasets:
#    all_train = pd.concat([all_train, train_df], axis=0)

Unnamed: 0,sentiment,review
0,1,Naturally in a film who's main themes are of m...
1,0,Afraid of the Dark left me with the impression...
2,0,This has to be one of the biggest misfires eve...
3,0,"This is one of those movies I watched, and won..."
4,0,This movie was dreadful. Biblically very inacc...
...,...,...
124995,0,"I am a student of film, and have been for seve..."
124996,0,It seems like more consideration has gone into...
124997,0,I don't believe they made this film. Completel...
124998,0,This 30 minute documentary Buñuel made in the ...


### Preprocess text

In [6]:
# Remove HTML tags and convert to lowercase
all_train["review"] = all_train["review"].str.replace('<.*?>', ' ', regex=True)
# Convert all strings to lowercase
all_train["review"] = all_train["review"].str.lower()

In [7]:
def expand_contractions(reviews):
    
    """
    Routine to expand English contractions, like "isn't" --> "is not".
    This is because "isn't good" and "wasn't good" will both expand to produce the bi-gram "not good".
    The pooled phrase should have more predictive power than the original two phrases.
    """

    # Dictionary of English contractions. Taken from StackOverflow post, which borrowed it from Wikipedia:
    # https://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python

    contractions = { 
        "\\bain't\\b": "am not",
        "\\baren't\\b": "are not",
        "\\bcan't\\b": "cannot",
        "\\bcan't've\\b": "cannot have",
        "\\b'cause\\b": "because",
        "\\bcould've\\b": "could have",
        "\\bcouldn't\\b": "could not",
        "\\bcouldn't've\\b": "could not have",
        "\\bdidn't\\b": "did not",
        "\\bdoesn't\\b": "does not",
        "\\bdon't\\b": "do not",
        "\\bhadn't\\b": "had not",
        "\\bhadn't've\\b": "had not have",
        "\\bhasn't\\b": "has not",
        "\\bhaven't\\b": "have not",
        "\\bhe'd\\b": "he would",
        "\\bhe'd've\\b": "he would have",
        "\\bhe'll\\b": "he will",
        "\\bhe'll've\\b": "he will have",
        "\\bhe's\\b": "he is",
        "\\bhow'd\\b": "how did",
        "\\bhow'd'y\\b": "how do you",
        "\\bhow'll\\b": "how will",
        "\\bhow's\\b": "how is",
        "\\bi'd\\b": "i would",
        "\\bi'd've\\b": "i would have",
        "\\bi'll\\b": "i will",
        "\\bi'll've\\b": "i will have",
        "\\bi'm\\b": "i am",
        "\\bi've\\b": "i have",
        "\\bisn't\\b": "is not",
        "\\bit'd\\b": "it would",
        "\\bit'd've\\b": "it would have",
        "\\bit'll\\b": "it will",
        "\\bit'll've\\b": "it will have",
        "\\bit's\\b": "it is",
        "\\blet's\\b": "let us",
        "\\bma'am\\b": "madam",
        "\\bmayn't\\b": "may not",
        "\\bmight've\\b": "might have",
        "\\bmightn't\\b": "might not",
        "\\bmightn't've\\b": "might not have",
        "\\bmust've\\b": "must have",
        "\\bmustn't\\b": "must not",
        "\\bmustn't've\\b": "must not have",
        "\\bneedn't\\b": "need not",
        "\\bneedn't've\\b": "need not have",
        "\\bo'clock\\b": "of the clock",
        "\\boughtn't\\b": "ought not",
        "\\boughtn't've\\b": "ought not have",
        "\\bshan't\\b": "shall not",
        "\\bsha'n't\\b": "shall not",
        "\\bshan't've\\b": "shall not have",
        "\\bshe'd\\b": "she would",
        "\\bshe'd've\\b": "she would have",
        "\\bshe'll\\b": "she will",
        "\\bshe'll've\\b": "she will have",
        "\\bshe's\\b": "she is",
        "\\bshould've\\b": "should have",
        "\\bshouldn't\\b": "should not",
        "\\bshouldn't've\\b": "should not have",
        "\\bso've\\b": "so have",
        "\\bso's\\b": "so is",
        "\\bthat'd\\b": "that would",
        "\\bthat'd've\\b": "that would have",
        "\\bthat's\\b": "that is",
        "\\bthere'd\\b": "there would",
        "\\bthere'd've\\b": "there would have",
        "\\bthere's\\b": "there is",
        "\\bthey'd\\b": "they would",
        "\\bthey'd've\\b": "they would have",
        "\\bthey'll\\b": "they will",
        "\\bthey'll've\\b": "they will have",
        "\\bthey're\\b": "they are",
        "\\bthey've\\b": "they have",
        "\\bto've\\b": "to have",
        "\\bwasn't\\b": "was not",
        "\\bwe'd\\b": "we would",
        "\\bwe'd've\\b": "we would have",
        "\\bwe'll\\b": "we will",
        "\\bwe'll've\\b": "we will have",
        "\\bwe're\\b": "we are",
        "\\bwe've\\b": "we have",
        "\\bweren't\\b": "were not",
        "\\bwhat'll\\b": "what will",
        "\\bwhat'll've\\b": "what will have",
        "\\bwhat're\\b": "what are",
        "\\bwhat's\\b": "what is",
        "\\bwhat've\\b": "what have",
        "\\bwhen's\\b": "when is",
        "\\bwhen've\\b": "when have",
        "\\bwhere'd\\b": "where did",
        "\\bwhere's\\b": "where is",
        "\\bwhere've\\b": "where have",
        "\\bwho'll\\b": "who will",
        "\\bwho'll've\\b": "who will have",
        "\\bwho's\\b": "who is",
        "\\bwho've\\b": "who have",
        "\\bwhy's\\b": "why is",
        "\\bwhy've\\b": "why have",
        "\\bwill've\\b": "will have",
        "\\bwon't\\b": "will not",
        "\\bwon't've\\b": "will not have",
        "\\bwould've\\b": "would have",
        "\\bwouldn't\\b": "would not",
        "\\bwouldn't've\\b": "would not have",
        "\\by'all\\b": "you all",
        "\\by'all'd\\b": "you all would",
        "\\by'all'd've\\b": "you all would have",
        "\\by'all're\\b": "you all are",
        "\\by'all've\\b": "you all have",
        "\\byou'd\\b": "you would",
        "\\byou'd've\\b": "you would have",
        "\\byou'll\\b": "you will",
        "\\byou'll've\\b": "you will have",
        "\\byou're\\b": "you are",
        "\\byou've\\b": "you have"
    }
    
    # Replace all contractions in all reviews.
    for contraction in contractions:
        reviews = reviews.str.replace(contraction, contractions[contraction], regex=True)
        
    return reviews

In [8]:
# Expand English contractions
all_train["review"] = expand_contractions(all_train["review"])

In [9]:
# Vectorize the reviews

vectorizer = CountVectorizer(
    preprocessor=lambda x: x.lower(), # Convert to lowercase
    stop_words=stopwords,             # Remove stop words
    ngram_range=(1, 4),               # Use 1- to 4-grams
    min_df=0.001,                     # Minimum term frequency
    max_df=0.5,                       # Maximum document frequency
    token_pattern=r"\b[\w+\|']+\b"    # Use word tokenizer, but don't split on apostrophes
)

dtm_train = vectorizer.fit_transform(all_train["review"])

In [10]:
# View the number of ngrams
feature_ngrams = vectorizer.get_feature_names_out()
feature_ngrams.shape

(31701,)

In [11]:
# Output features to file
np.savetxt("all_train_features.txt", feature_ngrams, fmt="%s", delimiter=",")

In [12]:
# Routine to preprocess text: strip out HTML, convert to lowercase, and expand English contractions.

def preprocess_reviews(reviews):
    reviews = reviews.str.replace('<.*?>', ' ', regex=True)
    reviews = reviews.str.lower()
    reviews = expand_contractions(reviews)
    
    return reviews

### Use t-test to identify strongest 2000 positive and negative terms

In [13]:
# Try t-test to identify terms that are strongly associated with only positive or only negative reviews.

#dtm_array = dtm_train.toarray()
#dtm_mmapped = np.memmap('dtm_mmapped.dat', dtype='float32', mode='w+', shape=dtm_train.shape)
#dtm_mmapped[:] = dtm_train.toarray()
dtm_pos = dtm_train[all_train.sentiment == 1, :]
dtm_neg = dtm_train[all_train.sentiment == 0, :]

In [14]:
dtm_pos_count = dtm_pos.shape[0]
dtm_neg_count = dtm_neg.shape[0]
dtm_pos_count, dtm_neg_count

(62385, 62615)

In [15]:
#dtm_pos_means = np.mean(dtm_pos, axis=0)
#dtm_pos_vars = np.var(dtm_pos, axis=0, ddof=1)
#dtm_pos_means.shape

# Define variables to hold means and variances of positive & negative reviews
dtm_pos_means = np.empty(feature_ngrams.shape[0])
dtm_pos_vars = np.empty(feature_ngrams.shape[0])

dtm_neg_means = np.empty(feature_ngrams.shape[0])
dtm_neg_vars = np.empty(feature_ngrams.shape[0])

In [16]:
# Compute feature means and sample variances using dense matrices instead of sparse matrices, one column at a time.
# This is because computing variance of sparse matrices is reported to have numerical instability.
# Use one column at a time to avoid consuming too much memory.
# Approach taken from StackOverflow post:
# https://stackoverflow.com/questions/12169611/how-do-i-compute-the-variance-of-a-column-of-a-sparse-matrix-in-scipy

for col in range(feature_ngrams.shape[0]):
    pos_col_array = dtm_pos[:, col].toarray()
    dtm_pos_means[col] = np.mean(pos_col_array)
    dtm_pos_vars[col] = np.var(pos_col_array, ddof=1)
    
    neg_col_array = dtm_neg[:, col].toarray()
    dtm_neg_means[col] = np.mean(neg_col_array)
    dtm_neg_vars[col] = np.var(neg_col_array, ddof=1)

#dtm_neg_means = np.mean(dtm_neg, axis=0)
#dtm_neg_vars = np.var(dtm_neg, axis=0, ddof=1)

In [17]:
# For each term / ngram, compute t-statistic for two independent samples.
# Hmmm...they're not independent, but we can't really pool the variance...

t_statistics = (dtm_pos_means - dtm_neg_means) / np.sqrt((dtm_pos_vars/dtm_pos_count) + (dtm_neg_vars/dtm_neg_count))
t_statistics.shape

(31701,)

In [18]:
feature_statistic_df = pd.DataFrame({"feature": feature_ngrams.tolist(), "statistic": t_statistics.tolist()})

In [19]:
# Look at top 50 positive words
feature_statistic_df.sort_values(by="statistic", ascending=False).iloc[0:50, :]

Unnamed: 0,feature,statistic
10874,great,73.982716
8232,excellent,58.862798
31186,wonderful,53.246837
3247,best,53.155283
17806,of best,51.324386
19152,one of best,48.93239
14764,love,43.82975
20103,perfect,41.751995
1406,amazing,40.271278
2936,beautiful,39.589658


In [20]:
# Look at bottom 50 words (most negative)
feature_statistic_df.sort_values(by="statistic").iloc[0:50, :]

Unnamed: 0,feature,statistic
2537,bad,-94.755462
31315,worst,-84.764359
29641,waste,-68.997657
2465,awful,-64.850629
17113,not even,-58.121304
25264,terrible,-57.600314
31306,worse,-53.553785
3598,boring,-52.661545
24670,stupid,-50.814659
16835,no,-50.695585


In [21]:
# How many terms meet the 0.05 significance threshold?
len(feature_statistic_df[feature_statistic_df.statistic >= 1.645])

11752

In [22]:
# Check for class imbalance
all_train.groupby(["sentiment"]).count()

Unnamed: 0_level_0,review
sentiment,Unnamed: 1_level_1
0,62615
1,62385


In [23]:
# Keep the top 2000 terms by magnitude of t-statistic

n_terms = 2000

feature_statistic_df["abs_statistic"] = abs(feature_statistic_df["statistic"])

top_features = feature_statistic_df.sort_values(by="abs_statistic", ascending=False).iloc[:n_terms, 0]
top_features

2537                     bad
31315                  worst
10874                  great
29641                  waste
2465                   awful
                ...         
24033                sounded
16860              no excuse
26662    this movie horrible
24173                spot on
17214            not in good
Name: feature, Length: 2000, dtype: object

In [24]:
# Add any terms that only appeared in positive reviews or only in negative reviews.
only_positive = feature_ngrams[np.logical_and((dtm_pos_means > 0), (dtm_neg_means == 0))]
only_negative = feature_ngrams[np.logical_and((dtm_pos_means == 0), (dtm_neg_means > 0))]
only_negative

array(['avoid like', 'bad as this', 'bad music', 'could not save',
       'crap like this', 'do not waste money', 'easily worst', 'gave 2',
       'how not to make', 'instead of 1', 'manos', 'not funny not',
       'not waste money on', 'not waste time or', 'this by far worst',
       'this drivel', 'this dull', 'this junk', 'this lame',
       'this piece of garbage', 'this rubbish', 'this stinker',
       'this tripe', 'this turkey', 'this waste', 'this waste of',
       'waste time or', 'waste time or money', 'waste time with this',
       'worst movie ever made', 'worst movies ever seen'], dtype=object)

In [25]:
top_features_list = list(set(top_features.tolist() + only_positive.tolist() + only_negative.tolist()))
top_features_list

['everyday',
 'some kind',
 '9 out',
 'name',
 'this best',
 'supposed to be',
 'makes no',
 'maybe',
 'this one of best',
 'running',
 'please do',
 'about any of',
 'money on',
 'this waste',
 'sometimes',
 'great performance',
 'do not rent',
 'fine',
 'movie nothing',
 'stupid',
 '20 minutes',
 'stunning',
 'bin',
 'did not work',
 'very different',
 'not funny',
 'sake',
 'travesty',
 'straight to video',
 'watching this',
 '3 out of',
 'of war',
 'plain bad',
 'am sorry but',
 'cut',
 'to all',
 'movie not worth',
 'not recommended',
 'just boring',
 'poorly executed',
 'acting superb',
 'most memorable',
 'paid',
 'beauty',
 '2',
 'barely',
 'did',
 'small',
 'still very',
 'true to',
 'tale',
 'tale of',
 'thumbs up',
 'mildly',
 'for no',
 'to rent',
 'innocent',
 'what waste',
 'really enjoyed this',
 'make',
 'this piece of',
 'tries to',
 'loneliness',
 '7 out',
 'somebody',
 'bad but',
 'on this',
 'masterful',
 'even worse than',
 'well done',
 'william',
 'avoid at all',

In [26]:
top_features_df = feature_statistic_df[feature_statistic_df['feature'].isin(top_features_list)]
top_features_df = top_features_df.sort_values(by='abs_statistic', ascending=False)
top_features_df

Unnamed: 0,feature,statistic,abs_statistic
2537,bad,-94.755462,94.755462
31315,worst,-84.764359,84.764359
10874,great,73.982716,73.982716
29641,waste,-68.997657,68.997657
2465,awful,-64.850629,64.850629
...,...,...,...
7488,easily worst,-11.236193,11.236193
17499,not waste money on,-11.191427,11.191427
26432,this drivel,-11.158397,11.158397
2544,bad as this,-11.147890,11.147890


## Lasso Regression to find the Top ~1000 Tokens

In [27]:
top_features_df['feature']

2537                    bad
31315                 worst
10874                 great
29641                 waste
2465                  awful
                ...        
7488           easily worst
17499    not waste money on
26432           this drivel
2544            bad as this
11822       how not to make
Name: feature, Length: 2009, dtype: object

In [28]:


custom_vectorizer = CountVectorizer(
    vocabulary=top_features_df['feature'],          # The top 2000 features
    stop_words=stopwords,             # Remove stop words
    ngram_range=(1, 4),               # Use 1- to 4-grams
    min_df=0.001,                     # Minimum term frequency
    max_df=0.5,                       # Maximum document frequency
    token_pattern=r"\b[\w+\|']+\b"    # Use word tokenizer, but don't split on apostrophes
)

X_train = custom_vectorizer.fit_transform(preprocess_reviews(all_train['review']))
Y_train = all_train['sentiment']

In [29]:
def find_best_tokens(num, c):
    for i in range(1000):
        lasso_log_model = LogisticRegression(C=c, penalty='l1', solver='liblinear', max_iter=100000)  # very high max iter to ensure converge
        #X_train = custom_vectorizer.fit_transform(preprocess_reviews(all_train['review']))
        #Y_train = all_train['sentiment']
        lasso_log_model.fit(X_train, Y_train)

        best_tokens = [[i, coef] for i, coef in enumerate(lasso_log_model.coef_[0]) if coef != 0]

        num_tokens = len(best_tokens)
        print(f'number of tokens: {num_tokens}')
        print(f'old c: {c}')
        
        diff = num_tokens-num

        if num_tokens == num:
            return best_tokens
        elif num_tokens > num:
            c = c*.999
        elif num_tokens < num:
            c = c*1.001

        print(f'new c: {c}')
    print("Bad initial c value, try another value")
    raise Exception


In [30]:
best_tokens = find_best_tokens(num=1000, c=0.04604)

number of tokens: 1001
old c: 0.04604
new c: 0.04599396
number of tokens: 1001
old c: 0.04599396
new c: 0.04594796604
number of tokens: 1003
old c: 0.04594796604
new c: 0.04590201807396
number of tokens: 1002
old c: 0.04590201807396
new c: 0.04585611605588604
number of tokens: 998
old c: 0.04585611605588604
new c: 0.045901972171941925
number of tokens: 1002
old c: 0.045901972171941925
new c: 0.045856070199769984
number of tokens: 1002
old c: 0.045856070199769984
new c: 0.04581021412957021
number of tokens: 999
old c: 0.04581021412957021
new c: 0.04585602434369978
number of tokens: 1002
old c: 0.04585602434369978
new c: 0.04581016831935608
number of tokens: 998
old c: 0.04581016831935608
new c: 0.04585597848767543
number of tokens: 1002
old c: 0.04585597848767543
new c: 0.045810122509187756
number of tokens: 998
old c: 0.045810122509187756
new c: 0.04585593263169694
number of tokens: 1001
old c: 0.04585593263169694
new c: 0.04581007669906524
number of tokens: 998
old c: 0.04581007669906

In [31]:
top_features

2537                     bad
31315                  worst
10874                  great
29641                  waste
2465                   awful
                ...         
24033                sounded
16860              no excuse
26662    this movie horrible
24173                spot on
17214            not in good
Name: feature, Length: 2000, dtype: object

In [32]:
top_features_df = pd.DataFrame(top_features.items(), columns=['token', 'feature'])
best_tokens_df = pd.DataFrame(best_tokens, columns=['index', 'value']).set_index('index')


lasso_best_tokens_df = top_features_df.join(best_tokens_df)
lasso_best_tokens_df = lasso_best_tokens_df.dropna()
lasso_best_tokens_df['weight'] = lasso_best_tokens_df['value'].abs()

lasso_best_tokens_df = lasso_best_tokens_df.sort_values(by='weight', ascending=False)
lasso_best_tokens_df

Unnamed: 0,token,feature,value,weight
158,289,7 10,2.702648,2.702648
180,244,4 10,-2.301557,2.301557
170,215,3 10,-2.148313,2.148313
117,301,8 10,1.695134,1.695134
435,223,3 out of 10,-1.684497,1.684497
...,...,...,...,...
1030,22034,riveting,0.000804,0.000804
575,11416,hell,0.000704,0.000704
1909,16975,nor,-0.000506,0.000506
602,15114,man,0.000456,0.000456


In [33]:
top_features = lasso_best_tokens_df['feature'].tolist()

In [34]:
top_features

['7 10',
 '4 10',
 '3 10',
 '8 10',
 '3 out of 10',
 '4 out of 10',
 'not recommend',
 'well worth',
 '7 out',
 'waste',
 'mst3k',
 'definitely worth',
 'refreshing',
 'disappointment',
 'poorly',
 'worst',
 '10 10',
 'awful',
 'not funny',
 'unfunny',
 '2 10',
 'redeeming',
 'miscast',
 'forgettable',
 'dull',
 'not worth',
 'must see',
 'very disappointed',
 'tedious',
 'just not',
 'laughable',
 'wooden',
 '9 10',
 'gem',
 'enjoyed this',
 'disappointing',
 'lousy',
 'fails',
 'wonderfully',
 'pretentious',
 'wasting',
 'funniest',
 'superb',
 'fast forward',
 'mildly',
 'lacks',
 'excellent',
 'mediocre',
 'appalling',
 'olds',
 'stinker',
 'uninteresting',
 'brilliantly',
 'yawn',
 '1 10',
 'not very',
 'very funny',
 'incoherent',
 'dreadful',
 'hilarious',
 'terrible',
 'no sense',
 'loved this',
 'unwatchable',
 'pointless',
 'trite',
 'bland',
 'boring',
 'amazing',
 'embarrassing',
 'tiresome',
 'pathetic',
 'perfect',
 'below average',
 'solid',
 'fantastic',
 'lame',
 'to r

In [44]:
np.savetxt("best_1000_tokens.txt", top_features, fmt="%s", delimiter=",")

## Find best ridge-regression model parameters

In [35]:
# Vectorize the full training set using the top 2000 features

top_feature_vectorizer = CountVectorizer(
    vocabulary=top_features,          # The top 200 features
    stop_words=stopwords,             # Remove stop words
    ngram_range=(1, 4),               # Use 1- to 4-grams
    min_df=0.001,                     # Minimum term frequency
    max_df=0.5,                       # Maximum document frequency
    token_pattern=r"\b[\w+\|']+\b"    # Use word tokenizer, but don't split on apostrophes
)

dtm_vocab_train = top_feature_vectorizer.fit_transform(all_train["review"])

In [36]:
from sklearn.linear_model import LogisticRegressionCV

In [37]:
grid_search = LogisticRegressionCV(Cs=10, cv=5, penalty="l2", scoring="roc_auc", max_iter=100000, random_state=SEED, verbose=1)

In [38]:
all_train_y = all_train["sentiment"]

In [39]:
grid_search.fit(dtm_vocab_train, all_train_y)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.2min finished


In [40]:
best_C = grid_search.C_[0]
best_C   # best_C is 166.81005372

21.54434690031882

In [41]:
grid_search.scores_

{1: array([[0.90678022, 0.94166505, 0.96134256, 0.96950294, 0.97171973,
         0.97197573, 0.97197418, 0.97197026, 0.97197044, 0.97197045],
        [0.91097104, 0.94501419, 0.96264855, 0.96962527, 0.97144685,
         0.97164637, 0.97165282, 0.97165164, 0.97165178, 0.97165158],
        [0.90934544, 0.9442392 , 0.96277001, 0.97016074, 0.97214498,
         0.97237922, 0.97239488, 0.97239707, 0.97239703, 0.97239707],
        [0.91045961, 0.94413151, 0.96198886, 0.96941574, 0.97146827,
         0.97173514, 0.97175732, 0.9717603 , 0.97176033, 0.97176036],
        [0.91040321, 0.94408884, 0.9622579 , 0.96972239, 0.97172056,
         0.97193697, 0.97193573, 0.97192499, 0.97192502, 0.97192499]])}

## Inference

In [42]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [43]:
import time
auc_scores = []

for i in range(len(train_datasets)):
    start_time = time.time()
    model = LogisticRegression(C=best_C, penalty="l2", max_iter=100000, random_state=SEED, verbose=1)
    
    train_X = top_feature_vectorizer.fit_transform(preprocess_reviews(train_datasets[i]["review"]))
    train_y = train_datasets[i]["sentiment"]
    
    model.fit(train_X, train_y)
    
    test_X = top_feature_vectorizer.transform(preprocess_reviews(test_datasets[i]["review"]))
    test_y = test_ys[i]["sentiment"]
    
    pred_y = model.predict_proba(test_X)[:, 1]  # Predict probabilities for class 1 (positive review)
    
    auc_score = roc_auc_score(test_y, pred_y)
    auc_scores.append(auc_score)
    
    print(f"AUC of split {i+1}: {auc_score}")
    
    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Split {i+1} run time: {elapsed_time} seconds")

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.2s finished


AUC of split 1: 0.9616746755893664
Split 1 run time: 161.37855577468872 seconds


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.1s finished


AUC of split 2: 0.9616008961805649
Split 2 run time: 163.2076027393341 seconds


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.1s finished


AUC of split 3: 0.9610768793284521
Split 3 run time: 160.60091280937195 seconds


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.3s finished


AUC of split 4: 0.9621500749760479
Split 4 run time: 166.03499817848206 seconds


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.5s finished


AUC of split 5: 0.9620394411343214
Split 5 run time: 163.70697665214539 seconds
