In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression,Ridge
from sklearn.metrics import roc_auc_score

In [2]:
train = pd.read_csv("data/split_1/train.tsv", sep='\t', header=0, dtype=str)
train['review'] = train['review'].str.replace('<.*?>', ' ', regex=True)
y_train = train["sentiment"].astype(int)

test = pd.read_csv("data/split_1/test.tsv", sep='\t', header=0, dtype=str)
test['review'] = test['review'].str.replace('<.*?>', ' ', regex=True)

y_test = pd.read_csv("data/split_1/test_y.tsv", sep='\t', header=0, dtype=int)
y_test = y_test["sentiment"].astype(int)

combined_reviews = pd.concat([train['review'], test['review']], ignore_index=True)
combined_y = pd.concat([y_train, y_test], ignore_index =True)



In [3]:
# below stopwords from nltk stopwords
stop_words = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]

vectorizer = CountVectorizer(
    preprocessor=lambda x: x.lower(),  # Convert to lowercase
    stop_words=stop_words,             # Remove stop words
    ngram_range=(1, 4),               # Use 1- to 4-grams
    min_df=0.001,                        # Minimum term frequency
    max_df=0.5,                       # Maximum document frequency
    token_pattern=r"\b[\w+\|']+\b" # Use word tokenizer: See Ethan's comment below
)


In [4]:
dtm_train = vectorizer.fit_transform(combined_reviews)

In [5]:
lasso = LogisticRegression(penalty='l1', solver='liblinear', C=0.25,  max_iter=300)
lasso.fit(dtm_train, combined_y)

In [7]:
coef = lasso.coef_.ravel()
selected_features = np.where(coef != 0)[0]
sorted_features = np.argsort(np.abs(coef[selected_features]))[::-1]
selected_features = selected_features[sorted_features[:1000]]
feature_names = np.array(vectorizer.get_feature_names_out())
print(feature_names)
selected_words = feature_names[selected_features]
print(selected_words[1:100])

['0' '0 10' '00' ... 'zombies' 'zone' 'zoom']
['3 10' '4 10' '8 10' '1 10' '2 10' 'stinker' '10 10' 'forgettable'
 'mst3k' 'waste' 'well worth' 'refreshing' 'disappointment'
 'highly recommend' 'give 4' 'lifeless' '9 10' 'yawn' 'one worst' 'poorly'
 'olds' 'definitely worth' 'worst' 'laughable' "can't wait"
 'highly recommended' 'awful' 'mildly' 'tedious' 'must see'
 'uninteresting' 'dreadful' "that's point" 'grade b' 'unfunny'
 'entertains' 'wonderfully' "i'm afraid" 'lacks' 'amateurish'
 'unremarkable' 'wasting' 'fails' 'lousy' 'first rate' 'disappointing'
 'tiresome' 'fast forward' 'excellently' 'funniest' 'redeeming' 'dull'
 'uninspired' 'subtle' 'alright' 'pretentious' 'superb' 'mediocre'
 'miscast' 'credibility' 'ladder' 'loved movie' 'embarrassed' 'bland'
 'avoid' 'lame' 'appalling' 'horrible' 'hype' 'wanted like' 'made sense'
 'one best' 'excellent' 'gem' 'hilarious' 'definitely recommend'
 'one better' 'mess' 'incoherent' 'terrible' 'worst movie' 'rainy'
 'unwatchable' 'boring

In [8]:
myvocab = selected_words 


vocabvectorizer = CountVectorizer(
    ngram_range=(1, 2),  # Adjust ngram_range to 4 for testing
    vocabulary=myvocab  
)

dtm_train = vocabvectorizer.transform(train['review'])

y_train = train['sentiment'].astype(int)

# Applying Ridge Regression on the transformed data
ridge = Ridge(alpha=272)  # Alpha can be adjusted based on model tuning
ridge.fit(dtm_train, y_train)

In [10]:
test = pd.read_csv("data/split_1/test.tsv", sep='\t', header=0, dtype=str)
test['review'] = test['review'].str.replace('<.*?>', ' ', regex=True)

y_test = pd.read_csv("data/split_1/test_y.tsv", sep='\t', header=0, dtype=int)
y_test = y_test["sentiment"].astype(int)
dtm_test = vocabvectorizer.transform(test['review'])

In [12]:
y_pred = ridge.predict(dtm_test)
auc = roc_auc_score(y_test, y_pred)
print("ROC AUC Score:", auc)

ROC AUC Score: 0.9495134796500907


In [26]:
y_pred2 = np.where(y_pred2 > 0.5, 1, 0)

array([0.69392785, 0.62829483, 0.54021108, ..., 0.43282653, 0.65947415,
       0.66900518])

In [45]:
# print(y_pred)
# print(y_test)
y_pred2 = 1 / (1 + np.exp(-y_pred))
y_pred2 = np.where(y_pred2 > 0.635, 1, 0)

auc = roc_auc_score(y_test, y_pred2)
print(f"ROC AUC Score for split{i}:", auc)

ROC AUC Score for split5: 0.8734971796215898


In [13]:
for i in range(1,6):
    train = pd.read_csv("data/split_1/train.tsv", sep='\t', header=0, dtype=str)
    train['review'] = train['review'].str.replace('<.*?>', ' ', regex=True)
    
    y_train = train["sentiment"].astype(int)
    dtm_train = vocabvectorizer.transform(train['review'])
    
    ridge.fit(dtm_train, y_train)
    
    test = pd.read_csv(f"data/split_{i}/test.tsv", sep='\t', header=0, dtype=str)
    test['review'] = test['review'].str.replace('<.*?>', ' ', regex=True)
    
    y_test = pd.read_csv(f"data/split_{i}/test_y.tsv", sep='\t', header=0, dtype=int)
    y_test = y_test["sentiment"].astype(int)
    dtm_test = vocabvectorizer.transform(test['review'])

    y_pred = ridge.predict(dtm_test)
    auc = roc_auc_score(y_test, y_pred)
    print(f"ROC AUC Score for split{i}:", auc)

ROC AUC Score for split1: 0.9495134796500907
ROC AUC Score for split2: 0.9521999719367563
ROC AUC Score for split3: 0.9519114372815806
ROC AUC Score for split4: 0.9514632105364548
ROC AUC Score for split5: 0.9518100665715554
