In [191]:
def read_data(path="data/split_1"):
    """Load one data split and return its train and test reviews in one frame.

    Reads ``train.tsv``, ``test.tsv`` and ``test_y.tsv`` from ``path``,
    replaces every match of the tag pattern in the ``review`` column with a
    space, attaches the labels from ``test_y.tsv`` to the test rows and
    stacks the train rows on top of the test rows.

    Parameters
    ----------
    path : str, default "data/split_1"
        Directory that holds the three TSV files.

    Returns
    -------
    pandas.DataFrame
        Train rows followed by test rows, re-indexed from 0. ``sentiment`` is
        an integer for every row; all other columns are read as strings.
    """
    def _load_reviews(file_name):
        # Shared loader so train and test are cleaned in exactly the same way.
        frame = pd.read_csv(f"{path}/{file_name}", sep='\t', header=0, dtype=str)
        frame['review'] = frame['review'].str.replace('&lt;.*?&gt;', ' ', regex=True)
        return frame

    train = _load_reviews("train.tsv")
    # Everything is read as str; cast here so the combined column is not a
    # mix of str (train) and int (test) labels.
    train["sentiment"] = train["sentiment"].astype(int)

    test = _load_reviews("test.tsv")
    test_y = pd.read_csv(f"{path}/test_y.tsv", sep='\t', header=0, dtype=str)
    # Labels are matched to test rows by row position (both frames carry the
    # default index straight after reading).
    test["sentiment"] = test_y["sentiment"].astype(int)

    return pd.concat([train, test], ignore_index=True)

def process_data(data):
    """Turn the reviews into an n-gram count matrix and extract integer labels.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide a ``review`` column (text) and a ``sentiment`` column
        (castable to int).

    Returns
    -------
    tuple
        ``(vectorizer, dtm, y)``: the fitted ``CountVectorizer``, the
        document-term matrix it produced for ``data["review"]``, and the
        ``sentiment`` column cast to int.
    """
    vectorizer_options = {
        # Lower-case every review before tokenising.
        "preprocessor": lambda x: x.lower(),
        # Notebook-level stop word list.
        "stop_words": stop_words,
        # Unigrams up to 4-grams.
        "ngram_range": (1, 4),
        # Floats are document-frequency proportions: drop n-grams found in
        # fewer than 0.1% or in more than 50% of the reviews.
        "min_df": 0.001,
        "max_df": 0.5,
        # Tokens are runs of word characters, '+', '|' and apostrophes.
        "token_pattern": r"\b[\w+\|']+\b",
    }
    count_vectorizer = CountVectorizer(**vectorizer_options)

    term_matrix = count_vectorizer.fit_transform(data["review"])
    labels = data["sentiment"].astype(int)

    return count_vectorizer, term_matrix, labels

def calculate_word_to_count(data):
    """Count, for every n-gram, how many reviews of each sentiment contain it.

    Parameters
    ----------
    data : pandas.DataFrame
        Passed straight to ``process_data``.

    Returns
    -------
    dict
        Maps each n-gram to a two-element list indexed by sentiment label:
        ``[reviews with sentiment 0 containing it, reviews with sentiment 1
        containing it]``. Keys appear in the order the n-grams are first met
        when walking the non-zero entries of the document-term matrix.
    """
    vectorizer, dtm, y = process_data(data)
    words = vectorizer.get_feature_names_out()

    # One (row, col) pair per "review contains n-gram" entry; computed once.
    rows, cols = dtm.nonzero()
    # Positional lookup, so the result does not depend on y's index labels.
    labels = np.asarray(y)[rows]

    # counts[col, label] = number of reviews with that label containing the
    # n-gram; np.add.at accumulates correctly over repeated (col, label) pairs.
    counts = np.zeros((len(words), 2), dtype=int)
    np.add.at(counts, (cols, labels), 1)

    # Emit keys in first-seen order so downstream stable sorts break ties the
    # same way as a plain entry-by-entry walk would.
    seen_cols, first_seen = np.unique(cols, return_index=True)
    ordered_cols = seen_cols[np.argsort(first_seen)]

    return {
        words[col]: [int(counts[col, 0]), int(counts[col, 1])]
        for col in ordered_cols
    }

def reduce_vocab(vocab, data):
    """Shrink a candidate vocabulary with an L1-penalised logistic regression.

    A ``CountVectorizer`` (1- to 4-grams) is fitted on the candidate terms
    themselves, the reviews are projected onto that vocabulary, and only the
    features whose absolute coefficient exceeds 0.001 are kept.

    Parameters
    ----------
    vocab : iterable of str
        Candidate terms; each one is treated as a document when fitting the
        vectorizer.
    data : pandas.DataFrame
        Must provide ``review`` and ``sentiment`` columns.

    Returns
    -------
    numpy.ndarray
        Feature names that survive the L1 selection.
    """
    ngram_vectorizer = CountVectorizer(ngram_range=(1, 4))
    ngram_vectorizer.fit(vocab)

    features = ngram_vectorizer.transform(data["review"])
    labels = data["sentiment"].astype(int)

    # C chosen so that fewer than 1000 coefficients stay non-zero.
    penalty = calculate_optimal_penalty(features, labels, max_word_count=1000)

    model = LogisticRegression(C=penalty, penalty='l1', solver='liblinear').fit(features, labels)
    magnitudes = np.abs(model.coef_)[0]
    kept_columns = np.flatnonzero(magnitudes > 0.001)

    return ngram_vectorizer.get_feature_names_out()[kept_columns]

def _presence_mean_var(count, total):
    """Mean and unbiased sample variance of a 0/1 "review contains the n-gram" indicator.

    ``count`` of the ``total`` reviews have the indicator set to 1.
    """
    mean = count / total
    var = ((count * (1 - mean) ** 2) + ((total - count) * (mean ** 2))) / (total - 1)
    return mean, var


# Create vocab according to post 627
def get_t_stat_vocab(top_k=2000):
    """Build a vocabulary by two-sample t-statistic screening plus L1 selection.

    For every n-gram the share of positive reviews containing it is compared
    with the share of negative reviews containing it using an
    unequal-variance two-sample t-statistic. The ``top_k`` n-grams with the
    largest absolute statistic are handed to ``reduce_vocab`` for the final
    selection. Progress messages and the class sizes are printed along the way.

    Parameters
    ----------
    top_k : int, default 2000
        Number of highest-scoring n-grams passed on to ``reduce_vocab``.

    Returns
    -------
    Whatever ``reduce_vocab`` returns for the screened n-grams.
    """
    data = read_data()
    print("Done reading data")

    word_to_count = calculate_word_to_count(data)
    print("Done counting data")

    data["sentiment"] = data["sentiment"].astype(int)
    m = (data["sentiment"] == 1).sum()  # number of positive reviews
    n = (data["sentiment"] == 0).sum()  # number of negative reviews
    print(m, n)

    word_scores = []
    # Counts are indexed by sentiment label, i.e. stored as [negative, positive].
    # Unpack in that order so each count is divided by its own class size.
    for word, (neg_count, pos_count) in word_to_count.items():
        pos_mean, pos_var = _presence_mean_var(pos_count, m)
        neg_mean, neg_var = _presence_mean_var(neg_count, n)

        t_stat = (pos_mean - neg_mean) / np.sqrt((pos_var / m) + (neg_var / n))

        # +1: more frequent in positive reviews, -1: otherwise.
        direction = 1 if t_stat > 0 else -1
        word_scores.append((word, np.abs(t_stat), direction))

    # sorted() is stable, so ties keep the order of word_to_count.
    word_scores = sorted(word_scores, key=lambda x: x[1], reverse=True)[:top_k]
    words = [item[0] for item in word_scores]
    print("Done scoring data")

    return reduce_vocab(words, data)

In [204]:
def calculate_optimal_penalty(dtm, y, max_word_count=1000):
    """Find the largest C on the grid that keeps the L1 model under a size cap.

    Fits an L1-penalised ``LogisticRegression`` (liblinear) for C = 0.01,
    0.02, ... (step 0.01, below 0.6) and stops at the first C whose model
    keeps ``max_word_count`` or more coefficients above 1e-5 in absolute
    value. The last C before that point is returned. One progress line is
    printed per fitted model.

    Parameters
    ----------
    dtm : feature matrix passed to ``LogisticRegression.fit``.
    y : labels passed to ``LogisticRegression.fit``.
    max_word_count : int, default 1000
        Exclusive upper bound on the number of retained coefficients.

    Returns
    -------
    The selected C value (an element of the ``np.arange`` grid).

    Raises
    ------
    ValueError
        If even the smallest C on the grid retains ``max_word_count`` or more
        coefficients, so no admissible value exists. Returning 0 here would
        only surface later as an invalid ``C=0``.
    """
    # Pick optimal penalty parameter according to post 626
    best_penalty = None

    for c_value in np.arange(0.01, 0.6, 0.01):
        clf = LogisticRegression(C=c_value, penalty='l1', solver='liblinear').fit(dtm, y)

        coefs = np.abs(clf.coef_)[0]
        word_count = np.count_nonzero(coefs > 0.00001)
        # The printed label says "Lambda" but the value is sklearn's C.
        print(f"Lambda {c_value} :", word_count)

        if word_count >= max_word_count:
            break
        best_penalty = c_value

    if best_penalty is None:
        raise ValueError(
            f"No C in the search grid keeps fewer than {max_word_count} "
            "non-zero coefficients"
        )

    return best_penalty

# Create vocab according to 626
def create_vocab():
    """Build a vocabulary from TF-IDF features via L1 logistic regression.

    Reads the data with ``read_data``, vectorises the reviews with a
    ``TfidfVectorizer``, prints the shape of the resulting matrix, picks C
    with ``calculate_optimal_penalty`` (cap of 1000 terms) and returns the
    feature names whose absolute coefficient exceeds 0.0001.

    Returns
    -------
    numpy.ndarray
        The selected feature names.
    """
    tfidf_options = {
        "stop_words": 'english',
        "lowercase": True,
        # Unigrams up to 4-grams.
        "ngram_range": (1, 4),
        "preprocessor": lambda x: x.lower(),
        # Floats are document-frequency proportions.
        "min_df": 0.001,
        "max_df": 0.5,
        "token_pattern": r"\b[\w+\|']+\b",
    }
    tfidf = TfidfVectorizer(**tfidf_options)

    reviews = read_data()

    features = tfidf.fit_transform(reviews["review"])
    print(features.shape)
    labels = reviews["sentiment"].astype(int)

    penalty = calculate_optimal_penalty(features, labels, max_word_count=1000)

    model = LogisticRegression(C=penalty, penalty='l1', solver='liblinear').fit(features, labels)
    magnitudes = np.abs(model.coef_)[0]
    kept_columns = np.flatnonzero(magnitudes > 0.0001)

    return tfidf.get_feature_names_out()[kept_columns]

In [219]:
# Scratch check: re-print the AUC for the most recent fold.
# NOTE(review): `i`, `test_y`, `predictions` and `roc_auc_score` are not defined
# in this cell; they come from the import/fold-loop cell, so this only runs
# after that cell has been executed.
# predictions[:, 1] is the second column of the predict_proba output.
print(f"Fold {i} AUC Score:", roc_auc_score(test_y, predictions[:, 1]))

Fold 1 AUC Score: 0.9537940564215498


In [221]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV, Ridge
from sklearn.metrics import roc_auc_score

# Stop words removed by process_data when counting n-grams.
stop_words = ["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", 
              "their", "they", "his", "her", "she", "he", "a", "an", "and", "is", "was", "are", 
              "were", "him", "himself", "has", "have", "it", "its", "the", "us"]

# Vocabulary from the t-statistic screen; create_vocab() is the TF-IDF-based
# alternative.
vocab = get_t_stat_vocab()
print(len(vocab))

# Fit on the vocabulary terms themselves so that only those terms (and their
# 1-/2-gram pieces) become features.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2)
)
vectorizer.fit(vocab)

num_folds = 5
for i in range(1, num_folds + 1):
    path = f"data/split_{i}"

    train = pd.read_csv(f"{path}/train.tsv", sep='\t', header=0, dtype=str)
    train['review'] = train['review'].str.replace('&lt;.*?&gt;', ' ', regex=True)
    dtm_train = vectorizer.transform(train['review'])

    # Labels are read as str; cast so they match the integer test labels.
    y_train = train["sentiment"].astype(int)

    # Fit once, on this fold's labels. The model used to be fitted first on a
    # stale notebook-level `y` and then refitted on y_train, which wasted a
    # full cross-validated fit and fails on a fresh kernel.
    # NOTE: earlier runs hit the lbfgs iteration cap (ConvergenceWarning);
    # raise max_iter if the warning persists.
    clf = LogisticRegressionCV(cv=5, max_iter=500)
    clf.fit(dtm_train, y_train)

    # process test data
    test = pd.read_csv(f"{path}/test.tsv", sep='\t', header=0, dtype=str)
    test['review'] = test['review'].str.replace('&lt;.*?&gt;', ' ', regex=True)
    dtm_test = vectorizer.transform(test['review'])

    predictions = clf.predict_proba(dtm_test)

    test_y = pd.read_csv(f"{path}/test_y.tsv", sep='\t', header=0, dtype=str)
    test_y = test_y["sentiment"].astype(int)

    # Column 1 of predict_proba is the probability of the larger class label (1).
    print(f"Fold {i} AUC Score:", roc_auc_score(test_y, predictions[:, 1]))

Done reading data
Done counting data
25000 25000
Done scoring data
Lambda 0.01 : 267
Lambda 0.02 : 445
Lambda 0.03 : 567
Lambda 0.04 : 684
Lambda 0.05 : 776
Lambda 0.060000000000000005 : 855
Lambda 0.06999999999999999 : 909
Lambda 0.08 : 953
Lambda 0.09 : 999
Lambda 0.09999999999999999 : 1037
994


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Fold 1 AUC Score: 0.9584358124781267


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Fold 2 AUC Score: 0.9573686379964697


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Fold 3 AUC Score: 0.9576714617900193


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Fold 4 AUC Score: 0.9591848442783004


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Fold 5 AUC Score: 0.9581559659980069
