In [14]:
import numpy as np
import pandas as pd
import os


if __name__ == '__main__':
    data_dir = 'data_reviews'
    x_train_df = pd.read_csv(os.path.join(data_dir, 'x_train.csv'))
    y_train_df = pd.read_csv(os.path.join(data_dir, 'y_train.csv'))

In [15]:
#load data into python
x_train_df = pd.read_csv('data_reviews/x_train.csv')
#concatenating review to make string processing easier
tr_list_of_sentences = x_train_df['text'].str.cat(sep='\n')
#tr_list_of_sentences

# Preprocessing

In [18]:
'''
Steps:
1. Remove all non-alpha numeric characters from the string
2. Remove stop words
3. Correct spelling

'''

'\nSteps:\n1. Remove all non-alpha numeric characters from the string\n2. Remove stop words\n3. Correct spelling\n\n'

In [19]:
#remove all non-alpha numeric characters from the string
import re

In [20]:
#using regex to remove non_alphanum
def remove_non_alpha_num(reviews_string):
    s = re.sub(r'[^A-Za-z\n\s]+', '', reviews_string)
    s = s.lower()
    return s
               
    
        

In [21]:
#remove stop words
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [22]:
stop_words = set(stopwords.words('english'))
stop_words = {word.replace("'", '') for word in stop_words}
toRemove = {'not', "couldn't", "shouldn't", "didn't"}
for entry in toRemove:
    stop_words.discard(entry)

In [23]:
#https://www.geeksforgeeks.org/removing-stop-words-nltk-python/
def remove_stop_words(reviews_string, remove_words):
    reviews_list = reviews_string.split('\n')
    filtered_sentences = []
    for review in reviews_list:
        r = review.split()
        filtered_words = [w for w in r if w not in remove_words]
        filtered_sentence = ' '.join(filtered_words)
        filtered_sentences.append(filtered_sentence)
    
    filtered_sentences = '\n'.join(filtered_sentences)
    return filtered_sentences
    

In [24]:
v = remove_non_alpha_num(tr_list_of_sentences)
x = remove_stop_words(v, stop_words)



In [25]:
from spellchecker import SpellChecker

In [136]:
#correct spelling
#https://pyspellchecker.readthedocs.io/en/latest/
def correct_spelling(reviews_string):
    reviews_list = reviews_string.split('\n')
    correctly_spelled_sentences = []
    
    speller = SpellChecker()
    
    for review in reviews_list:
        words = review.split()
        correctly_spelled_words = [speller.correction(word) if speller.correction(word) is not None else word for word in words]
        correct_sentence = ' '.join(correctly_spelled_words)
        correctly_spelled_sentences.append(correct_sentence)
    
    corrected_reviews_string = '\n'.join(correctly_spelled_sentences)
    return corrected_reviews_string


In [47]:
#The following code is from the Bag of Words Lab

def create_tok_count(list_of_sentences):
    
    tok_count_dict = dict()

    for line in list_of_sentences:
        tok_list = word_tokenize(line)
        for tok in tok_list:
            if tok in tok_count_dict:
                tok_count_dict[tok] += 1
            else:
                tok_count_dict[tok] = 1
                    
    return tok_count_dict

In [100]:
FREQ_THRESHOLD = 2
def build_vocab_list(reviews_string):
    list_of_sentences = reviews_string.split('\n')
    
    tok_count_dict = create_tok_count(list_of_sentences)
    sorted_tokens = list(sorted(tok_count_dict, key=tok_count_dict.get, reverse=True))
    
    vocab_list = [w for w in sorted_tokens[:] if tok_count_dict[w] >= FREQ_THRESHOLD]
    
    vocab_dict = dict()
    for vocab_id, tok in enumerate(vocab_list):
        vocab_dict[tok] = vocab_id
    
    return vocab_dict, len(list_of_sentences), len(vocab_list)
    

In [54]:
def transform_text_into_feature_vector(text, vocab_dict):
    ''' Produce count feature vector for provided text
    
    Args
    ----
    text : string
        A string of raw text, representing a single 'review'
    vocab_dict : dict with string keys
        If token is in vocabulary, will exist as key in the dict
        If token is not in vocabulary, will not be in the dict

    Returns
    -------
    count_V : 1D numpy array, shape (V,) = (n_vocab,)
        Count vector, indicating how often each vocab word
        appears in the provided text string
    '''
    V = len(vocab_dict.keys())
    count_V = np.zeros(V)
    for tok in word_tokenize(text):
        if tok in vocab_dict:
            vv = vocab_dict[tok]
            count_V[vv] += 1
    return count_V

In [None]:
#DO NOT TOUCH THESE: THEY TAKE A LONG TIME TO RUN

In [137]:
reviews_string = remove_non_alpha_num(tr_list_of_sentences)


In [138]:
reviews_string = correct_spelling(reviews_string)
# reviews_string

In [139]:
reviews_string = remove_stop_words(reviews_string, stop_words)
# list_of_sentences = reviews_string.split('\n')
# list_of_sentences

In [140]:
vocab_dict, N, V = build_vocab_list(reviews_string)
N

2400

In [141]:
x_prepared_NV = np.zeros((N, V))
for nn, raw_text_line in enumerate(reviews_string.split("\n")):
    x_prepared_NV[nn] = transform_text_into_feature_vector(raw_text_line, vocab_dict)

In [142]:
x_prepared_NV.shape

(2400, 1781)

# Creating Model

In [143]:
#interesting model stuff
import sklearn
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from scipy.stats import loguniform
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

In [144]:
y_train_df = pd.read_csv('data_reviews/y_train.csv')
y_tr_N = y_train_df.is_positive_sentiment.values.astype(float)

In [202]:
y_tr_N.shape

(2400,)

In [203]:
model = sklearn.linear_model.LogisticRegression(solver='lbfgs', max_iter=300)
SEED = 2
FOLDS = 15

In [204]:
cRange = loguniform(1e-3, 1e3)
tol = [1e-1,1e-2,1e-3,1e-4,1e-5]

In [205]:

distributions = {
    'C': cRange,
    'penalty': ['l2', 'l1'],
    'tol': tol,
    'fit_intercept': [True, False]
}

In [206]:
curr_search = RandomizedSearchCV(
    estimator = model,
    param_distributions = distributions,
    scoring = 'roc_auc',
    n_iter = 50,
    cv = FOLDS,
    
    )

In [207]:
curr_search.fit(x_prepared_NV, y_tr_N)

375 fits failed out of a total of 750.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
375 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/liam/micromamba/envs/cs135_env/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/liam/micromamba/envs/cs135_env/lib/python3.10/site-packages/sklearn/linear_model/_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/Users/liam/micromamba/envs/cs135_env/lib/python3.10/site-packages/sklearn/linear_model/_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports on

In [208]:
print('Best params:', curr_search.best_params_)
print('Best score:', curr_search.best_score_)
print('Best pipeline:', curr_search.best_estimator_)
print('Index of best pipeline:', curr_search.best_index_)
results=pd.DataFrame(curr_search.cv_results_)
results #see results of test

Best params: {'C': 1.3434794015459135, 'fit_intercept': False, 'penalty': 'l2', 'tol': 0.0001}
Best score: 0.8934062500000001
Best pipeline: LogisticRegression(C=1.3434794015459135, fit_intercept=False, max_iter=300)
Index of best pipeline: 7


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_fit_intercept,param_penalty,param_tol,params,split0_test_score,...,split8_test_score,split9_test_score,split10_test_score,split11_test_score,split12_test_score,split13_test_score,split14_test_score,mean_test_score,std_test_score,rank_test_score
0,0.005942,0.001095,0.0,0.0,157.491551,True,l1,1e-05,"{'C': 157.49155121108433, 'fit_intercept': Tru...",,...,,,,,,,,,,26
1,0.141187,0.013001,0.002022,0.000364,131.214466,True,l2,0.1,"{'C': 131.21446555641916, 'fit_intercept': Tru...",0.876875,...,0.839219,0.772422,0.830156,0.8475,0.845938,0.907813,0.880938,0.855521,0.03741,18
2,0.005063,0.000218,0.0,0.0,0.819606,False,l1,0.01,"{'C': 0.8196055021573002, 'fit_intercept': Fal...",,...,,,,,,,,,,26
3,0.405981,0.021325,0.001722,2.2e-05,159.46249,True,l2,0.0001,"{'C': 159.46248998399543, 'fit_intercept': Tru...",0.875937,...,0.837812,0.763203,0.829062,0.849375,0.839531,0.904063,0.878281,0.851927,0.039255,20
4,0.135038,0.012489,0.001703,6.1e-05,206.305268,True,l2,0.1,"{'C': 206.30526809662544, 'fit_intercept': Tru...",0.871875,...,0.836875,0.772109,0.827656,0.845938,0.840938,0.905469,0.876563,0.851271,0.037497,21
5,0.00508,0.000182,0.0,0.0,0.201669,False,l1,1e-05,"{'C': 0.20166923475102697, 'fit_intercept': Fa...",,...,,,,,,,,,,26
6,0.004989,5.8e-05,0.0,0.0,0.193338,True,l1,0.1,"{'C': 0.19333772449847308, 'fit_intercept': Tr...",,...,,,,,,,,,,26
7,0.074872,0.001674,0.001684,2.1e-05,1.343479,False,l2,0.0001,"{'C': 1.3434794015459135, 'fit_intercept': Fal...",0.922656,...,0.882188,0.823516,0.886563,0.861562,0.905937,0.941094,0.899375,0.893406,0.03551,1
8,0.182483,0.009978,0.001778,0.000188,686.545303,False,l2,0.01,"{'C': 686.5453031801089, 'fit_intercept': Fals...",0.862031,...,0.823437,0.742109,0.820937,0.834219,0.824531,0.889531,0.865156,0.836979,0.041101,25
9,0.005385,0.000205,0.0,0.0,31.635197,True,l1,0.01,"{'C': 31.635196806996802, 'fit_intercept': Tru...",,...,,,,,,,,,,26


In [152]:
#getting a leaderboard score:
x_test_df = pd.read_csv('data_reviews/x_test.csv')
test_list_of_sentences = x_test_df['text'].str.cat(sep='\n')

In [153]:
#cleaning test data

In [154]:
test_list_of_sentences = remove_non_alpha_num(test_list_of_sentences)

In [156]:
test_list_of_sentences = correct_spelling(test_list_of_sentences)

In [157]:
test_list_of_sentences = remove_stop_words(test_list_of_sentences, stop_words)

In [158]:
#determining test data size
list_of_sentences2 = test_list_of_sentences.split('\n')
Z = len(list_of_sentences2)
x_tr_ZV = np.zeros((Z, V))

In [159]:
for nn2, raw_text_line2 in enumerate(list_of_sentences2):
    x_tr_ZV[nn2] = transform_text_into_feature_vector(raw_text_line2, vocab_dict)

In [160]:
yhat_test_N = curr_search.predict_proba(x_tr_ZV)

In [161]:
np.savetxt("yproba1_test.txt", yhat_test_N[:, 1])