In [1]:
import numpy as np
import pandas as pd
import os


if __name__ == '__main__':
    data_dir = 'data_reviews'
    x_train_df = pd.read_csv(os.path.join(data_dir, 'x_train.csv'))
    y_train_df = pd.read_csv(os.path.join(data_dir, 'y_train.csv'))

In [2]:
#load data into python
x_train_df = pd.read_csv('data_reviews/x_train.csv')
#concatenating review to make string processing easier
tr_list_of_sentences = x_train_df['text'].str.cat(sep='\n')
#tr_list_of_sentences

# Preprocessing

In [3]:
'''
Steps:
1. Remove all non-alpha numeric characters from the string
2. Remove stop words
3. Correct spelling

'''

'\nSteps:\n1. Remove all non-alpha numeric characters from the string\n2. Remove stop words\n3. Correct spelling\n\n'

In [4]:
#remove all non-alpha numeric characters from the string
import re

In [5]:
#using regex to remove non_alphanum
def remove_non_alpha_num(reviews_string):
    s = re.sub(r'[^A-Za-z\n\s]+', '', reviews_string)
    s = s.lower()
    return s
               
    
        

In [6]:
#remove stop words
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [7]:
stop_words = set(stopwords.words('english'))
stop_words = {word.replace("'", '') for word in stop_words}
toRemove = {'not', "couldn't", "shouldn't", "didn't"}
for entry in toRemove:
    stop_words.discard(entry)

In [8]:
#https://www.geeksforgeeks.org/removing-stop-words-nltk-python/
def remove_stop_words(reviews_string, remove_words):
    reviews_list = reviews_string.split('\n')
    filtered_sentences = []
    for review in reviews_list:
        r = review.split()
        filtered_words = [w for w in r if w not in remove_words]
        filtered_sentence = ' '.join(filtered_words)
        filtered_sentences.append(filtered_sentence)
    
    filtered_sentences = '\n'.join(filtered_sentences)
    return filtered_sentences
    

In [9]:
v = remove_non_alpha_num(tr_list_of_sentences)
x = remove_stop_words(v, stop_words)



In [10]:
from spellchecker import SpellChecker

In [11]:
#correct spelling
#https://pyspellchecker.readthedocs.io/en/latest/
def correct_spelling(reviews_string):
    reviews_list = reviews_string.split('\n')
    correctly_spelled_sentences = []
    
    speller = SpellChecker()
    
    for review in reviews_list:
        words = review.split()
        correctly_spelled_words = [speller.correction(word) if speller.correction(word) is not None else word for word in words]
        correct_sentence = ' '.join(correctly_spelled_words)
        correctly_spelled_sentences.append(correct_sentence)
    
    corrected_reviews_string = '\n'.join(correctly_spelled_sentences)
    return corrected_reviews_string


In [12]:
#The following code is from the Bag of Words Lab

def create_tok_count(list_of_sentences):
    
    tok_count_dict = dict()

    for line in list_of_sentences:
        tok_list = word_tokenize(line)
        for tok in tok_list:
            if tok in tok_count_dict:
                tok_count_dict[tok] += 1
            else:
                tok_count_dict[tok] = 1
                    
    return tok_count_dict

In [13]:
import afinn

In [15]:

from afinn import Afinn

def determine_sentiment(word):
    afinn = Afinn()
    score = afinn.score(word)
    
    if score > 0:
        return 1  # Positive sentiment
    elif score < 0:
        return -1  # Negative sentiment
    else:
        return 0  # Neutral/Indeterminate sentiment



1


In [51]:
# Example usage
word = "okay"
result = determine_sentiment(word)
print(result)  # 1 for positive sentiment

0


In [16]:
FREQ_THRESHOLD = 2
def build_vocab_list(reviews_string):
    list_of_sentences = reviews_string.split('\n')
    
    tok_count_dict = create_tok_count(list_of_sentences)
    sorted_tokens = list(sorted(tok_count_dict, key=tok_count_dict.get, reverse=True))
    
    vocab_list = [w for w in sorted_tokens[:] if ((tok_count_dict[w] >= FREQ_THRESHOLD) or (determine_sentiment(w) != 0))]
    
    vocab_dict = dict()
    for vocab_id, tok in enumerate(vocab_list):
        vocab_dict[tok] = vocab_id
    
    return vocab_dict, len(list_of_sentences), len(vocab_list)
    

In [17]:

# list_of_sentences = reviews_string.split('\n')
    
# tok_count_dict = create_tok_count(list_of_sentences)
# sorted_tokens = list(sorted(tok_count_dict, key=tok_count_dict.get, reverse=True))

# vocab_list = [w for w in sorted_tokens[:] if tok_count_dict[w] >= FREQ_THRESHOLD]

# vocab_dict = dict()
# for vocab_id, tok in enumerate(vocab_list):
#     vocab_dict[tok] = vocab_id

    

In [18]:
# tok_count_dict[sorted_tokens[0]]
# sorted_tokens[0]

In [19]:
def transform_text_into_feature_vector(text, vocab_dict):
    ''' Produce count feature vector for provided text
    
    Args
    ----
    text : string
        A string of raw text, representing a single 'review'
    vocab_dict : dict with string keys
        If token is in vocabulary, will exist as key in the dict
        If token is not in vocabulary, will not be in the dict

    Returns
    -------
    count_V : 1D numpy array, shape (V,) = (n_vocab,)
        Count vector, indicating how often each vocab word
        appears in the provided text string
    '''
    V = len(vocab_dict.keys())
    count_V = np.zeros(V)
    for tok in word_tokenize(text):
        if tok in vocab_dict:
            vv = vocab_dict[tok]
            count_V[vv] += 1
    return count_V

In [20]:
#DO NOT TOUCH THESE: THEY TAKE A LONG TIME TO RUN

In [36]:
reviews_string = remove_non_alpha_num(tr_list_of_sentences)


In [37]:
reviews_string = correct_spelling(reviews_string)
# reviews_string

In [38]:
reviews_string = remove_stop_words(reviews_string, stop_words)
# list_of_sentences = reviews_string.split('\n')
# list_of_sentences

In [39]:
vocab_dict, N, V = build_vocab_list(reviews_string)
N

2400

In [40]:
x_prepared_NV = np.zeros((N, V))
for nn, raw_text_line in enumerate(reviews_string.split("\n")):
    x_prepared_NV[nn] = transform_text_into_feature_vector(raw_text_line, vocab_dict)

In [41]:
x_prepared_NV.shape

(2400, 2083)

# Creating Model

In [42]:
#interesting model stuff
import sklearn
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from scipy.stats import loguniform
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

In [43]:
y_train_df = pd.read_csv('data_reviews/y_train.csv')
y_tr_N = y_train_df.is_positive_sentiment.values.astype(float)

In [44]:
y_tr_N.shape

(2400,)

In [45]:
model = sklearn.linear_model.LogisticRegression(solver='lbfgs', max_iter=300)
SEED = 2
FOLDS = 15

In [46]:
cRange = loguniform(1e-3, 1e3)
tol = [1e-1,1e-2,1e-3,1e-4,1e-5]

In [47]:

distributions = {
    'C': cRange,
    'penalty': ['l2', 'l1'],
    'tol': tol,
    'fit_intercept': [True, False]
}

In [48]:
curr_search = RandomizedSearchCV(
    estimator = model,
    param_distributions = distributions,
    scoring = 'roc_auc',
    n_iter = 20,
    cv = FOLDS,
    
    )

In [49]:
curr_search.fit(x_prepared_NV, y_tr_N)

135 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
135 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/liam/micromamba/envs/cs135_env/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/liam/micromamba/envs/cs135_env/lib/python3.10/site-packages/sklearn/linear_model/_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/Users/liam/micromamba/envs/cs135_env/lib/python3.10/site-packages/sklearn/linear_model/_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports on

In [50]:
print('Best params:', curr_search.best_params_)
print('Best score:', curr_search.best_score_)
print('Best pipeline:', curr_search.best_estimator_)
print('Index of best pipeline:', curr_search.best_index_)
results=pd.DataFrame(curr_search.cv_results_)
results #see results of test

Best params: {'C': 1.5791593743901453, 'fit_intercept': False, 'penalty': 'l2', 'tol': 0.01}
Best score: 0.8935208333333334
Best pipeline: LogisticRegression(C=1.5791593743901453, fit_intercept=False, max_iter=300,
                   tol=0.01)
Index of best pipeline: 18


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_fit_intercept,param_penalty,param_tol,params,split0_test_score,...,split8_test_score,split9_test_score,split10_test_score,split11_test_score,split12_test_score,split13_test_score,split14_test_score,mean_test_score,std_test_score,rank_test_score
0,0.006247,0.002027,0.0,0.0,1.197304,False,l1,0.1,"{'C': 1.1973040276375166, 'fit_intercept': Fal...",,...,,,,,,,,,,12
1,0.024632,0.003825,0.001977,0.0003,0.005273,False,l2,0.1,"{'C': 0.0052731106174940135, 'fit_intercept': ...",0.877969,...,0.854375,0.801328,0.8675,0.810156,0.859844,0.921562,0.833906,0.847938,0.048676,10
2,0.235654,0.029765,0.003291,0.001909,23.327015,False,l2,1e-05,"{'C': 23.327015042770334, 'fit_intercept': Fal...",0.90125,...,0.868125,0.792422,0.859063,0.860781,0.883438,0.915313,0.899531,0.876083,0.036632,4
3,0.182338,0.010661,0.002216,0.000357,12.877865,False,l2,1e-05,"{'C': 12.87786544417333, 'fit_intercept': Fals...",0.907656,...,0.871563,0.800078,0.865938,0.865625,0.889062,0.919531,0.902344,0.881833,0.035991,3
4,0.006108,0.000618,0.0,0.0,2.608202,False,l1,0.1,"{'C': 2.6082020846697724, 'fit_intercept': Fal...",,...,,,,,,,,,,12
5,0.00583,0.00029,0.0,0.0,0.119078,False,l1,0.1,"{'C': 0.11907839372328827, 'fit_intercept': Fa...",,...,,,,,,,,,,12
6,0.791352,0.11471,0.002505,0.000503,963.347797,True,l2,0.0001,"{'C': 963.3477967347928, 'fit_intercept': True...",0.873906,...,0.852344,0.765234,0.84125,0.828438,0.860156,0.894375,0.875469,0.849823,0.040584,9
7,0.032491,0.003212,0.002076,0.00043,0.002029,True,l2,0.0001,"{'C': 0.002028598799256797, 'fit_intercept': T...",0.872969,...,0.856406,0.800078,0.864531,0.806094,0.856563,0.915,0.831094,0.844344,0.048478,11
8,0.006155,0.000386,0.0,0.0,362.638153,True,l1,0.01,"{'C': 362.6381528529944, 'fit_intercept': True...",,...,,,,,,,,,,12
9,0.104376,0.008372,0.002248,0.000454,0.658522,True,l2,1e-05,"{'C': 0.6585215579447987, 'fit_intercept': Tru...",0.919844,...,0.880156,0.825391,0.890469,0.857344,0.901875,0.942969,0.898438,0.892719,0.036377,2


In [227]:
#getting a leaderboard score:
x_test_df = pd.read_csv('data_reviews/x_test.csv')
test_list_of_sentences = x_test_df['text'].str.cat(sep='\n')

In [228]:
#cleaning test data

In [229]:
test_list_of_sentences = remove_non_alpha_num(test_list_of_sentences)

In [245]:
#test_list_of_sentences = correct_spelling(test_list_of_sentences)

In [231]:
test_list_of_sentences = remove_stop_words(test_list_of_sentences, stop_words)

In [232]:
#determining test data size
list_of_sentences2 = test_list_of_sentences.split('\n')
Z = len(list_of_sentences2)
x_tr_ZV = np.zeros((Z, V))

In [233]:
for nn2, raw_text_line2 in enumerate(list_of_sentences2):
    x_tr_ZV[nn2] = transform_text_into_feature_vector(raw_text_line2, vocab_dict)

In [234]:
yhat_test_N = curr_search.predict_proba(x_tr_ZV)

In [235]:
np.savetxt("yproba1_test.txt", yhat_test_N[:, 1])

In [236]:
train_predictions = curr_search.predict(x_prepared_NV)

In [242]:
b = ~np.equal(train_predictions, y_tr_N)

In [243]:
indices = np.flatnonzero(b)
print(indices)

[  53   54   69   72   95  104  120  128  156  168  193  204  207  263
  266  279  297  304  315  345  348  392  412  451  455  464  466  476
  477  491  492  509  510  518  533  546  561  562  564  571  576  579
  586  587  595  597  621  653  666  672  682  691  701  703  704  714
  750  753  759  760  771  786  797  804  819  854  857  858  870  886
  889  890  899  919  924  927  941  945  953  954  967  970  990  991
  994  996  998 1002 1013 1016 1033 1055 1065 1066 1073 1087 1096 1121
 1124 1125 1126 1135 1136 1139 1169 1171 1198 1199 1227 1229 1233 1235
 1236 1237 1244 1265 1281 1285 1294 1296 1301 1309 1320 1324 1327 1333
 1336 1347 1348 1354 1366 1373 1383 1426 1434 1438 1440 1451 1452 1456
 1459 1464 1465 1472 1493 1498 1514 1515 1523 1529 1534 1538 1539 1542
 1549 1563 1568 1583 1584 1586 1596 1613 1618 1632 1637 1639 1640 1641
 1642 1655 1666 1669 1678 1686 1688 1692 1706 1730 1733 1739 1755 1759
 1768 1772 1796 1817 1832 1834 1859 1881 1929 1961 1978 1989 1993 2001
 2023 

In [52]:
reviews_string.split("\n")[54]

'excellent starter wireless headset'

In [53]:
train_predictions[54]

NameError: name 'train_predictions' is not defined