In [14]:
import numpy as np
import pandas as pd
import os


if __name__ == '__main__':
    # Read the training reviews and their labels from the local data folder.
    data_dir = 'data_reviews'
    x_train_path = os.path.join(data_dir, 'x_train.csv')
    y_train_path = os.path.join(data_dir, 'y_train.csv')
    x_train_df = pd.read_csv(x_train_path)
    y_train_df = pd.read_csv(y_train_path)

In [15]:
# Load the training reviews into Python.
x_train_df = pd.read_csv('data_reviews/x_train.csv')
# Join every review into one newline-delimited string so the cleaning helpers
# can process the whole corpus in a single pass (one line == one review).
# na_rep='' keeps a blank line for a missing review: by default str.cat
# silently omits NaN entries, which would shift every later line relative to
# the rows of the label file.
tr_list_of_sentences = x_train_df['text'].str.cat(sep='\n', na_rep='')

# Preprocessing

In [18]:
# Outline of the intended cleaning pipeline for the review text.
# NOTE(review): the regex in remove_non_alpha_num keeps only letters and
# whitespace, so digits are stripped as well; and the correct_spelling call
# further down is commented out, so step 3 is not applied in the recorded run.
'''
Steps:
1. Remove all non-alpha numeric characters from the string
2. Remove stop words
3. Correct spelling

'''

'\nSteps:\n1. Remove all non-alpha numeric characters from the string\n2. Remove stop words\n3. Correct spelling\n\n'

In [19]:
#remove all non-alpha numeric characters from the string
import re

In [20]:
#using regex to remove non_alphanum
def remove_non_alpha_num(reviews_string):
    """Strip everything except ASCII letters and whitespace, then lowercase.

    Despite the name, digits are removed too: the pattern only whitelists
    A-Z, a-z and whitespace (newlines included), so line boundaries between
    reviews are preserved.

    Args
    ----
    reviews_string : string
        Raw text, possibly containing many newline-separated reviews

    Returns
    -------
    string
        Lowercased text with all other characters deleted (not replaced)
    """
    letters_and_whitespace = re.sub(r'[^A-Za-z\n\s]+', '', reviews_string)
    return letters_and_whitespace.lower()
               
    
        

In [21]:
#remove stop words
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [22]:
# NLTK's English stop words with apostrophes stripped, so they match text
# that has already been through remove_non_alpha_num (which deletes "'").
stop_words = {word.replace("'", '') for word in stopwords.words('english')}
# Negations that should stay in the reviews instead of being filtered out.
toRemove = {'not', 'couldnt', 'shouldnt', 'didnt'}
stop_words.difference_update(toRemove)

In [23]:
#https://www.geeksforgeeks.org/removing-stop-words-nltk-python/
def remove_stop_words(reviews_string, remove_words):
    """Drop unwanted words from every line of a newline-delimited string.

    Args
    ----
    reviews_string : string
        Reviews joined by '\\n', one review per line
    remove_words : container of strings
        Words to delete; matching is exact (case-sensitive)

    Returns
    -------
    string
        Same number of lines as the input. Within each line the surviving
        words are re-joined with single spaces.
    """
    kept_lines = (
        ' '.join(token for token in line.split() if token not in remove_words)
        for line in reviews_string.split('\n')
    )
    return '\n'.join(kept_lines)
    

In [24]:
# Trial run of the first two cleaning steps on the training corpus.
# NOTE(review): `v` and `x` are not referenced again in the visible cells;
# the same two steps are re-run into `reviews_string` further down.
v = remove_non_alpha_num(tr_list_of_sentences)
x = remove_stop_words(v, stop_words)



In [25]:
from spellchecker import SpellChecker

In [34]:
#correct spelling
#https://pyspellchecker.readthedocs.io/en/latest/
def correct_spelling(reviews_string, speller=None):
    """Spell-correct every word of a newline-delimited block of reviews.

    Args
    ----
    reviews_string : string
        Reviews joined by '\\n', one review per line
    speller : object with a ``correction(word)`` method, optional
        Spell checker to use. When omitted a new ``SpellChecker()`` is built.

    Returns
    -------
    corrected_reviews_string : string
        Same number of lines as the input. Each word is replaced by the
        speller's suggestion, or left unchanged when the speller returns
        None. Words within a line are re-joined with single spaces.
    """
    if speller is None:
        speller = SpellChecker()

    # Look each distinct word up only once; the same words recur throughout
    # the corpus and the suggestion for a given word does not change.
    corrections = {}

    def corrected(word):
        if word not in corrections:
            suggestion = speller.correction(word)
            corrections[word] = word if suggestion is None else suggestion
        return corrections[word]

    # Split on the newline character (the previous '/n' literal never matched,
    # which merged every review into a single output line).
    correctly_spelled_sentences = [
        ' '.join(corrected(word) for word in review.split())
        for review in reviews_string.split('\n')
    ]
    return '\n'.join(correctly_spelled_sentences)


In [47]:
#The following code is from the Bag of Words Lab

def create_tok_count(list_of_sentences):
    """Count how often each token occurs across a collection of sentences.

    Args
    ----
    list_of_sentences : iterable of strings
        Each entry is tokenized with ``word_tokenize``

    Returns
    -------
    dict
        Maps token -> number of occurrences over all sentences
    """
    counts = {}
    for sentence in list_of_sentences:
        for token in word_tokenize(sentence):
            counts[token] = counts.get(token, 0) + 1
    return counts

In [100]:
FREQ_THRESHOLD = 2
def build_vocab_list(reviews_string):
    """Build a frequency-ordered vocabulary from newline-delimited reviews.

    Args
    ----
    reviews_string : string
        Reviews joined by '\\n', one review per line

    Returns
    -------
    vocab_dict : dict with string keys
        Maps each kept token to its integer id; ids follow descending
        token count
    n_sentences : int
        Number of lines in ``reviews_string``
    n_vocab : int
        Number of tokens whose count is at least FREQ_THRESHOLD
    """
    list_of_sentences = reviews_string.split('\n')
    tok_count_dict = create_tok_count(list_of_sentences)

    # Most frequent tokens first.
    by_frequency = sorted(tok_count_dict, key=tok_count_dict.get, reverse=True)
    vocab_list = [tok for tok in by_frequency if tok_count_dict[tok] >= FREQ_THRESHOLD]
    vocab_dict = {tok: vocab_id for vocab_id, tok in enumerate(vocab_list)}

    return vocab_dict, len(list_of_sentences), len(vocab_list)
    

In [54]:
def transform_text_into_feature_vector(text, vocab_dict):
    ''' Build a bag-of-words count vector for one piece of text

    Args
    ----
    text : string
        Text of a single review
    vocab_dict : dict with string keys
        Maps every in-vocabulary token to its column index;
        tokens missing from the dict are ignored

    Returns
    -------
    count_V : 1D numpy array, shape (V,) = (n_vocab,)
        Entry v holds the number of times the token with index v
        occurs in ``text``
    '''
    n_vocab = len(vocab_dict)
    count_V = np.zeros(n_vocab)
    in_vocab_ids = [vocab_dict[tok] for tok in word_tokenize(text) if tok in vocab_dict]
    for vocab_id in in_vocab_ids:
        count_V[vocab_id] += 1
    return count_V

In [None]:
#DO NOT TOUCH THESE: THEY TAKE A LONG TIME TO RUN

In [90]:
# Step 1 on the full training corpus: keep only letters/whitespace and lowercase.
reviews_string = remove_non_alpha_num(tr_list_of_sentences)


In [91]:
# reviews_string = correct_spelling(reviews_string)
# reviews_string

In [101]:
# Step 2: drop stop words from every review line. The number of lines is
# unchanged, because remove_stop_words splits and re-joins on '\n'.
reviews_string = remove_stop_words(reviews_string, stop_words)
# list_of_sentences = reviews_string.split('\n')
# list_of_sentences

In [102]:
# Build the token -> column-index vocabulary from the cleaned training text.
# N is the number of review lines, V the number of vocabulary tokens.
vocab_dict, N, V = build_vocab_list(reviews_string)
N

2400

In [103]:
# Training feature matrix: one row per review line, one column per vocab token.
# Row nn is the count vector of the nn-th line of the cleaned corpus; assigning
# into the preallocated (N, V) array fails if a vector's length is not V.
x_prepared_NV = np.zeros((N, V))
for nn, raw_text_line in enumerate(reviews_string.split("\n")):
    x_prepared_NV[nn] = transform_text_into_feature_vector(raw_text_line, vocab_dict)

In [104]:
# Sanity check: should display (N, V).
x_prepared_NV.shape

(2400, 1757)

# Creating Model

In [105]:
#interesting model stuff
import sklearn
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from scipy.stats import loguniform
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

In [106]:
# Read the training labels and pull the sentiment column out as a float array.
y_train_df = pd.read_csv('data_reviews/y_train.csv')
y_tr_N = y_train_df['is_positive_sentiment'].to_numpy().astype(float)

In [107]:
# Sanity check: the label vector should have one entry per training review.
y_tr_N.shape

(2400,)

In [108]:
# Baseline logistic-regression estimator and cross-validation settings.
# NOTE(review): `model` is not referenced again in the visible cells (the
# search uses `pipeRand` as its estimator), and SEED is reassigned to 1 in
# the next cell before it is first used.
model = sklearn.linear_model.LogisticRegression(solver='lbfgs')
SEED = 2
FOLDS = 15  # passed to RandomizedSearchCV as `cv`

In [109]:
# Search space for the randomized hyperparameter search.
cRange = loguniform(1e-3, 1e3)  # C is sampled log-uniformly between 1e-3 and 1e3
tol = [1e-1, 1e-2, 1e-3, 1e-4, 1e-5]
SEED = 1
param_rand = {
    'clf__C': cRange,
    'clf__tol': tol,
    'clf__fit_intercept': [True, False],
}

# One-step pipeline; the 'clf__' prefix of the keys above addresses this step.
pipeRand = Pipeline([
    ['clf', sklearn.linear_model.LogisticRegression(penalty='l2', max_iter=400)],
])

In [110]:
# Hyperparameter candidates for the randomized search; each key is
# '<pipeline step name>__<estimator parameter>'.
param_rand = {'clf__C': cRange, 'clf__tol': tol, 'clf__fit_intercept': [True, False]}

In [111]:
# Randomized hyperparameter search over the pipeline, scored by ROC AUC with
# FOLDS-fold cross-validation; SEED is passed as random_state for repeatability.
curr_search = RandomizedSearchCV(
    pipeRand,
    param_rand,
    cv=FOLDS,
    scoring='roc_auc',
    random_state=SEED,
)

In [115]:
# Run the cross-validated search on the training features and labels.
curr_search.fit(x_prepared_NV, y_tr_N)

In [116]:
# Summarize the winning configuration, then show the per-candidate CV table.
best_summary = [
    ('Best params:', curr_search.best_params_),
    ('Best score:', curr_search.best_score_),
    ('Best pipeline:', curr_search.best_estimator_),
    ('Index of best pipeline:', curr_search.best_index_),
]
for label, value in best_summary:
    print(label, value)
results = pd.DataFrame(curr_search.cv_results_)
results  # see results of test

Best params: {'clf__C': 0.32746665712253886, 'clf__fit_intercept': True, 'clf__tol': 1e-05}
Best score: 0.8861875
Best pipeline: Pipeline(steps=[['clf',
                 LogisticRegression(C=0.32746665712253886, max_iter=400,
                                    tol=1e-05)]])
Index of best pipeline: 4


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__C,param_clf__fit_intercept,param_clf__tol,params,split0_test_score,split1_test_score,...,split8_test_score,split9_test_score,split10_test_score,split11_test_score,split12_test_score,split13_test_score,split14_test_score,mean_test_score,std_test_score,rank_test_score
0,0.048083,0.00884,0.001857,0.000274,0.317784,True,0.1,"{'clf__C': 0.3177840006884069, 'clf__fit_inter...",0.913438,0.943594,...,0.880625,0.826797,0.88375,0.845,0.89625,0.943281,0.8825,0.88599,0.03693,2
1,0.020796,0.001393,0.001989,0.00035,0.001002,False,0.1,"{'clf__C': 0.001001581395585897, 'clf__fit_int...",0.872969,0.896406,...,0.861406,0.806953,0.86375,0.807969,0.847812,0.909844,0.819531,0.842542,0.045052,9
2,0.0299,0.002537,0.002209,0.000338,0.003581,True,1e-05,"{'clf__C': 0.0035812246787002297, 'clf__fit_in...",0.87625,0.9,...,0.861563,0.807734,0.865156,0.811094,0.851875,0.915,0.822969,0.84549,0.044943,7
3,0.060386,0.005993,0.001998,0.000439,0.240218,True,1e-05,"{'clf__C': 0.24021761202431602, 'clf__fit_inte...",0.909531,0.942969,...,0.877656,0.824297,0.883906,0.84125,0.894219,0.942344,0.877188,0.883396,0.038142,4
4,0.064078,0.005566,0.002008,0.000368,0.327467,True,1e-05,"{'clf__C': 0.32746665712253886, 'clf__fit_inte...",0.913281,0.943594,...,0.880781,0.827109,0.884219,0.845469,0.896094,0.943281,0.883125,0.886188,0.036739,1
5,0.034405,0.004149,0.001899,0.000249,0.016854,True,1e-05,"{'clf__C': 0.01685440782816938, 'clf__fit_inte...",0.887813,0.915625,...,0.859062,0.812109,0.87,0.817188,0.863594,0.929688,0.828906,0.856312,0.044312,5
6,0.024054,0.001534,0.0018,0.000219,0.00146,True,1e-05,"{'clf__C': 0.0014599082378876644, 'clf__fit_in...",0.873437,0.896563,...,0.861875,0.807578,0.863594,0.808281,0.848281,0.91125,0.82,0.843021,0.044979,8
7,0.047137,0.004659,0.001872,0.000299,0.319028,False,0.01,"{'clf__C': 0.3190280094957601, 'clf__fit_inter...",0.913594,0.941719,...,0.880937,0.826797,0.885938,0.842969,0.897656,0.944063,0.880625,0.885583,0.036861,3
8,0.028533,0.003484,0.001942,0.0002,0.006955,True,0.01,"{'clf__C': 0.006955392321661605, 'clf__fit_int...",0.880937,0.904844,...,0.860313,0.808516,0.867344,0.813438,0.856875,0.920469,0.824688,0.849104,0.045032,6
9,0.222698,0.020187,0.002248,0.000433,645.014465,False,0.01,"{'clf__C': 645.0144652189368, 'clf__fit_interc...",0.860781,0.844375,...,0.817187,0.740547,0.805469,0.814688,0.815469,0.877188,0.847187,0.821146,0.03195,10


In [117]:
#getting a leaderboard score:
# Load the held-out test reviews.
x_test_df = pd.read_csv('data_reviews/x_test.csv')
# One review per line. na_rep='' keeps a blank line for a missing review:
# by default str.cat silently omits NaN entries, which would make the number
# of lines (and therefore of predictions) smaller than the number of test rows.
test_list_of_sentences = x_test_df['text'].str.cat(sep='\n', na_rep='')

In [126]:
#cleaning test data

In [118]:
# Same character filtering and lowercasing that was applied to the training text.
test_list_of_sentences = remove_non_alpha_num(test_list_of_sentences)

In [120]:
# Same stop-word removal that was applied to the training text.
test_list_of_sentences = remove_stop_words(test_list_of_sentences, stop_words)

In [122]:
#determining test data size
# Z is the number of test review lines; V is the training vocabulary size.
list_of_sentences2 = test_list_of_sentences.split('\n')
Z = len(list_of_sentences2)
# NOTE(review): despite the `tr` in its name, this matrix holds the TEST features.
x_tr_ZV = np.zeros((Z, V))

In [123]:
# Encode each test review with the vocabulary built from the training text;
# tokens that are not in vocab_dict are ignored by the transform.
for nn2, raw_text_line2 in enumerate(list_of_sentences2):
    x_tr_ZV[nn2] = transform_text_into_feature_vector(raw_text_line2, vocab_dict)

In [124]:
# Class-probability predictions from the fitted search object for the test
# matrix; the result is 2-D (it is indexed as [:, 1] when saved).
yhat_test_N = curr_search.predict_proba(x_tr_ZV)

In [125]:
# Write column 1 of the predicted probabilities, one value per line.
# NOTE(review): presumably column 1 is P(is_positive_sentiment == 1) — confirm
# against the fitted estimator's classes_ ordering.
np.savetxt("yproba1_test.txt", yhat_test_N[:, 1])