In [56]:
!pip install pandas scikit-learn numpy nltk hyperopt
import pandas as pd
import numpy as np
import nltk




In [2]:
from nltk.corpus import stopwords
from nltk import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize

In [58]:
import hyperopt

In [3]:
"""
handle_text: takes out stop words and punctuation, lemmatizes, and converts to lowercase
* input: string
* output: hanlded string
"""
def handle_text(text):
    # Check that argument provided is a string
    if not isinstance(text,str):
        return ""
    output = []
    stop_words = set(stopwords.words('english'))
    wordnet_lemmatizer = WordNetLemmatizer()
    for word in nltk.word_tokenize(text): 
        # Disclude stop words and words with punctuation
        word = word.lower()
        if word not in stop_words and word.isalpha():
            # Add lemmatized words
            output.append(wordnet_lemmatizer.lemmatize(word))
    return " ".join(output) # return the list of words as one single string

In [4]:
"""
convert_text: applies handle_text to all summaries and reviews in dataframe
* input: dataframe
* output: new dataframe with converted summaries/reviews
"""
def convert_text(df):
    df = df.assign(summary=df['summary'].apply(handle_text))
    df = df.assign(reviewText=df['reviewText'].apply(handle_text))
    return df

In [5]:
"""
get_uncommon_words: gets all words that occur less than the 'max' times in the data
* inputs: list_of_strings to get the uncommon words from
         max: integer of the max number of times a word can occur and still be considered 'uncommon'
* output: dictionary key set of uncommon words
"""
def get_uncommon_words(list_of_strings, max):
    uncommon_words = {} # dictionary of uncommon words where key=word, item=frequency
    seen = {""} # set of 'seen' words to avoid re-adding words to uncommon_words
    for string in list_of_strings:
        for word in nltk.word_tokenize(string): 
            count = uncommon_words.get(word)
            # If we haven't seen the word yet, add it to seen and uncommon_words
            if word not in seen: 
                seen.add(word)
                uncommon_words.update({word: 1})
            # If it's already been seen the max number of times, take it out of uncommon_words
            elif count == max: 
                uncommon_words.pop(word)
            # If it hasn't been seen max times, increment its count in uncommon_words
            elif count is not None:
                uncommon_words.update({word: count+1})
    return uncommon_words.keys() # return only the words, not the counts

In [6]:
"""
remove_words: removes all words in provided list from a given string
* inputs: string (to remove words from)
          words_to_remove: list of strings indicating which words to remove
* output: string (without provided words)
"""
def remove_words(string, words_to_remove):
    list_of_words = nltk.word_tokenize(string)
    for word in list_of_words: 
        # remove all words that occur in words_to_remove
        if word in words_to_remove: 
            list_of_words.remove(word)
    return " ".join(list_of_words) # return as a string

In [7]:
"""
remove_words_summaries: removes all words in provided list from the dataframe's summaries
* inputs: dataframe (to remove words from)
          words_to_remove: list of strings indicating which words to remove
* output: new dataframe (without the given words)
"""
def remove_words_reviews(df, words_to_remove):
    return df.assign(reviewText=df['reviewText'].apply(remove_words, words_to_remove=words_to_remove))

In [8]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split

In [106]:
"""
combine_summaries: gets a list where each item corresponds to all the summaries for one product
* input: dataframe
* output: list of product summaries
"""
def combine_summaries(train_df, test_df):
#    df, vestorizer
    #aggregate reviewTest based on amazon-id
    combined_train_df = train_df.groupby('amazon-id').agg({'reviewText': ' '.join, 'summary': ' '.join})
    vectorizer = TfidfVectorizer(max_features=8000)
    
    #transform it into an np.array then fit it into matrix
    review_vector = np.array(vectorizer.fit_transform(combined_train_df['reviewText']).toarray())
    #saved_train_matrix = vectorizer.fit(combined_train_df['reviewText'])
    summary_vector = np.array(vectorizer.transform(combined_train_df['summary']).toarray())
    
    
    train_vector = review_vector + summary_vector
    #train_vector = np.array(vectorizer.fit_transform(combined_train_df['reviewText']).toarray())
    
    combined_test_df = test_df.groupby('amazon-id').agg({'reviewText': ' '.join, 'summary': ' '.join})

    #transform it into an np.array then fit it into matrix
    review_vector = np.array(vectorizer.fit_transform(combined_test_df['reviewText']).toarray())
    #saved_test_matrix = vectorizer.fit(combined_test_df['reviewText'])
    summary_test_vector = np.array(vectorizer.transform(combined_test_df['summary']).toarray())

    #combine review and summary
    test_vector = np.array(vectorizer.fit_transform(combined_test_df['reviewText']).toarray())
    print("Test vector length: ", len(test_vector))
    print("Train vector length: ", len(train_vector))
    return train_vector, test_vector

In [96]:
"""
awesome_products: gets a list of whether each product is awesome or not
* input: dataframe
* output: list of whether product is awesome or not 
"""
def awesome_products(df):
    df = df.groupby('amazon-id').agg({'overall': lambda x: 1 if np.mean(x) > 4.5 else 0})
    return df['overall']

In [11]:
from sklearn.naive_bayes import GaussianNB

In [11]:
"""
prep_df: gets dataframe ready for predictions by handling the summary and review text 
* input: dataframe
* output: handled dataframe
"""
def prep_df(df):
    # Convert to lowercase, remove punctuation and stop words, lemmatize words
    df = convert_text(df)
    uncommon_words = get_uncommon_words(df['reviewText'], 2)
     # Remove uncommon words
    return remove_words_reviews(df, uncommon_words)

In [12]:
from sklearn.svm import LinearSVC

In [13]:
from sklearn.metrics import confusion_matrix, f1_score, classification_report

In [91]:
"""
make_prediction: makes prection based on training and test data
* inputs: x_train, y_train - training data, used to make the model
          x_test - test data to make prediction from
* output: predicted y values based on test data

make_prediction separates the model-making from the predicting in order to save space

"""
def make_prediction(x_train, y_train, x_test):
    model = LinearSVC()
    model.fit(x_train, y_train)
    return model, model.predict(x_test)

In [114]:
"""
test_model(): runs the model on training data and prints f1-score and classification report
"""
def test_model():
    df = pd.read_csv('datasets/Train.csv')
    df = prep_df(df)
    train_vector, test_vector= combine_summaries(df, df)
    
    x_train, x_test, y_train, y_test = train_test_split(train_vector, awesome_products(df))
    model, y_pred = make_prediction(x_train, y_train, x_test)
    
    print("f1_score=", f1_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    return x_train, x_test, y_train, y_test, model, train_vector

In [108]:
x_train, x_test, y_train, y_test, model, train_vector = test_model()

Test vector length:  10543
Train vector shape:  10543
Awesome length:  amazon-id
-9217723718720870868    0
-9215746463819797371    1
-9213978596308513604    0
-9211290576571923870    0
-9208769561690910545    0
-9208198150317071838    1
-9207623668807116759    1
-9203846742259231188    0
-9202734100323412002    1
-9198322104429679287    0
-9196816809534721621    1
-9195787036214670721    0
-9195715895034041804    0
-9185450485401072051    0
-9185028087385392712    1
-9184754614868031288    0
-9181938828093623181    0
-9177871146610584170    0
-9175284111494784505    1
-9171391896663756617    1
-9171133252298665868    0
-9170624073749960483    0
-9168022306965593241    0
-9164485698573154140    1
-9163887239523687605    1
-9163211980141980025    1
-9163010614846223784    0
-9162759304190308282    0
-9161926972130019940    0
-9157117359304219671    1
                       ..
 9182231784957840868    0
 9183110027749334191    0
 9183152728540863858    0
 9183587774518525868    1
 91846797

In [104]:
"""
run_model(): runs the model on testing data and outputs predictions.csv file
"""
def run_model(): 
    df_train = pd.read_csv('datasets/Train.csv')
    print("Prepping train df")
    df_train = prep_df(df_train)
    print("Done")
    
#   x_train = combine_summaries(df_train)
    print("getting y train")
    y_train = awesome_products(df_train)
    print("Done")
    
    
    df_test = pd.read_csv('datasets/Test.csv')
    print("Prepping test df")
    test_df = prep_df(df_test)
    print("Done")
    #x_test = combine_summaries(df_test)
    
    print("Combining summaries")
    x_train, x_test = combine_summaries(df_train, test_df)
    #print("x_train")
    #print(x_train)
    #print("x_test")
    #print(len(x_test))
    #print("y_train")
    print(len(y_train))
    print("Done")
    
    print("Making prediction")
    y_pred = make_prediction(x_train, y_train, x_test)
    print("y_pred")
    print(len(y_pred))
    print("Done")
    #output = pd.DataFrame({'amazon-id': df_test["amazon-id"].drop_duplicates(), 'Awesome': y_pred})
    
    print("Writing output")
    output = pd.DataFrame({'amazon-id': df_test["amazon-id"].drop_duplicates().reset_index(drop=True), 'Awesome': y_pred})
    output.to_csv('predictions.csv')
    print("Done")
    

In [111]:
# Hyper param tuning

loss = ['hinge', 'squared_hinge']
cs = [0.01, 0.02, 0.03, 0.04, 0.05, 0.1, 1]
tol = [0.00005, 0.00007, 0.0001]
param_grid = {'loss': loss, 'C': cs, 'tol': tol}


    

In [79]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

In [84]:
random_SVC_class = RandomizedSearchCV(
    estimator = model,
    param_distributions = param_grid,
    n_iter = 10,
    scoring='accuracy', n_jobs=4, cv = 3, refit=True, return_train_score = True)

In [85]:
random_SVC_class.fit(x_train, y_train)
print(random_SVC_class.best_estimator_)
print(random_SVC_class.best_score_)



LinearSVC(C=0.04, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=5e-05,
          verbose=0)
0.7163273049196914


In [112]:
grid_SVC_class = GridSearchCV(
    estimator= model,
    param_grid= param_grid,
    scoring='accuracy',
    n_jobs=4,
    cv=5,
    refit= True, return_train_score= True)

In [113]:
grid_SVC_class.fit(x_train, y_train)
print(grid_SVC_class.best_estimator_)
print(grid_SVC_class.best_score_)

LinearSVC(C=0.05, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=5e-05,
          verbose=0)
0.7266978626533451


In [105]:
run_model()

Prepping train df
Done
getting y train
Awesome length:  amazon-id
-9217723718720870868    0
-9215746463819797371    1
-9213978596308513604    0
-9211290576571923870    0
-9208769561690910545    0
-9208198150317071838    1
-9207623668807116759    1
-9203846742259231188    0
-9202734100323412002    1
-9198322104429679287    0
-9196816809534721621    1
-9195787036214670721    0
-9195715895034041804    0
-9185450485401072051    0
-9185028087385392712    1
-9184754614868031288    0
-9181938828093623181    0
-9177871146610584170    0
-9175284111494784505    1
-9171391896663756617    1
-9171133252298665868    0
-9170624073749960483    0
-9168022306965593241    0
-9164485698573154140    1
-9163887239523687605    1
-9163211980141980025    1
-9163010614846223784    0
-9162759304190308282    0
-9161926972130019940    0
-9157117359304219671    1
                       ..
 9182231784957840868    0
 9183110027749334191    0
 9183152728540863858    0
 9183587774518525868    1
 9184679746257273506    

ValueError: array length 2 does not match index length 1172

# INITIAL, ATTEMPTED, AND FAILED CODES


Model Evaluation:

We performed k fold cross validation on the model. However, we were unsure about the ideal k value for cross validation so we had another method to compute the k value that yields the highest accuracy between a specified range.

Unfortunately, both methods only worked on smaller subsets of the data, and crashed when the method was called on the complete dataset.

In [None]:
"""
evaluate_model: performs k fold cross validation  
* input: x_train, x_test, y_test, model, cv
* output: the mean of running cross validation across 'cv' folds         
"""

def evaluate_model(x_train, x_test, y_test, model, cv):
#     matrix, saved_matrix = word_matrix(x_train)
#     X = saved_matrix.transform(x_test).toarray()
    scores = cross_val_score(model, X, y_test, scoring="accuracy", cv= cv)
    return scores.mean()

In [None]:
"""
compare_to_ideal: computes the best k value for k fold cross validation within a given range
* input: minK: minimum k value in a range
         maxK: maximum k value in a range
         x_test
         x_train
         y_test
* output: dev_means: dictionary of the means of cross validation of different k values
          the k value with the highest accuracy
"""
def compare_to_idealK(minK, maxK, x_test, x_train, y_test, model):
     #ideal_mean =  evaluate_model(x_train, x_test, y_test, model, LeaveOneOut())
    folds= range(minK, maxK)
    dev_means={}
    
    for k in folds:
        cv = KFold(n_splits=k, shuffle= True, random_state=1)
        k_mean= evaluate_model(x_train, x_test, y_test, model, cv)
        dev_means[k] = k_mean #abs(ideal_mean - k_mean)
        minK= minK+1
    
    return dev_means, max(dev_means, key=dev_means.get)

CountVectorizer and GaussianNB approach: our first attempt at predicting from summary data-- we ultimately changed to TFIDF and LinearSVC because this approach was only getting us F1-scores of around .4-.5

In [None]:
"""
word_matrix: converts a list of words to a matrix of 0s and 1s 
* input: list_of_words to convert
* outputs: array of 0s and 1s of words in list_of_words
           saved matrix: saves how we fit training data to the model, will be
                         used to transform the test data to fit the model
"""
def word_matrix(list_of_words): 
    vectorizer = CountVectorizer()
    # Convert words to matrix form
    matrix = vectorizer.fit_transform(list_of_words)
    # Save the matrix to later transform the test data
    saved_matrix = vectorizer.fit(list_of_words)
    return matrix.toarray(), saved_matrix

In [None]:
"""
make_gaussiannb_model: makes a gaussian naive bayes model using the training data
* inputs: x_train, y_train
* output: GaussianNB model
"""
def make_gaussiannb_model(x_train, y_train): 
    model = GaussianNB()
    num_rows, num_cols = x_train.shape
    i=0
    # Use partial_fit to save space 
    # Add 1000 rows of the training data into the model at a time
    while i+1000<=num_rows: 
        model.partial_fit(x_train[i:i+1000],y_train[i:i+1000], classes=[0,1])
        i+=1000
    # Fit the remaining data
    model.partial_fit(x_train[i:num_rows], y_train[i:num_rows], classes=[0,1])
    return model

In [None]:
"""
predict: makes prediction based on provided model and test data
* inputs: model: GaussianNB model from training data
          saved_matrix: to transform the test data to fit the model
* output: predicted y values based on test data
"""
def predict(model, saved_matrix, x_test):
    X = saved_matrix.transform(x_test).toarray()
    return model, model.predict(X)

In [None]:
"""
get_model: gets the GaussianNB model from training data 
* inputs: x_train, y_train
* outputs: model: GaussianNB model from training data
           saved matrix: saves how we fit training data to the model, will be
                         used to transform the test data to fit the model
"""
def get_model(x_train, y_train):
    matrix, saved_matrix = word_matrix(x_train)
    print(matrix)
    model = make_gaussiannb_model(matrix, y_train)
    return model, saved_matrix

Categories: 

Creating a dictionary of average star rating based for each category genre on training data. Then using this dictionary to predict star review for a product by looking at it's category. 

We ultimately did not end up using this because we felt using aggregation of star values for a single categorical genre wouldn't accurately give a great prediction, especially when it is not working in conjunction with more heavily influencial feature such as the Summary.

In [None]:
#Creates star rating for each category based on training data

def category_rating(df):
    categories_genre = df['categories']
    overall_points = df['overall']

    dictionary={}
    list_splits = []

    index =0
    index_split = 0

    #going through each categories
    while index < len(df.index):

        #within each categories, there are different genres
        list_splits = categories_genre[index].split(",")

        #splittings genre within categories, loop through each genre 
        while index_split < len(list_splits):

            #strip to make it look cleaner
            list_splits[index_split]= list_splits[index_split].strip("]")
            list_splits[index_split]= list_splits[index_split].strip("[")
            list_splits[index_split]= list_splits[index_split].strip()

            #if genre is not within the category, add it to the dictionary
            if list_splits[index_split] not in dictionary.keys():
                dictionary[list_splits[index_split]] = overall_points[index]

            #if genre is within the category, add the value with the old value and average it
            else:
                dictionary[list_splits[index_split]] = (dictionary[list_splits[index_split]] + overall_points[index])/2

           #increment
            index_split += 1

        #increment
        index +=1

        #reset this index to zero
        index_split = 0
    return dictionary


In [None]:
#returns dictionary that had compiled points based on product's category

def category_testing(df):
    index = 0
    index_split=0
    dictionary_test = {}

    while index < len(df.index):
        #within each categories, there are different genres
        categories = df['categories'].iloc[index]

        #split each categories
        list_splits = categories.split(",")

        #go through each category
        while index_split < len(list_splits):

            #strip to make it look cleaner
            list_splits[index_split]= list_splits[index_split].strip("]")
            list_splits[index_split]= list_splits[index_split].strip("[")
            list_splits[index_split]= list_splits[index_split].strip()

            if df['amazon-id'][index] not in dictionary_test:
                dictionary_test[df['amazon-id'][index]] = dictionary[list_splits[index_split]]
            else:
                dictionary_test[df['amazon-id'][index]] = (dictionary_test[df['amazon-id'][index]] + dictionary[list_splits[index_split]])/2

            index_split+=1

        index_split = 0
        index +=1


Root genre: 

Creating a dictionary of average star rating per root-genre based on training data. Then using dictionary to predict star rating of test data. 

Similarly to Categories, we did not believe that is a good predictor, especially when used alone and not in conjunction with other features with more predictive powers.

In [None]:
#create a dictionary based on genre and the average star rating it has

def root_genre_rating(df):
    genre_list={}
    index_genre =0
    
    root_genre = df['root-genre']

    #going through each categories
    while index_genre < len(df.index):

            #if genre is not within the category, add it to the dictionary
        if root_genre[index_genre] not in genre_list.keys():
            genre_list[root_genre[index_genre]] = overall_points[index_genre]

            #if genre is within the category, add the value with the old value and average it
        else:
            genre_list[root_genre[index_genre]] = (genre_list[root_genre[index_genre]] + overall_points[index_genre])/2

           #increment
        index_genre += 1
    return genre_list


In [None]:
def root_genre_testing(df):
    index = 0
    genre_test = {}

    while index < len(df.index):

        if df['amazon-id'][index] not in genre_test:
            genre_test[df['amazon-id'][index]] = genre_list[df['root-genre'][index]]
        else:
            genre_test[df['amazon-id'][index]] = (genre_test[df['amazon-id'][index]] + genre_list[df['root-genre'][index]])/2

        index +=1
    return genre_test

SalesRank:

We believed that salesRank would be a good indicator of customer preferences and incorporating this feature would help the model predict better - a product with a high sales rank would be more likely to be rated as "Awesome".

As with other omitted features, we believed that it would not be a good predictor on its own.

In [None]:
"""
shift_salesRank: The minimum salesRank in the dataset was 6. We wanted to make sure it started at 0 - this method
normalizes salesRank.
* inputs: None
* outputs: Returns new train dataset with salesRank normalized
"""
def shift_salesRank():
    print("Current Minimum SalesRank: ", train_data['salesRank'].min(), "Current Maximum SalesRank: ", train_data['salesRank'].max())
    
    arr = pd.factorize(train_data['salesRank'])
    
    train_new = train_data.assign(salesRank= arr[0])
    
    print("Current Minimum SalesRank: ", train_new['salesRank'].min(), "Current Maximum SalesRank: ", train_new['salesRank'].max())
    
    return train_new 

Related:
    
Related showed how connected products were to other products in the dataset. We aimed to perform network analysis on this column so that we could give more 'connected' reviews importance by weighting it more. 

Parsing the text was difficult and we settled on getting counts of also bought, bought together and bought after viewing keys for each observation. The idea was to gauge how connected products were based on the counts of these keys. However, the f1 score for the model used (Logistic Regression) was unsatisfactory.

In [None]:
"""
trim_curly: All entries in the related column start and end with curly braces parses it out from all strings in 'related'
* inputs: None
* outputs: Returns a list containing all strings in 'related'.
"""

def trim_curly():
    
    col_le = len(train_data['related'])
    i = 0
    l = []
    
    while (i < col_le):
        l.append(train_data['related'][i].replace('{', '').replace('}', ''))
        i= i+1
    
    return l
    
    pass

In [None]:
"""
related_dicts: Performs get_dict across all strings in 'related'.
* inputs: l: a list - or the related column.
* outputs: d: a list of dictionaries
"""
def related_dicts(l):
    res=[]
    
    for i in l:
        d= get_dict(i)
        res.append(d)
    
    return res
    
    pass

The methods below obtain counts for the three keys in the 'related' column for each row in the dataset.

In [None]:
"""
also_bought_c: Getting count of also bought key from a single observation.
* inputs: d: The dictionary obtained after converting a single string in 'related'.
* outputs: count: count of values mapped to the also bought key.
"""
def also_bought_c(d):
    count=0
    
    if 'also_bought' in d:
        count= len(d['also_bought'])
        
    return count

In [None]:
"""
buy_after_viewing_c: Getting count of buy after viewing key from a single observation.
* inputs: d: The dictionary obtained after converting a single string in related.
* outputs: count: count of values mapped to the bought after key.
"""
def buy_after_viewing_c(d):
    count=0
    
    if 'buy_after_viewing' in d:
        count= len(d['buy_after_viewing'])
        
    return count

In [None]:
"""
bought_together_c: Getting count of bought together key from a single observation.
* inputs: d: The dictionary obtained after converting a single string in 'related'.
* outputs: count: count of values mapped to the bought together key.
"""
def bought_together_c(d):
    count=0
    
    if 'bought_together' in d:
        count= len(d['bought_together'])
        
    return count

In [None]:
"""
ab_tot: Getting count of also bought key from all observations in 'related.'
* inputs: l: The list of dictionaries obtained by parsing 'related'.
* outputs: count: count of values mapped to the bought after key.
"""

def ab_tot(l):
    res=[]
    
    for i in l:
        res.append(also_bought_c(i))
        
    return res

    pass

In [None]:
"""
bt_tot: Getting count of bought together key from all observations in 'related.'
* inputs: l: The list of dictionaries obtained by parsing 'related'.
* outputs: count: count of values mapped to the bought after key.
"""

def bt_tot(l):
    res=[]
    
    for i in l:
        res.append(bought_together_c(i))
        
    return res

    pass

In [None]:
"""
bav_tot: Getting count of buy after viewing key from all observations in 'related.'
* inputs: l: The list of dictionaries obtained by parsing 'related'.
* outputs: count: count of values mapped to the bought after key.
"""
def bav_tot(l):
    res=[]
    
    for i in l:
        res.append(buy_after_viewing_c(i))
        
    return res

    pass

The implementation of the failed model is included below:

In [None]:
# Parsing the training data

train_new = shift_salesRank()
l= trim_curly()
res= related_dicts(l)

ab_count= ab_tot(res)
bt_count= bt_tot(res)
bav_count= bav_c(res)
train_data['ab-count']= ab_count
train_data['bt-count']= bt_count
train_data['bav-count']=bav_count

is_awesome = lambda x: 1 if np.mean(x) > 4.5 else 0
train_new = train_data.groupby('amazon-id').agg({'salesRank': 'mean', 'ab-count': 'mean', 'bt-count': 'mean','bav-count': 'mean','overall': is_awesome})
X_train, y_train = train_data[['salesRank', 'ab-count', 'bt-count', 'bav-count']], train_data['overall']

# Parsing the test data

test_data = pd.read_csv('./data/Test.csv')
ltest= trim_curly()
res1= related_dicts(ltest)
ab_count1= ab_tot(res1)
bt_count1= bt_tot(res1)
bav_count1= bav_c(res1)

test_data['ab-count']= ab_count1
test_data['bt-count']= bt_count1
test_data['bav-count']=bav_count1

test_new = test_data.groupby('amazon-id').agg({'salesRank': 'mean', 'ab-count': 'mean', 'bt-count': 'mean','bav-count': 'mean'})

# Fit model

from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train, y_train)

# Making predictions
preds = model.predict(X_test)
output = pd.DataFrame({'amazon-id': X_test.index, 'Awesome': preds})

# Model evaluation
from sklearn.model_selection import cross_val_score
cross_val_score(model, X_train, y_train, cv=3, scoring="accuracy")

# The accuracy of the model for three cross validations:
# array([0.71400027, 0.71403111, 0.71405039])




In [None]:
#Using bayesian optimization

#space = {'loss': loss, 'C': cs, 'dual': dual, 'penalty': penalty}
#from hyperopt import fmin, tpe
#from sklearn.model_selection import cross_val_score

# Set up objective function
#def objective(params):
#    params = {'loss': params['loss'], 'C': params['C'], 'dual': params['dual'], 'penalty': params['penalty']}
#    svc_clf = LinearSVC(**params) 
#    best_score = cross_val_score(svc_clf, x_train, y_train, scoring='accuracy', cv=3, n_jobs=4).mean()
#    loss = 1 - best_score
#    return loss

# Run the algorithm
#best = fmin(fn=objective,space=space, max_evals=20, rstate=np.random.RandomState(42), algo=tpe.suggest)
#print(best)