In [25]:
# Importing Libraries
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix  
import pandas as pd
import sklearn.metrics as metrics
from sklearn.metrics import f1_score
import numpy as np
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score 
from sklearn.ensemble import AdaBoostClassifier
from sklearn.utils import resample
from sklearn.svm import SVC  
from sklearn.neighbors import KNeighborsClassifier

In [40]:
# read csv into pandas dataframe
df = pd.read_csv('withdays.csv')
df.loc[(df['year'].isin([2018]))]['category'].unique()
df['category'] = df['category'].map({
'Graphic Novels & Comics' : 0,
'Young Adult Fiction' : 1,
'Memoir & Autobiography' : 2,
'Picture Books' : 3,
'Romance' : 4,
'Humor' : 5,
'Poetry' : 6,
'Horror' : 7,
'Young Adult Fantasy' : 8,
'Science Fiction' : 9,
"Middle Grade & Children's" : 10,
'History & Biography' : 11,
'Nonfiction' : 12,
'Fantasy': 13,
'Mystery & Thriller' : 14,
'Historical Fiction' : 15,
'Debut Goodreads Author' : 16,
'Fiction' : 17,
'Paranormal Fantasy' : 18,
'Food & Cookbooks' : 19,
'Business Books' : 20,
'Science & Technology' : 21,
'Goodreads Author' : 22,
'Debut Novel' : 23,
'Best of the Best' : 24,
'Travel & Outdoors' : 25,
'Food & Cooking' : 19,
'Favorite Book of 2011' : 24,
'Debut Author' : 26
})
df = df.loc[(df['category'].isin([17]))]
df = df.replace([np.inf, -np.inf], 32768)
df = df.replace([np.nan], 0)

In [41]:
dups = df[df.duplicated(['name', 'year', 'category'], keep=False)]

In [67]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
X = df[features]
y = df['winner']
X_norm = MinMaxScaler().fit_transform(X)
chi_selector = SelectKBest(chi2, k=8)
chi_selector.fit(X_norm, y)
chi_support = chi_selector.get_support()
chi_feature = X.loc[:,chi_support].columns.tolist()
print(chi_feature)

['num 4 stars', 'num 5 stars', 'average_rating_w', 'num_ratings', 'num_reviews', 'daystocom', 'avgrank_NYTime', 'avgrank_NYTime']


In [65]:
features = ['num 4 stars', 'num 5 stars', 'average_rating_w', 'num_ratings', 'num_reviews', 'daystocom', 'times_appred_NYTime', 'avgrank_NYTime', 'avgrank_NYTime', 'listnames']
data_test = df.loc[(df['year'].isin([2018]))]
X_test = data_test[features]
y_test = data_test['winner']
data_test['category'].unique()

array([17])

In [44]:
# Use 2011 - 2017 as train data and 2018 as test data
logreg = LogisticRegression(solver='liblinear', class_weight = 'balanced')
for year in [2011, 2012, 2013, 2014, 2015, 2016, 2017]:
    print(year)
    data_temp = df.loc[~(df['year'].isin([2018,2019]))]
    data_train = data_temp.loc[(data_temp['year'].isin([year]))]
    X_train = data_train[features]
    y_train = data_train['winner']
    if(len(X_train) != 0) : 
        print('fitting')
        logreg.fit(X_train, y_train)

2011
fitting
2012
fitting
2013
fitting
2014
fitting
2015
fitting
2016
fitting
2017
fitting


In [45]:
y_pred = logreg.predict(X_test)
predictions = pd.DataFrame(y_pred)
predictions[0].value_counts()

0    16
1     4
Name: 0, dtype: int64

In [46]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.94      0.79      0.86        19
           1       0.00      0.00      0.00         1

    accuracy                           0.75        20
   macro avg       0.47      0.39      0.43        20
weighted avg       0.89      0.75      0.81        20



In [47]:
tn, fp, fn, tp = confusion_matrix(y_test,y_pred).ravel()
print(tn, fp, fn, tp)

15 4 1 0


In [48]:
print('Accuracy: ', metrics.accuracy_score(y_test, y_pred))
#print('F1 score:', metrics.f1_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred)))

Accuracy:  0.75


In [49]:
# Random Forest Classifier
rfc = RandomForestClassifier(n_estimators=10, class_weight = 'balanced')
for year in [2011, 2012, 2013, 2014, 2015, 2016, 2017]:
    data_temp = df.loc[~(df['year'].isin([2018,2019]))]
    data_train = data_temp.loc[(data_temp['year'].isin([year]))]
    X_train = data_train[features]
    y_train = data_train['winner']
    if(len(X_train) != 0) : 
        rfc.fit(X_train, y_train)

In [50]:
rfc_pred = rfc.predict(X_test)
metrics.accuracy_score(y_test, rfc_pred)
print(classification_report(y_test, rfc_pred))

              precision    recall  f1-score   support

           0       0.95      1.00      0.97        19
           1       0.00      0.00      0.00         1

    accuracy                           0.95        20
   macro avg       0.47      0.50      0.49        20
weighted avg       0.90      0.95      0.93        20



  'precision', 'predicted', average, warn_for)


In [51]:
tn, fp, fn, tp = confusion_matrix(y_test, rfc_pred).ravel()
print(tn, fp, fn, tp)

19 0 1 0


In [52]:
# Gaussian Naive Bayes
gnb = GaussianNB()
for year in [2011, 2012, 2013, 2014, 2015, 2016, 2017]:
    data_temp = df.loc[~(df['year'].isin([2018,2019]))]
    data_train = data_temp.loc[(data_temp['year'].isin([year]))]
    X_train = data_train[features]
    y_train = data_train['winner']
    if(len(X_train) != 0) : 
        gnb.fit(X_train, y_train)

In [53]:
y_pred = gnb.predict(X_test)
predictions = pd.DataFrame(y_pred)
predictions[0].value_counts()

0    20
Name: 0, dtype: int64

In [54]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      1.00      0.97        19
           1       0.00      0.00      0.00         1

    accuracy                           0.95        20
   macro avg       0.47      0.50      0.49        20
weighted avg       0.90      0.95      0.93        20



  'precision', 'predicted', average, warn_for)


In [55]:
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print(tn, fp, fn, tp)

19 0 1 0


In [56]:
model_GB = GradientBoostingClassifier(n_estimators=1000)
for year in [2011, 2012, 2013, 2014, 2015, 2016, 2017]:
    data_temp = df.loc[~(df['year'].isin([2018,2019]))]
    data_train = data_temp.loc[(data_temp['year'].isin([year]))]
    X_train = data_train[features]
    y_train = data_train['winner']
    if(len(X_train) != 0) : 
        model_GB.fit(X_train , y_train)
y_pred = model_GB.predict(X_test)
target_names = ['nominee', 'winner']
print(classification_report(y_test, y_pred, target_names = target_names))

              precision    recall  f1-score   support

     nominee       0.94      0.79      0.86        19
      winner       0.00      0.00      0.00         1

    accuracy                           0.75        20
   macro avg       0.47      0.39      0.43        20
weighted avg       0.89      0.75      0.81        20



In [57]:
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print(tn, fp, fn, tp)

15 4 1 0


In [58]:
# AdaBoost classifier builds a strong classifier by combining multiple
# poorly performing classifiers to get high accuracy strong classifier.
model_ad = AdaBoostClassifier()
for year in [2011, 2012, 2013, 2014, 2015, 2016, 2017]:
    data_temp = df.loc[~(df['year'].isin([2018,2019]))]
    data_train = data_temp.loc[(data_temp['year'].isin([year]))]
    X_train = data_train[features]
    y_train = data_train['winner']
    if(len(X_train) != 0) : 
        model_ad.fit(X_train , y_train)
        
y_pred = model_ad.predict(X_test)
target_names = ['nominee', 'winner']
print(classification_report(y_test, y_pred, target_names = target_names))

              precision    recall  f1-score   support

     nominee       0.94      0.79      0.86        19
      winner       0.00      0.00      0.00         1

    accuracy                           0.75        20
   macro avg       0.47      0.39      0.43        20
weighted avg       0.89      0.75      0.81        20



In [59]:
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print(tn, fp, fn, tp)

15 4 1 0


In [60]:
knn = KNeighborsClassifier()
for year in [2011, 2012, 2013, 2014, 2015, 2016, 2017]:
    data_temp = df.loc[~(df['year'].isin([2018,2019]))]
    data_train = data_temp.loc[(data_temp['year'].isin([year]))]
    X_train = data_train[features]
    y_train = data_train['winner']
    if(len(X_train) != 0) : 
        knn.fit(X_train , y_train)
        
y_pred = knn.predict(X_test)
target_names = ['nominee', 'winner']
print(classification_report(y_test, y_pred, target_names = target_names))

              precision    recall  f1-score   support

     nominee       0.95      1.00      0.97        19
      winner       0.00      0.00      0.00         1

    accuracy                           0.95        20
   macro avg       0.47      0.50      0.49        20
weighted avg       0.90      0.95      0.93        20



  'precision', 'predicted', average, warn_for)


In [61]:
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print(tn, fp, fn, tp)

19 0 1 0


In [62]:
from sklearn import linear_model
clf = linear_model.SGDClassifier(max_iter=1000, tol=1e-3)
for year in [2011, 2012, 2013, 2014, 2015, 2016, 2017]:
    data_temp = df.loc[~(df['year'].isin([2018,2019]))]
    data_train = data_temp.loc[(data_temp['year'].isin([year]))]
    X_train = data_train[features]
    y_train = data_train['winner']
    if(len(X_train) != 0) : 
        clf.fit(X_train , y_train)
        
y_pred = clf.predict(X_test)
target_names = ['nominee', 'winner']
print(classification_report(y_test, y_pred, target_names = target_names))

              precision    recall  f1-score   support

     nominee       0.95      1.00      0.97        19
      winner       0.00      0.00      0.00         1

    accuracy                           0.95        20
   macro avg       0.47      0.50      0.49        20
weighted avg       0.90      0.95      0.93        20



  'precision', 'predicted', average, warn_for)
