In [453]:
# Importing Libraries
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix  
import pandas as pd
import sklearn.metrics as metrics
from sklearn.metrics import f1_score
import numpy as np
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score 
from sklearn.ensemble import AdaBoostClassifier
from sklearn.utils import resample
from sklearn.svm import SVC  
from sklearn.neighbors import KNeighborsClassifier

In [454]:
# read csv into pandas dataframe
df = pd.read_csv('goodreads_output.csv')
df.loc[(df['year'].isin([2018]))]['category'].unique()
df['category'] = df['category'].map({
'Graphic Novels & Comics' : 0,
'Young Adult Fiction' : 1,
'Memoir & Autobiography' : 2,
'Picture Books' : 3,
'Romance' : 4,
'Humor' : 5,
'Poetry' : 6,
'Horror' : 7,
'Young Adult Fantasy' : 8,
'Science Fiction' : 9,
"Middle Grade & Children's" : 10,
'History & Biography' : 11,
'Nonfiction' : 12,
'Fantasy': 13,
'Mystery & Thriller' : 14,
'Historical Fiction' : 15,
'Debut Goodreads Author' : 16,
'Fiction' : 17,
'Paranormal Fantasy' : 18,
'Food & Cookbooks' : 19,
'Business Books' : 20,
'Science & Technology' : 21,
'Goodreads Author' : 22,
'Debut Novel' : 23,
'Best of the Best' : 24,
'Travel & Outdoors' : 25,
'Food & Cooking' : 19,
'Favorite Book of 2011' : 24,
'Debut Author' : 26
})
#df = df.loc[(df['category'].isin([21]))]
df

Unnamed: 0,year,name,writer,category,winner,num 1 stars,num 2 stars,num 3 stars,num 4 stars,num 5 stars,average rating
0,2013,"Avatar: The Last Airbender: The Search, Part 1","['Gene Luen Yang', 'Bryan Konietzko', 'Michael...",0,0,0.027397,0.010274,0.106164,0.325342,0.530822,4.321918
1,2013,"The Walking Dead, Vol. 18: What Comes After","['Robert Kirkman', 'Charlie Adlard', 'Cliff Ra...",0,0,0.003344,0.023411,0.224080,0.431438,0.317726,4.036789
2,2013,"Batman, Volume 2: The City of Owls","['Scott Snyder', 'Greg Capullo', 'Jonathan Gla...",0,0,0.000000,0.007380,0.114391,0.365314,0.512915,4.383764
3,2013,Kick-Ass 2,"['Mark Millar', 'John Romita Jr.', 'Tom Palmer...",0,0,0.026667,0.063333,0.306667,0.360000,0.243333,3.730000
4,2013,"American Vampire, Vol. 5","['Scott Snyder', 'Rafael Albuquerque', 'Dustin...",0,0,0.003333,0.023333,0.103333,0.440000,0.430000,4.270000
5,2013,Raven Girl,['Audrey Niffenegger'],0,0,0.073826,0.184564,0.382550,0.234899,0.124161,3.151007
6,2013,"Sweet Tooth, Volume 6: Wild Game","['Jeff Lemire', 'José Villarrubia', 'Carlos M....",0,0,0.003333,0.036667,0.113333,0.373333,0.473333,4.276667
7,2013,Blue Bloods: The Graphic Novel,"['Melissa de la Cruz', 'Robert Venditti', 'Ali...",0,0,0.020478,0.068259,0.156997,0.252560,0.501706,4.146758
8,2013,The Infernal Devices: Clockwork Prince,"['Cassandra Clare', 'Hye-Kyung Baek']",0,0,0.006757,0.013514,0.084459,0.263514,0.631757,4.500000
9,2013,"Fables, Vol. 18: Cubs in Toyland","['Bill Willingham', 'Mark Buckingham', 'Gene Ha']",0,0,0.000000,0.016722,0.193980,0.444816,0.344482,4.117057


In [455]:
dups = df[df.duplicated(['name', 'year', 'category'], keep=False)]

In [456]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
X_norm = MinMaxScaler().fit_transform(X)
chi_selector = SelectKBest(chi2, k=5)
chi_selector.fit(X_norm, y)
chi_support = chi_selector.get_support()
chi_feature = X.loc[:,chi_support].columns.tolist()
print(chi_feature)

['num 1 stars', 'num 3 stars', 'num 4 stars', 'num 5 stars', 'average rating']


In [457]:
features = ['num 1 stars', 'num 3 stars', 'num 4 stars', 'num 5 stars', 'average rating']
data_test = df.loc[(df['year'].isin([2018, 2017]))]
X_test = data_test[features]
y_test = data_test['winner']
data_test['category'].unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       21, 17, 19, 26, 24])

In [458]:
# Use 2011 - 2017 as train data and 2018 as test data
logreg = LogisticRegression(solver='liblinear', class_weight = 'balanced')
for year in [2011, 2012, 2013, 2014, 2015, 2016]:
    print(year)
    data_temp = df.loc[~(df['year'].isin([2018,2019]))]
    data_train = data_temp.loc[(data_temp['year'].isin([year]))]
    X_train = data_train[features]
    y_train = data_train['winner']
    if(len(X_train) != 0) : 
        print('fitting')
        logreg.fit(X_train, y_train)

2011
fitting
2012
fitting
2013
fitting
2014
fitting
2015
fitting
2016
fitting


In [459]:
y_pred = logreg.predict(X_test)
predictions = pd.DataFrame(y_pred)
predictions[0].value_counts()

0    459
1    365
Name: 0, dtype: int64

In [460]:
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       0.96      0.56      0.71       783
          1       0.06      0.51      0.10        41

avg / total       0.91      0.56      0.68       824



In [461]:
tn, fp, fn, tp = confusion_matrix(y_test,y_pred).ravel()
print(tn, fp, fn, tp)

439 344 20 21


In [462]:
print('Accuracy: ', metrics.accuracy_score(y_test, y_pred))
#print('F1 score:', metrics.f1_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred)))

Accuracy:  0.558252427184466


In [382]:
# Random Forest Classifier
rfc = RandomForestClassifier(n_estimators=10, class_weight = 'balanced')
for year in [2011, 2012, 2013, 2014, 2015, 2016, 2017]:
    data_temp = df.loc[~(df['year'].isin([2018,2019]))]
    data_train = data_temp.loc[(data_temp['year'].isin([year]))]
    X_train = data_train[features]
    y_train = data_train['winner']
    if(len(X_train) != 0) : 
        rfc.fit(X_train, y_train)

In [383]:
rfc_pred = rfc.predict(X_test)
metrics.accuracy_score(y_test, rfc_pred)
print(classification_report(y_test, rfc_pred))

             precision    recall  f1-score   support

          0       0.95      0.99      0.97       403
          1       0.14      0.05      0.07        21

avg / total       0.91      0.94      0.92       424



In [384]:
tn, fp, fn, tp = confusion_matrix(y_test, rfc_pred).ravel()
print(tn, fp, fn, tp)

397 6 20 1


In [385]:
# Gaussian Naive Bayes
gnb = GaussianNB()
for year in [2011, 2012, 2013, 2014, 2015, 2016, 2017]:
    data_temp = df.loc[~(df['year'].isin([2018,2019]))]
    data_train = data_temp.loc[(data_temp['year'].isin([year]))]
    X_train = data_train[features]
    y_train = data_train['winner']
    if(len(X_train) != 0) : 
        gnb.fit(X_train, y_train)

In [386]:
y_pred = gnb.predict(X_test)
predictions = pd.DataFrame(y_pred)
predictions[0].value_counts()

0    416
1      8
Name: 0, dtype: int64

In [387]:
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       0.95      0.98      0.96       403
          1       0.00      0.00      0.00        21

avg / total       0.90      0.93      0.92       424



In [393]:
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print(tn, fp, fn, tp)

391 12 20 1


In [397]:
model_GB = GradientBoostingClassifier(n_estimators=1000)
for year in [2011, 2012, 2013, 2014, 2015, 2016, 2017]:
    data_temp = df.loc[~(df['year'].isin([2018,2019]))]
    data_train = data_temp.loc[(data_temp['year'].isin([year]))]
    X_train = data_train[features]
    y_train = data_train['winner']
    if(len(X_train) != 0) : 
        model_GB.fit(X_train , y_train)
y_pred = model_GB.predict(X_test)
target_names = ['nominee', 'winner']
print(classification_report(y_test, y_pred, target_names = target_names))

             precision    recall  f1-score   support

    nominee       0.95      0.97      0.96       403
     winner       0.08      0.05      0.06        21

avg / total       0.91      0.93      0.92       424



In [394]:
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print(tn, fp, fn, tp)

391 12 20 1


In [400]:
# AdaBoost classifier builds a strong classifier by combining multiple
# poorly performing classifiers to get high accuracy strong classifier.
model_ad = AdaBoostClassifier()
for year in [2011, 2012, 2013, 2014, 2015, 2016, 2017]:
    data_temp = df.loc[~(df['year'].isin([2018,2019]))]
    data_train = data_temp.loc[(data_temp['year'].isin([year]))]
    X_train = data_train[features]
    y_train = data_train['winner']
    if(len(X_train) != 0) : 
        model_ad.fit(X_train , y_train)
        
y_pred = model_ad.predict(X_test)
target_names = ['nominee', 'winner']
print(classification_report(y_test, y_pred, target_names = target_names))

             precision    recall  f1-score   support

    nominee       0.95      0.98      0.96       403
     winner       0.10      0.05      0.06        21

avg / total       0.91      0.93      0.92       424



In [401]:
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print(tn, fp, fn, tp)

394 9 20 1


In [420]:
knn = KNeighborsClassifier()
for year in [2011, 2012, 2013, 2014, 2015, 2016, 2017]:
    data_temp = df.loc[~(df['year'].isin([2018,2019]))]
    data_train = data_temp.loc[(data_temp['year'].isin([year]))]
    X_train = data_train[features]
    y_train = data_train['winner']
    if(len(X_train) != 0) : 
        knn.fit(X_train , y_train)
        
y_pred = knn.predict(X_test)
target_names = ['nominee', 'winner']
print(classification_report(y_test, y_pred, target_names = target_names))

             precision    recall  f1-score   support

    nominee       0.95      1.00      0.97       403
     winner       0.00      0.00      0.00        21

avg / total       0.90      0.95      0.93       424



  'precision', 'predicted', average, warn_for)


In [421]:
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print(tn, fp, fn, tp)

403 0 21 0


In [432]:
from sklearn import linear_model
clf = linear_model.SGDClassifier(max_iter=1000, tol=1e-3)
for year in [2011, 2012, 2013, 2014, 2015, 2016, 2017]:
    data_temp = df.loc[~(df['year'].isin([2018,2019]))]
    data_train = data_temp.loc[(data_temp['year'].isin([year]))]
    X_train = data_train[features]
    y_train = data_train['winner']
    if(len(X_train) != 0) : 
        clf.fit(X_train , y_train)
        
y_pred = clf.predict(X_test)
target_names = ['nominee', 'winner']
print(classification_report(y_test, y_pred, target_names = target_names))

             precision    recall  f1-score   support

    nominee       0.95      1.00      0.97       403
     winner       0.00      0.00      0.00        21

avg / total       0.90      0.95      0.93       424



  'precision', 'predicted', average, warn_for)
