In [3]:
# Importing Libraries
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix  
import pandas as pd
import sklearn.metrics as metrics
from sklearn.metrics import f1_score
import numpy as np
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score 
from sklearn.ensemble import AdaBoostClassifier
from sklearn.utils import resample
from sklearn.svm import SVC  
from sklearn.neighbors import KNeighborsClassifier

In [4]:
# read csv into pandas dataframe
df = pd.read_csv('goodreads_output.csv')
df.loc[(df['year'].isin([2018]))]['category'].unique()
df['category'] = df['category'].map({
'Graphic Novels & Comics' : 0,
'Young Adult Fiction' : 1,
'Memoir & Autobiography' : 2,
'Picture Books' : 3,
'Romance' : 4,
'Humor' : 5,
'Poetry' : 6,
'Horror' : 7,
'Young Adult Fantasy' : 8,
'Science Fiction' : 9,
"Middle Grade & Children's" : 10,
'History & Biography' : 11,
'Nonfiction' : 12,
'Fantasy': 13,
'Mystery & Thriller' : 14,
'Historical Fiction' : 15,
'Debut Goodreads Author' : 16,
'Fiction' : 17,
'Paranormal Fantasy' : 18,
'Food & Cookbooks' : 19,
'Business Books' : 20,
'Science & Technology' : 21,
'Goodreads Author' : 22,
'Debut Novel' : 23,
'Best of the Best' : 24,
'Travel & Outdoors' : 25,
'Food & Cooking' : 19,
'Favorite Book of 2011' : 24,
'Debut Author' : 26
})
df = df.loc[(df['category'].isin([17]))]
df

Unnamed: 0,year,name,writer,category,winner,num 1 stars,num 2 stars,num 3 stars,num 4 stars,num 5 stars,average rating,average_rating_w,num_ratings,num_reviews
340,2013,The House Girl,['Tara Conklin'],17,0,0.010033,0.033445,0.290970,0.464883,0.200669,3.812709,3.76,75029,7891
341,2013,Orphan Train,['Christina Baker Kline'],17,0,0.006689,0.023411,0.137124,0.508361,0.324415,4.120401,4.09,417138,35762
342,2013,The Storyteller,['Jodi Picoult'],17,0,0.006711,0.023490,0.097315,0.382550,0.489933,4.325503,3.97,3482177,209189
343,2013,The Rosie Project,['Graeme Simsion'],17,0,0.003367,0.043771,0.127946,0.454545,0.370370,4.144781,3.92,525116,51377
344,2013,And the Mountains Echoed,['Khaled Hosseini'],17,1,0.010101,0.030303,0.259259,0.373737,0.326599,3.976431,4.29,3615182,152304
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3242,2016,Shelter,['Jung Yun'],17,0,0.014599,0.076642,0.193431,0.474453,0.240876,3.850365,3.73,6795,1146
3243,2016,Silver Threads,['Bette Lee Crosby'],17,0,0.015873,0.047619,0.111111,0.206349,0.619048,4.365079,4.23,16847,4058
3244,2016,Another Brooklyn,['Jacqueline Woodson'],17,0,0.006689,0.036789,0.140468,0.458194,0.357860,4.123746,4.04,181528,28868
3245,2016,Everybody's Fool,['Richard Russo'],17,0,0.012195,0.021341,0.146341,0.432927,0.387195,4.161585,3.90,237228,19535


In [5]:
dups = df[df.duplicated(['name', 'year', 'category'], keep=False)]

In [8]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
X = df[features]
y = df['winner']
X_norm = MinMaxScaler().fit_transform(X)
chi_selector = SelectKBest(chi2, k=3)
chi_selector.fit(X_norm, y)
chi_support = chi_selector.get_support()
chi_feature = X.loc[:,chi_support].columns.tolist()
print(chi_feature)

['average_rating_w', 'num_ratings', 'num_reviews']


In [7]:
features = ['num 4 stars', 'num 5 stars', 'average_rating_w', 'num_ratings', 'num_reviews']
data_test = df.loc[(df['year'].isin([2018]))]
X_test = data_test[features]
y_test = data_test['winner']
data_test['category'].unique()

array([17])

In [131]:
# Use 2011 - 2017 as train data and 2018 as test data
logreg = LogisticRegression(solver='liblinear', class_weight = 'balanced')
for year in [2011, 2012, 2013, 2014, 2015, 2016, 2017]:
    print(year)
    data_temp = df.loc[~(df['year'].isin([2018,2019]))]
    data_train = data_temp.loc[(data_temp['year'].isin([year]))]
    X_train = data_train[features]
    y_train = data_train['winner']
    if(len(X_train) != 0) : 
        print('fitting')
        logreg.fit(X_train, y_train)

2011
fitting
2012
fitting
2013
fitting
2014
fitting
2015
fitting
2016
fitting
2017
fitting


In [132]:
y_pred = logreg.predict(X_test)
predictions = pd.DataFrame(y_pred)
predictions[0].value_counts()

0    13
1     7
Name: 0, dtype: int64

In [133]:
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       1.00      0.68      0.81        19
          1       0.14      1.00      0.25         1

avg / total       0.96      0.70      0.78        20



In [134]:
tn, fp, fn, tp = confusion_matrix(y_test,y_pred).ravel()
print(tn, fp, fn, tp)

13 6 0 1


In [111]:
print('Accuracy: ', metrics.accuracy_score(y_test, y_pred))
#print('F1 score:', metrics.f1_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred)))

Accuracy:  0.7


In [112]:
# Random Forest Classifier
rfc = RandomForestClassifier(n_estimators=10, class_weight = 'balanced')
for year in [2011, 2012, 2013, 2014, 2015, 2016, 2017]:
    data_temp = df.loc[~(df['year'].isin([2018,2019]))]
    data_train = data_temp.loc[(data_temp['year'].isin([year]))]
    X_train = data_train[features]
    y_train = data_train['winner']
    if(len(X_train) != 0) : 
        rfc.fit(X_train, y_train)

In [113]:
rfc_pred = rfc.predict(X_test)
metrics.accuracy_score(y_test, rfc_pred)
print(classification_report(y_test, rfc_pred))

             precision    recall  f1-score   support

          0       0.95      1.00      0.97        19
          1       0.00      0.00      0.00         1

avg / total       0.90      0.95      0.93        20



  'precision', 'predicted', average, warn_for)


In [114]:
tn, fp, fn, tp = confusion_matrix(y_test, rfc_pred).ravel()
print(tn, fp, fn, tp)

19 0 1 0


In [115]:
# Gaussian Naive Bayes
gnb = GaussianNB()
for year in [2011, 2012, 2013, 2014, 2015, 2016, 2017]:
    data_temp = df.loc[~(df['year'].isin([2018,2019]))]
    data_train = data_temp.loc[(data_temp['year'].isin([year]))]
    X_train = data_train[features]
    y_train = data_train['winner']
    if(len(X_train) != 0) : 
        gnb.fit(X_train, y_train)

In [116]:
y_pred = gnb.predict(X_test)
predictions = pd.DataFrame(y_pred)
predictions[0].value_counts()

0    20
Name: 0, dtype: int64

In [117]:
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       0.95      1.00      0.97        19
          1       0.00      0.00      0.00         1

avg / total       0.90      0.95      0.93        20



  'precision', 'predicted', average, warn_for)


In [118]:
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print(tn, fp, fn, tp)

19 0 1 0


In [119]:
model_GB = GradientBoostingClassifier(n_estimators=1000)
for year in [2011, 2012, 2013, 2014, 2015, 2016, 2017]:
    data_temp = df.loc[~(df['year'].isin([2018,2019]))]
    data_train = data_temp.loc[(data_temp['year'].isin([year]))]
    X_train = data_train[features]
    y_train = data_train['winner']
    if(len(X_train) != 0) : 
        model_GB.fit(X_train , y_train)
y_pred = model_GB.predict(X_test)
target_names = ['nominee', 'winner']
print(classification_report(y_test, y_pred, target_names = target_names))

             precision    recall  f1-score   support

    nominee       0.95      1.00      0.97        19
     winner       0.00      0.00      0.00         1

avg / total       0.90      0.95      0.93        20



  'precision', 'predicted', average, warn_for)


In [120]:
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print(tn, fp, fn, tp)

19 0 1 0


In [121]:
# AdaBoost classifier builds a strong classifier by combining multiple
# poorly performing classifiers to get high accuracy strong classifier.
model_ad = AdaBoostClassifier()
for year in [2011, 2012, 2013, 2014, 2015, 2016, 2017]:
    data_temp = df.loc[~(df['year'].isin([2018,2019]))]
    data_train = data_temp.loc[(data_temp['year'].isin([year]))]
    X_train = data_train[features]
    y_train = data_train['winner']
    if(len(X_train) != 0) : 
        model_ad.fit(X_train , y_train)
        
y_pred = model_ad.predict(X_test)
target_names = ['nominee', 'winner']
print(classification_report(y_test, y_pred, target_names = target_names))

             precision    recall  f1-score   support

    nominee       0.95      1.00      0.97        19
     winner       0.00      0.00      0.00         1

avg / total       0.90      0.95      0.93        20



  'precision', 'predicted', average, warn_for)


In [122]:
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print(tn, fp, fn, tp)

19 0 1 0


In [123]:
knn = KNeighborsClassifier()
for year in [2011, 2012, 2013, 2014, 2015, 2016, 2017]:
    data_temp = df.loc[~(df['year'].isin([2018,2019]))]
    data_train = data_temp.loc[(data_temp['year'].isin([year]))]
    X_train = data_train[features]
    y_train = data_train['winner']
    if(len(X_train) != 0) : 
        knn.fit(X_train , y_train)
        
y_pred = knn.predict(X_test)
target_names = ['nominee', 'winner']
print(classification_report(y_test, y_pred, target_names = target_names))

             precision    recall  f1-score   support

    nominee       0.95      1.00      0.97        19
     winner       0.00      0.00      0.00         1

avg / total       0.90      0.95      0.93        20



  'precision', 'predicted', average, warn_for)


In [124]:
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print(tn, fp, fn, tp)

19 0 1 0


In [125]:
from sklearn import linear_model
clf = linear_model.SGDClassifier(max_iter=1000, tol=1e-3)
for year in [2011, 2012, 2013, 2014, 2015, 2016, 2017]:
    data_temp = df.loc[~(df['year'].isin([2018,2019]))]
    data_train = data_temp.loc[(data_temp['year'].isin([year]))]
    X_train = data_train[features]
    y_train = data_train['winner']
    if(len(X_train) != 0) : 
        clf.fit(X_train , y_train)
        
y_pred = clf.predict(X_test)
target_names = ['nominee', 'winner']
print(classification_report(y_test, y_pred, target_names = target_names))

             precision    recall  f1-score   support

    nominee       0.95      1.00      0.97        19
     winner       0.00      0.00      0.00         1

avg / total       0.90      0.95      0.93        20



  'precision', 'predicted', average, warn_for)
