In [334]:
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import pandas as pd
import sklearn.metrics as metrics
from sklearn.metrics import f1_score
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score 

In [335]:
# Read goodreads_output.csv into a DataFrame and replace the textual award
# category with an integer code.
#
# Two names deliberately share a code with another entry -- presumably they
# are alternate names for the same award (TODO confirm against the data):
#   'Food & Cooking'        -> 19 (same as 'Food & Cookbooks')
#   'Favorite Book of 2011' -> 24 (same as 'Best of the Best')
CATEGORY_CODES = {
    'Graphic Novels & Comics': 0,
    'Young Adult Fiction': 1,
    'Memoir & Autobiography': 2,
    'Picture Books': 3,
    'Romance': 4,
    'Humor': 5,
    'Poetry': 6,
    'Horror': 7,
    'Young Adult Fantasy': 8,
    'Science Fiction': 9,
    "Middle Grade & Children's": 10,
    'History & Biography': 11,
    'Nonfiction': 12,
    'Fantasy': 13,
    'Mystery & Thriller': 14,
    'Historical Fiction': 15,
    'Debut Goodreads Author': 16,
    'Fiction': 17,
    'Paranormal Fantasy': 18,
    'Food & Cookbooks': 19,
    'Business Books': 20,
    'Science & Technology': 21,
    'Goodreads Author': 22,
    'Debut Novel': 23,
    'Best of the Best': 24,
    'Travel & Outdoors': 25,
    'Food & Cooking': 19,
    'Favorite Book of 2011': 24,
    'Debut Author': 26,
}

df = pd.read_csv('goodreads_output.csv')
# Series.map leaves NaN for any category name that is not a key of
# CATEGORY_CODES, so a new/unexpected category shows up as a missing value.
df['category'] = df['category'].map(CATEGORY_CODES)
# Show a preview rather than dumping the whole frame into the notebook.
df.head()

In [336]:
# Rows that share the same (name, year, category) with at least one other row.
# keep=False flags every member of a duplicated group, not only the repeats.
duplicate_key = ['name', 'year', 'category']
is_duplicate = df.duplicated(subset=duplicate_key, keep=False)
dups = df[is_duplicate]

In [354]:
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import MinMaxScaler

# NOTE(review): X and y are not defined in any cell visible here -- this cell
# relies on kernel state from elsewhere; confirm where they are built.

# Rescale every feature with MinMaxScaler before scoring with chi2.
scaler = MinMaxScaler()
X_norm = scaler.fit_transform(X)

# Keep the 5 columns with the highest chi-squared score against y.
chi_selector = SelectKBest(score_func=chi2, k=5)
chi_selector.fit(X_norm, y)

# Boolean mask over the columns of X -> names of the selected columns.
chi_support = chi_selector.get_support()
chi_feature = X.columns[chi_support].tolist()
print(chi_feature)

['num 1 stars', 'num 3 stars', 'num 4 stars', 'num 5 stars', 'average rating']


In [355]:
# Model inputs: the same five columns printed by the chi-squared selection cell.
features = [
    'num 1 stars',
    'num 3 stars',
    'num 4 stars',
    'num 5 stars',
    'average rating',
]

# Hold out the 2018 rows as the test set; 'winner' is the label column.
is_2018 = df['year'].isin([2018])
data_test = df.loc[is_2018]
X_test = data_test[features]
y_test = data_test['winner']

# Category codes present in the test year.
data_test['category'].unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8, 26,  9, 10, 11, 12, 13, 14, 15,
       24, 21, 17, 19])

In [356]:
# Use 2011 - 2017 as train data and 2018 as test data
logreg = LogisticRegression(solver='liblinear', class_weight = 'balanced')
for year in [2011, 2012, 2013, 2014, 2015, 2016, 2017]:
    print(year)
    data_temp = df.loc[~(df['year'].isin([2018,2019]))]
    data_train = data_temp.loc[(data_temp['year'].isin([year]))]
    X_train = data_train[features]
    y_train = data_train['winner']
    if(len(X_train) != 0) : 
        print('fitting')
        logreg.fit(X_train, y_train)

2011
fitting
2012
fitting
2013
fitting
2014
fitting
2015
fitting
2016
fitting
2017
fitting


In [346]:
# Predict the test set with the logistic regression and count how many rows
# were assigned to each class.
y_pred = logreg.predict(X_test)
predictions = pd.DataFrame(y_pred)
predicted_class = predictions[0]
predicted_class.value_counts()

0    264
1    160
Name: 0, dtype: int64

In [347]:
# Per-class precision / recall / F1 of the current y_pred against y_test.
report = classification_report(y_test, y_pred)
print(report)

             precision    recall  f1-score   support

          0       0.96      0.63      0.76       403
          1       0.06      0.48      0.11        21

avg / total       0.91      0.62      0.73       424



In [348]:
# Fraction of test rows whose predicted label matches y_test.
accuracy = metrics.accuracy_score(y_test, y_pred)
print('Accuracy: ', accuracy)
#print('F1 score:', metrics.f1_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred)))

Accuracy:  0.6202830188679245


In [349]:
# Random Forest Classifier
rfc = RandomForestClassifier(n_estimators=10, class_weight = 'balanced')
for year in [2011, 2012, 2013, 2014, 2015, 2016, 2017]:
    data_temp = df.loc[~(df['year'].isin([2018,2019]))]
    data_train = data_temp.loc[(data_temp['year'].isin([year]))]
    X_train = data_train[features]
    y_train = data_train['winner']
    if(len(X_train) != 0) : 
        rfc.fit(X_train, y_train)
    
rfc_pred = rfc.predict(X_test)
metrics.accuracy_score(y_test, rfc_pred)
print(classification_report(y_test, rfc_pred))

             precision    recall  f1-score   support

          0       0.95      0.99      0.97       403
          1       0.20      0.05      0.08        21

avg / total       0.92      0.94      0.93       424



In [350]:
# Gaussian Naive Bayes
gnb = GaussianNB()
for year in [2011, 2012, 2013, 2014, 2015, 2016, 2017]:
    data_temp = df.loc[~(df['year'].isin([2018,2019]))]
    data_train = data_temp.loc[(data_temp['year'].isin([year]))]
    X_train = data_train[features]
    y_train = data_train['winner']
    if(len(X_train) != 0) : 
        gnb.fit(X_train, y_train)

In [351]:
# Predict the test set with the Gaussian NB model and count how many rows
# were assigned to each class. This overwrites the earlier y_pred/predictions.
y_pred = gnb.predict(X_test)
predictions = pd.DataFrame(y_pred)
predicted_class = predictions[0]
predicted_class.value_counts()

0    410
1     14
Name: 0, dtype: int64

In [352]:
# Per-class precision / recall / F1 of the current y_pred against y_test.
report = classification_report(y_test, y_pred)
print(report)

             precision    recall  f1-score   support

          0       0.95      0.97      0.96       403
          1       0.07      0.05      0.06        21

avg / total       0.91      0.92      0.91       424

