In [None]:
#import libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import csv
import json
import matplotlib.pyplot as plt 
import seaborn as sns
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
import re
# Performance metric
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import warnings

import nltk
from nltk.corpus import stopwords

# TfidfVectorizer documents `stop_words` as 'english', a list, or None, and newer
# scikit-learn releases validate that type, so keep the NLTK words as a
# de-duplicated, sorted list rather than a set.
stop_words = sorted(set(stopwords.words('english')))

# Hide UserWarning messages for the rest of the notebook.
warnings.simplefilter('ignore', UserWarning)

In [None]:
# Load the prepared book-summary table from the input dataset
BOOK_SUMMARIES_CSV = "../input/book-genre-prediction-data-preparation/book_summaries.csv"
book_summary_df = pd.read_csv(BOOK_SUMMARIES_CSV)

In [None]:
# Genre names handed to the reporting cells as class names
# ('Speculative fiction' is commented out).
valid_genres = [
    # 'Speculative fiction',
    'Science Fiction',
    'Crime Fiction',
    'Non-fiction',
    "Children's literature",
    'Fantasy',
    'Mystery',
    'Suspense',
    'Young adult literature',
]

In [None]:
# Horizontal bar chart of how many books carry each genre, ordered by value_counts()
genre_order = book_summary_df.Genres.value_counts().index
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(data=book_summary_df, y='Genres', order=genre_order, ax=ax)
ax.set_title("Counts per Genre")
plt.show()

In [None]:
# Number of whitespace-separated words in each summary. This is measured on the
# summary text (not on the genre label) so that the "Total Words per Summary per
# Genre" charts actually reflect summary length.
book_summary_df['String Counts'] = book_summary_df['Summary'].str.split().str.len()

In [None]:
# Add up 'String Counts' within each genre and bar-chart the ten smallest totals
bar_colors = ['tab:orange', 'tab:green', 'tab:blue', 'tab:brown', 'tab:pink', 'tab:purple','tab:red', 'tab:gray', 'tab:olive']
df = book_summary_df.groupby(['Genres'])['String Counts'].sum().reset_index()
smallest_totals = df.sort_values(['String Counts'], ascending=True).head(10)
fig, ax = plt.subplots(figsize=(10, 5))
smallest_totals.plot(kind='bar', x='Genres', y='String Counts', legend=False, color=bar_colors, ax=ax)
ax.set_title("Total Words per Summary per Genre")
plt.show()

In [None]:
# NOTE(review): this cell repeats the previous cell line for line; it looks like
# an accidental copy — confirm before removing.
# Sums 'String Counts' per genre and bar-charts the ten smallest totals.
plt.figure(figsize=(10,5))
df = book_summary_df.groupby(['Genres'])['String Counts'].sum().reset_index()
df.sort_values(['String Counts'], ascending=True).head(10).plot(kind='bar', y='String Counts', x='Genres',
                legend=False, color=['tab:orange', 'tab:green', 'tab:blue', 'tab:brown', 'tab:pink', 'tab:purple','tab:red', 'tab:gray', 'tab:olive'], ax=plt.gca())
plt.title("Total Words per Summary per Genre")
plt.show()

In [None]:
def clean(summary):
    """Normalise a summary for vectorisation.

    Lower-cases the text and replaces every character that is not an ASCII
    letter, a digit or whitespace with a single space. Non-string input
    (for example None or NaN from a missing cell) yields an empty string.
    """
    if not isinstance(summary, str):
        return ""
    return re.sub(r"[^a-zA-Z0-9\s]", " ", summary.lower())


# `clean` must exist before this cell on a top-to-bottom run; the notebook's
# inference section only defines it much later, so it is defined here as well.
book_summary_df['Summary'] = book_summary_df['Summary'].map(clean)
book_summary_df.head(5)

In [None]:
#Training, Validation and Test
#Split the data into training and testing sets.

In [None]:
# Shuffled 80/20 split, stratified on the genre column, with a fixed seed
train, test = train_test_split(
    book_summary_df,
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=book_summary_df['Genres'],
)

# Training inputs/labels stay as pandas Series; the test side is converted to NumPy arrays
train_x, train_y = train.Summary, train.Genres
test_x, test_y = test.Summary.to_numpy(), test.Genres.to_numpy()
test_titles = test['Book Title'].to_numpy()

print("Training dataset = {}".format(len(train_x)))
print("Testing dataset = {}".format(len(test_x)))

# Side-by-side genre counts for the two partitions
colors = ['tab:orange', 'tab:green', 'tab:blue', 'tab:brown', 'tab:pink', 'tab:purple','tab:red', 'tab:gray', 'tab:olive']
fig, (train_ax, test_ax) = plt.subplots(1, 2, figsize=(25, 8))
for split_ax, split_df, split_title in (
    (train_ax, train, "Train Dataset-Counts per Genre"),
    (test_ax, test, "Test Dataset-Counts per Genre"),
):
    split_ax.set_title(split_title)
    split_df.groupby('Genres').size().sort_values(ascending=True).plot(kind='barh', color=colors, ax=split_ax)
plt.show()


In [None]:

#Feature Extraction
#For all the models, TF-IDF vectors are used as features, and the classifier used is the OneVsRestClassifier from the sklearn library.

In [None]:
import joblib
from time import time
# One record per model: [training duration, testing duration, test accuracy]
benchmarks = {
    model_name: [0.0, 0.0, 0.0]
    for model_name in ('NB', 'NB_tuned', 'SVC', 'SVC_tuned', 'LR', 'LR_tuned')
}

In [None]:
# Stand-alone unigram TF-IDF vectorizer using the stop-word collection defined above
tfidf = TfidfVectorizer(
    stop_words=stop_words,
    ngram_range=(1, 1),
    use_idf=True,
)

In [None]:
# Baseline 1: TF-IDF text features feeding a one-vs-rest Multinomial Naive Bayes classifier
t0 = time()
NB_pipeline = Pipeline([
    # list(...) because newer scikit-learn releases validate `stop_words` and reject a set
    ('tfidf', TfidfVectorizer(stop_words=list(stop_words), ngram_range=(1, 1), use_idf=True)),
    ('clf', OneVsRestClassifier(MultinomialNB())),
])
NB_pipeline.fit(train_x, train_y)
benchmarks['NB'][0] = (time() - t0) / 60  # elapsed time converted to minutes
filename = "./NB_model.sav"
joblib.dump(NB_pipeline, filename)
# The stored duration is in minutes, so the message reports minutes (it used to say seconds)
print("Training took {:.3f} [minutes] to complete and has been saved as {}".format(benchmarks['NB'][0], filename))
print("####Before tuning:####")
print('Train Accuracy : %.3f' % NB_pipeline.score(train_x, train_y))
print('Test Accuracy : %.3f' % NB_pipeline.score(test_x, test_y))

In [None]:
#Tune the model
# Display the parameter names the pipeline exposes; the grid-search cell below
# addresses parameters by these names (e.g. 'tfidf__use_idf').
NB_pipeline.get_params().keys()

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Grid search over TF-IDF weighting/normalisation and the Naive Bayes smoothing term
search_started = time()
parameters = {
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    'clf__estimator__alpha': (1, 0.1, 0.01, 0.001, 0.0001),
}
NB_grid = GridSearchCV(
    NB_pipeline,
    param_grid=parameters,
    n_jobs=-1,
    verbose=5,
)
NB_grid.fit(train_x, train_y)
# Search duration, stored in minutes
benchmarks['NB_tuned'][0] = (time() - search_started) / 60

In [None]:
# Persist the fitted grid search and report what it found
filename = "./NB_tuned_model.sav"
joblib.dump(NB_grid, filename)
print('Best Parameters : ', NB_grid.best_params_)
# benchmarks['NB_tuned'][0] already holds minutes; dividing by 60 again would report hours
print("Training took: {:.3f}[minutes] to complete and has been saved as {}".format(benchmarks['NB_tuned'][0], filename))

In [None]:
# Accuracy of the best Naive Bayes candidate on both partitions
print("####After tuning:####")
nb_tuned_model = NB_grid.best_estimator_
nb_train_accuracy = nb_tuned_model.score(train_x, train_y)
nb_test_accuracy = nb_tuned_model.score(test_x, test_y)
print('Train Accuracy : %.3f' % nb_train_accuracy)
print('Test Accuracy : %.3f' % nb_test_accuracy)

In [None]:
#Model 2: Support Vector Classification


In [None]:
# Baseline 2: uni+bigram TF-IDF features feeding a one-vs-rest linear SVM
t0 = time()
SVC_pipeline = Pipeline([
    # list(...) because newer scikit-learn releases validate `stop_words` and reject a set
    ('tfidf', TfidfVectorizer(stop_words=list(stop_words), ngram_range=(1, 2))),
    ('clf', OneVsRestClassifier(LinearSVC(), n_jobs=1)),
])
SVC_pipeline.fit(train_x, train_y)
benchmarks['SVC'][0] = (time() - t0) / 60  # elapsed time converted to minutes
filename = "./SVC_model.sav"
joblib.dump(SVC_pipeline, filename)
# The stored duration is in minutes, so the message reports minutes (it used to say seconds)
print("Training took: {:.3f}[minutes] to complete and has been saved as {}".format(benchmarks['SVC'][0], filename))

In [None]:
# Accuracy of the untuned SVC pipeline on both partitions
print("####Before tuning:####")
svc_train_accuracy = SVC_pipeline.score(train_x, train_y)
svc_test_accuracy = SVC_pipeline.score(test_x, test_y)
print('Train Accuracy : %.3f' % svc_train_accuracy)
print('Test Accuracy : %.3f' % svc_test_accuracy)

In [None]:
#Tuning for SVC
#The untuned SVC model's train and test accuracy scores are already quite near the desired output, so tuning is not strictly needed; a grid search is still run below so that all models can be compared.

In [None]:
# Display the parameter names the SVC pipeline exposes for grid search.
SVC_pipeline.get_params().keys()

In [None]:
t0 = time()
# Settings explored for every candidate
svc_shared_grid = {
    'tfidf__use_idf': [True, False],
    'tfidf__max_df': [0.3, 0.5, 0.8, 1.0],
    'clf__estimator__fit_intercept': [True, False],
    'clf__estimator__C': [0.01, 1.0, 2.0],
}
# LinearSVC does not accept every penalty/loss pairing, so only the supported
# ones are enumerated instead of the full cross product (which spends fits on
# combinations that can only fail).
parameters = [
    # L2 penalty works with both losses
    dict(svc_shared_grid, **{
        'clf__estimator__penalty': ['l2'],
        'clf__estimator__loss': ['hinge', 'squared_hinge'],
    }),
    # L1 penalty is only available with squared hinge loss in the primal form
    dict(svc_shared_grid, **{
        'clf__estimator__penalty': ['l1'],
        'clf__estimator__loss': ['squared_hinge'],
        'clf__estimator__dual': [False],
    }),
]
SVC_grid = GridSearchCV(SVC_pipeline, param_grid=parameters, n_jobs=-1, verbose=5)
SVC_grid.fit(train_x, train_y)
benchmarks['SVC_tuned'][0] = (time() - t0) / 60  # search duration in minutes

In [None]:
# Persist the fitted grid search, then report its best settings and accuracy
filename = "./SVC_tuned_model.sav"
joblib.dump(SVC_grid, filename)
print('Best Parameters : ', SVC_grid.best_params_)
# benchmarks['SVC_tuned'][0] already holds minutes; dividing by 60 again would report hours
print("Training took: {:.3f}[minutes] to complete and has been saved as {}".format(benchmarks['SVC_tuned'][0], filename))
print("####After tuning:####")
print('Train Accuracy : %.3f' % SVC_grid.best_estimator_.score(train_x, train_y))
print('Test Accuracy : %.3f' % SVC_grid.best_estimator_.score(test_x, test_y))

In [None]:
#Model 3: Logistic Regression
# Baseline 3: unigram TF-IDF features feeding one-vs-rest logistic regression (liblinear solver)
t0 = time()
LogReg_pipeline = Pipeline([
    # list(...) because newer scikit-learn releases validate `stop_words` and reject a set
    ('tfidf', TfidfVectorizer(stop_words=list(stop_words), ngram_range=(1, 1))),
    ('clf', OneVsRestClassifier(LogisticRegression(solver='liblinear'), n_jobs=1)),
])
LogReg_pipeline.fit(train_x, train_y)
benchmarks['LR'][0] = (time() - t0) / 60  # elapsed time converted to minutes
filename = "./LogReg_model.sav"
joblib.dump(LogReg_pipeline, filename)
# The stored duration is in minutes, so the message reports minutes (it used to say seconds)
print("Training took: {:.3f}[minutes] to complete and has been saved as {}".format(benchmarks['LR'][0], filename))
print("####Before tuning:####")
print('Train Accuracy : %.3f' % LogReg_pipeline.score(train_x, train_y))
print('Test Accuracy : %.3f' % LogReg_pipeline.score(test_x, test_y))

In [None]:
# Same logistic-regression baseline with the 'sag' solver.
# NOTE(review): this rebinds LogReg_pipeline, benchmarks['LR'][0] and
# ./LogReg_model.sav, replacing the liblinear run from the previous cell —
# confirm that only the 'sag' variant is meant to be kept.
t0 = time()
LogReg_pipeline = Pipeline([
    # list(...) because newer scikit-learn releases validate `stop_words` and reject a set
    ('tfidf', TfidfVectorizer(stop_words=list(stop_words), ngram_range=(1, 1))),
    ('clf', OneVsRestClassifier(LogisticRegression(solver='sag'), n_jobs=1)),
])
LogReg_pipeline.fit(train_x, train_y)
benchmarks['LR'][0] = (time() - t0) / 60  # elapsed time converted to minutes
filename = "./LogReg_model.sav"
joblib.dump(LogReg_pipeline, filename)
# The stored duration is in minutes, so the message reports minutes (it used to say seconds)
print("Training took: {:.3f}[minutes] to complete and has been saved as {}".format(benchmarks['LR'][0], filename))
print("####Before tuning:####")
print('Train Accuracy : %.3f' % LogReg_pipeline.score(train_x, train_y))
print('Test Accuracy : %.3f' % LogReg_pipeline.score(test_x, test_y))

In [None]:
# Display the parameter names the logistic-regression pipeline exposes for grid search.
LogReg_pipeline.get_params().keys()

In [None]:
import sklearn

t0 = time()

# The spelling of "no regularisation" depends on the installed scikit-learn:
# older releases expect the string 'none', newer ones expect None.
_sk_version = re.match(r"(\d+)\.(\d+)", sklearn.__version__)
_sk_major_minor = (int(_sk_version.group(1)), int(_sk_version.group(2)))
no_penalty = None if _sk_major_minor >= (1, 2) else 'none'

lr_shared_grid = {
    'tfidf__use_idf': [True, False],
    'clf__estimator__fit_intercept': [True, False],
}
lr_c_values = [0.01, 1.0, 2.0]
lr_solvers = ['newton-cg', 'sag', 'saga', 'lbfgs']

# Only penalty/solver pairs that LogisticRegression supports are listed; a
# plain cross product spends most of its fits on combinations that can only fail.
parameters = [
    # L2 is supported by every solver in the list
    dict(lr_shared_grid, **{
        'clf__estimator__penalty': ['l2'],
        'clf__estimator__C': lr_c_values,
        'clf__estimator__solver': lr_solvers,
    }),
    # Among these solvers only 'saga' handles L1
    dict(lr_shared_grid, **{
        'clf__estimator__penalty': ['l1'],
        'clf__estimator__C': lr_c_values,
        'clf__estimator__solver': ['saga'],
    }),
    # Elastic net needs 'saga' and an explicit l1_ratio
    # (0.5 is an assumed starting point — adjust as needed)
    dict(lr_shared_grid, **{
        'clf__estimator__penalty': ['elasticnet'],
        'clf__estimator__l1_ratio': [0.5],
        'clf__estimator__C': lr_c_values,
        'clf__estimator__solver': ['saga'],
    }),
    # Without a penalty C has no effect, so it is not varied here
    dict(lr_shared_grid, **{
        'clf__estimator__penalty': [no_penalty],
        'clf__estimator__solver': lr_solvers,
    }),
]
LogReg_grid = GridSearchCV(LogReg_pipeline, param_grid=parameters, n_jobs=-1, verbose=5)
LogReg_grid.fit(train_x, train_y)
benchmarks['LR_tuned'][0] = (time() - t0) / 60  # search duration in minutes

In [None]:
# Persist the fitted grid search and report what it found
filename = "./LR_tuned_model.sav"
joblib.dump(LogReg_grid, filename)
print('Best Parameters : ', LogReg_grid.best_params_)
# benchmarks['LR_tuned'][0] already holds minutes; dividing by 60 again would report hours
print("Training took: {:.3f}[minutes] to complete and has been saved as {}".format(benchmarks['LR_tuned'][0], filename))

In [None]:
# Accuracy of the best logistic-regression candidate on both partitions
print("####After tuning:####")
lr_tuned_model = LogReg_grid.best_estimator_
lr_train_accuracy = lr_tuned_model.score(train_x, train_y)
lr_test_accuracy = lr_tuned_model.score(test_x, test_y)
print('Train Accuracy : %.3f' % lr_train_accuracy)
print('Test Accuracy : %.3f' % lr_test_accuracy)

In [None]:
#Test Models
def save_benchmarks(estimator, estimator_name,x, y, benchmarks):
    """Time a prediction run and store the outcome in `benchmarks`.

    Args:
        estimator: fitted object exposing `predict`.
        estimator_name: key of the record in `benchmarks` to update.
        x: samples to predict on.
        y: true labels for `x`.
        benchmarks: dict of three-item lists; slot 1 receives the prediction
            time in minutes and slot 2 the accuracy as a percentage.
    """
    t0 = time()
    # Predict on the samples that were passed in, not on the notebook-level test_x
    pred = estimator.predict(x)
    benchmarks[estimator_name][1] = (time() - t0)/60
    benchmarks[estimator_name][2] = accuracy_score(y, pred)*100

In [None]:
# Record prediction time and test accuracy for every baseline and its tuned counterpart
benchmark_targets = [
    ('NB', NB_pipeline),
    ('NB_tuned', NB_grid.best_estimator_),
    ('LR', LogReg_pipeline),
    ('LR_tuned', LogReg_grid.best_estimator_),
    ('SVC', SVC_pipeline),
    ('SVC_tuned', SVC_grid.best_estimator_),
]
for model_key, fitted_model in benchmark_targets:
    save_benchmarks(estimator=fitted_model, estimator_name=model_key, x=test_x, y=test_y, benchmarks=benchmarks)
print("Benchmaks created!")

In [None]:
# Tabulate the benchmark records: one row per model, index labelled 'Models'
benchmark_columns = ['Trained Duration[m]', 'Testing Duration[m]', 'Accuracy(%)']
df = pd.DataFrame.from_dict(benchmarks, orient='index', columns=benchmark_columns)
df = df.rename_axis('Models')
df

In [None]:
def save_print_results(pred, labels, titles, save_file):
    """Print the accuracy of `pred`, then tabulate and save per-book outcomes.

    Args:
        pred: predicted genre for each book.
        labels: true genre for each book.
        titles: title of each book.
        save_file: path the resulting table is written to as CSV.

    Returns:
        DataFrame with columns 'titles', 'genres', 'prediction' and 'result',
        where 'result' is 'Correct' when the prediction equals the label and
        'Wrong' otherwise.
    """
    # Score against the labels that were passed in, not the notebook-level test_y
    accuracy = accuracy_score(labels, pred) * 100
    print('##########################################\n#\tTest accuracy is {:.4f}%\t#\n##########################################'.format(accuracy))
    rows = [
        (title, actual, guess, 'Correct' if actual == guess else 'Wrong')
        for title, actual, guess in zip(titles, labels, pred)
    ]
    pred_df = pd.DataFrame(rows, columns=['titles', 'genres', 'prediction', 'result'])
    #Save to csv file
    pred_df.to_csv(save_file)
    return pred_df

In [None]:
# Tuned Naive Bayes: predict the test set, save per-book outcomes, preview 30 rows
nb_best_model = NB_grid.best_estimator_
pred_nb = nb_best_model.predict(test_x)
pred_nb_df = save_print_results(
    pred=pred_nb,
    labels=test_y,
    titles=test_titles,
    save_file="./pred_nb_results.csv",
)
pred_nb_df.head(30)

In [None]:
# Tuned logistic regression: predict the test set, save per-book outcomes, preview 30 rows
lr_best_model = LogReg_grid.best_estimator_
pred_lr = lr_best_model.predict(test_x)
pred_lr_df = save_print_results(
    pred=pred_lr,
    labels=test_y,
    titles=test_titles,
    save_file="./pred_lr_results.csv",
)
pred_lr_df.head(30)

In [None]:
# Tuned SVC: predict the test set, save per-book outcomes, preview 30 rows
svc_best_model = SVC_grid.best_estimator_
pred_svc = svc_best_model.predict(test_x)
pred_svc_df = save_print_results(
    pred=pred_svc,
    labels=test_y,
    titles=test_titles,
    save_file="./pred_svc_results.csv",
)
pred_svc_df.head(30)

In [None]:
# 2x2 overview: Correct/Wrong counts per genre for each tuned model, plus genre sizes
colors = ['tab:orange', 'tab:green', 'tab:blue', 'tab:brown', 'tab:pink', 'tab:purple','tab:red', 'tab:gray', 'tab:olive']
result_colors = ['tab:blue', 'tab:orange']
fig, axes = plt.subplots(2, 2, figsize=(20, 15))
outcome_panels = [
    (axes[0, 0], "Prediction with LR Accuracy = {:.4f}%", 'LR_tuned', pred_lr_df),
    (axes[0, 1], "Prediction with SVC Accuracy = {:.4f}%", 'SVC_tuned', pred_svc_df),
    (axes[1, 0], "Prediction with NB  Accuracy = {:.4f}%", 'NB_tuned', pred_nb_df),
]
for panel_ax, title_template, bench_key, outcome_df in outcome_panels:
    panel_ax.set_title(title_template.format(benchmarks[bench_key][2]))
    outcome_counts = outcome_df.pivot_table(index='genres', columns=['result'], aggfunc='size')
    outcome_counts.plot(kind='barh', color=result_colors, ax=panel_ax)
# Last panel: number of summaries available for each genre
size_ax = axes[1, 1]
size_ax.set_title("Counts of Summaries per Genre")
book_summary_df.groupby('Genres').size().sort_values(ascending=True).plot(kind='barh', color=colors, ax=size_ax)
plt.tight_layout()
plt.show()

In [None]:
from sklearn import metrics
def classification_report(y, pred, target, name):
    """Print accuracy and a per-class report, then draw the confusion matrix.

    Args:
        y: true labels.
        pred: predicted labels.
        target: class names shown in the report and on the heat-map axes.
        name: model name used in the printed headers and the plot title.

    scikit-learn orders classes by sorted label value unless `labels` is
    given, while `target` arrives in caller order. When every observed label
    appears in `target`, `target` is therefore also used as the class order so
    that each row and column carries the right name; otherwise the default
    ordering is left untouched.
    """
    print('##########################################\n#\tTest accuracy is {:.4f}%\t#\n##########################################'.format(accuracy_score(y, 
    pred)*100))
    try:
        observed = set(y) | set(pred)
        label_order = list(target) if observed.issubset(target) else None
    except TypeError:
        # Labels that cannot be hashed (e.g. 2-D indicator arrays): keep the default order
        label_order = None
    print("------------------------------------------------------------")
    print("Classification Report for model {}".format(name))
    print("------------------------------------------------------------")
    print(metrics.classification_report(y, pred, labels=label_order, target_names=target, zero_division=0))
    print("------------------------------------------------------------")
    plt.figure(figsize = (20,15))
    # NOTE: sns.set changes seaborn's font scale for every later plot as well
    sns.set(font_scale=1.4)
    confusion = metrics.confusion_matrix(y, pred, labels=label_order)
    sns.heatmap(confusion, xticklabels = target, yticklabels = target, annot = True, fmt="d",cmap = 'summer', annot_kws={"size": 12})
    plt.xlabel('Predicted Labels')
    plt.ylabel('True Labels')
    plt.title("Confusion Matrix for {}".format(name))
    plt.show()

In [None]:
# Report and confusion matrix for pred_nb, the test predictions of NB_grid.best_estimator_.
classification_report(test_y, pred_nb, valid_genres, "Naive Bayes")

In [None]:
# Report and confusion matrix for pred_svc, the test predictions of SVC_grid.best_estimator_.
classification_report(test_y, pred_svc, valid_genres, "Support Vector Classification")

In [None]:
# Report and confusion matrix for pred_lr, the test predictions of LogReg_grid.best_estimator_.
classification_report(test_y, pred_lr, valid_genres, "Logistic Regression")

In [None]:
#Inference Function
import pandas as pd
import joblib

In [None]:
import string
import re
def clean(summary):
    """Normalise a summary for vectorisation.

    Lower-cases the text and replaces every character that is not an ASCII
    letter, a digit or whitespace with a single space. Non-string input
    (for example None or NaN from a missing cell) yields an empty string.

    The earlier punctuation-stripping `translate` step was removed: its result
    was never used, so the output for string input is unchanged.
    """
    if not isinstance(summary, str):
        return ""
    return re.sub(r"[^a-zA-Z0-9\s]", " ", summary.lower())

In [None]:
# Objects already read from disk, keyed by file path. Re-running this cell
# empties the cache, which is how to pick up model files that were re-saved.
_tuned_model_cache = {}


def _load_tuned_model(path):
    """Return the object stored at `path`, reading the file only on first use."""
    if path not in _tuned_model_cache:
        # NOTE: joblib.load unpickles arbitrary objects — only load files this notebook wrote.
        _tuned_model_cache[path] = joblib.load(path)
    return _tuned_model_cache[path]


def predict_genre(plot):
    """Predict the genre of one plot summary with the three tuned models.

    Args:
        plot: raw summary text; it is passed through `clean` first.

    Returns:
        Tuple (logistic regression, SVC, naive Bayes) of predicted genres,
        each taken from the saved grid search's `best_estimator_`.
    """
    s = clean(plot)
    model_paths = ('./LR_tuned_model.sav', './SVC_tuned_model.sav', './NB_tuned_model.sav')
    # Models are cached so that calling this once per row does not reload them every time
    return tuple(
        _load_tuned_model(path).best_estimator_.predict([s])[0]
        for path in model_paths
    )

In [None]:
# Load the external summaries, drop incomplete rows, and renumber the index
# 0..n-1 so that positional lookups such as series[i] stay valid after the drop.
goodreads_df = (
    pd.read_csv(
        filepath_or_buffer='../input/load-summaries/summaries_for_testing.csv',
        header=0,
        names=['title', 'author', 'genre', 'summary'],
    )
    .dropna()
    .reset_index(drop=True)
)

In [None]:
# Predict a genre for every external summary with all three tuned models
results = {
    'Title': [],
    'Author': [],
    'Prediction LR': [],
    'Prediction SVC': [],
    'Prediction NB': [],
    'Genre': [],
}
plots = goodreads_df.summary
titles = goodreads_df.title
authors = goodreads_df.author
genres = goodreads_df.genre
# Iterate over the values themselves: looking rows up as series[i] over
# range(len(df)) fails when the index has gaps (e.g. after dropna).
for title, author, genre, plot in zip(titles, authors, genres, plots):
    lr, svc, nb = predict_genre(plot=plot)
    results['Title'].append(title)
    results['Author'].append(author)
    results['Prediction LR'].append(lr)
    results['Prediction SVC'].append(svc)
    results['Prediction NB'].append(nb)
    results['Genre'].append(genre)
results_df = pd.DataFrame.from_dict(results, orient='columns')
results_df.head(10)