In [112]:
import re
import ast
import nltk
import random
import numpy as np
import pandas as pd
from random import randint
from nltk import word_tokenize
from collections import Counter
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from sklearn import preprocessing
from sklearn.svm import SVC
from nltk.stem import WordNetLemmatizer
from sklearn.mixture import GaussianMixture
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, plot_confusion_matrix

# Data Preprocessing

In [46]:
#loading dataset and removing unwanted features
dataset = pd.read_csv(f'Book_summary.csv', encoding= 'unicode_escape')
dataset = dataset.drop(['Publication Date', 'Wikibedia ID', 'Freebase ID'], axis=1)
dataset

Unnamed: 0,Book title,Book author,Genre,Summary
0,Animal Farm,George Orwell,"{""/m/016lj8"": ""Roman à clef"", ""/m/06nbt"": ""Sat...","Old Major, the old boar on the Manor Farm, ca..."
1,A Clockwork Orange,Anthony Burgess,"{""/m/06n90"": ""Science Fiction"", ""/m/0l67h"": ""N...","Alex, a teenager living in near-future Englan..."
2,The Plague,Albert Camus,"{""/m/02m4t"": ""Existentialism"", ""/m/02xlf"": ""Fi...",The text of The Plague is divided into five p...
3,An Enquiry Concerning Human Understanding,David Hume,,The argument of the Enquiry proceeds by a ser...
4,A Fire Upon the Deep,Vernor Vinge,"{""/m/03lrw"": ""Hard science fiction"", ""/m/06n90...",The novel posits that space around the Milky ...
...,...,...,...,...
16554,Under Wildwood,Colin Meloy,,"Prue McKeel, having rescued her brother from ..."
16555,Transfer of Power,Vince Flynn,"{""/m/01jfsb"": ""Thriller"", ""/m/02xlf"": ""Fiction""}",The reader first meets Rapp while he is doing...
16556,Decoded,Jay-Z,"{""/m/0xdf"": ""Autobiography""}",The book follows very rough chronological ord...
16557,America Again: Re-becoming The Greatness We Ne...,Stephen Colbert,,Colbert addresses topics including Wall Stree...


In [4]:
#check for nulls and removing nans
dataset.isnull().count()
dataset = dataset.dropna(axis = 0)
dataset = dataset.reset_index(drop=True)
dataset

Unnamed: 0,Book title,Book author,Genre,Summary
0,Animal Farm,George Orwell,"{""/m/016lj8"": ""Roman à clef"", ""/m/06nbt"": ""Sat...","Old Major, the old boar on the Manor Farm, ca..."
1,A Clockwork Orange,Anthony Burgess,"{""/m/06n90"": ""Science Fiction"", ""/m/0l67h"": ""N...","Alex, a teenager living in near-future Englan..."
2,The Plague,Albert Camus,"{""/m/02m4t"": ""Existentialism"", ""/m/02xlf"": ""Fi...",The text of The Plague is divided into five p...
3,A Fire Upon the Deep,Vernor Vinge,"{""/m/03lrw"": ""Hard science fiction"", ""/m/06n90...",The novel posits that space around the Milky ...
4,All Quiet on the Western Front,Erich Maria Remarque,"{""/m/098tmk"": ""War novel"", ""/m/016lj8"": ""Roman...","The book tells the story of Paul B??umer, a G..."
...,...,...,...,...
12050,The Third Lynx,Timothy Zahn,"{""/m/06n90"": ""Science Fiction""}",The story starts with former government agent...
12051,Remote Control,Andy McNab,"{""/m/01jfsb"": ""Thriller"", ""/m/02xlf"": ""Fiction...",The series follows the character of Nick Ston...
12052,Transfer of Power,Vince Flynn,"{""/m/01jfsb"": ""Thriller"", ""/m/02xlf"": ""Fiction""}",The reader first meets Rapp while he is doing...
12053,Decoded,Jay-Z,"{""/m/0xdf"": ""Autobiography""}",The book follows very rough chronological ord...


In [5]:
def cleaned_text(raw_text):
    raw_text = re.findall(r"[a-zA-Z]+", raw_text)
    raw_text = " ".join(raw_text).lower()
    tokens = word_tokenize(raw_text)
    stop_words = stopwords.words("english")
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    
    return tokens

In [6]:
def convert_list_to_string(org_list, seperator=' '):
    """ Convert list to string, by joining all item in list with given separator.
        Returns the concatenated string """
    return seperator.join(org_list)

In [7]:
cleaned_dataset = dataset.copy()
cleaned_dataset.insert(loc=3, column='Book summary', value="")
unique_genres = {}

for index, row in cleaned_dataset.iterrows():
    row["Genre"] = list(ast.literal_eval(row["Genre"]).values())
    for item in row["Genre"]:
        if item not in unique_genres:
            unique_genres[item] = 1
        else:
            unique_genres[item] += 1
            
    tokens = cleaned_text(str(row['Summary']))
    # Convert list of strings to string
    row['Book summary'] = convert_list_to_string(tokens)

for index, row in cleaned_dataset.iterrows():
    if len(row["Genre"]) == 1:
        row["Genre"] = row["Genre"][0]
    else :
        max_ = 0
        pop_genre = ""
        for item in row["Genre"]:
            if unique_genres[item] > max_:
                max_ = unique_genres[item]
                pop_genre = item
        row["Genre"] = pop_genre

cleaned_dataset = pd.DataFrame(cleaned_dataset)
cleaned_dataset.drop('Summary', axis=1)

Unnamed: 0,Book title,Book author,Genre,Book summary
0,Animal Farm,George Orwell,Fiction,old major old boar manor farm call animal farm...
1,A Clockwork Orange,Anthony Burgess,Fiction,alex teenager living near future england lead ...
2,The Plague,Albert Camus,Fiction,text plague divided five part town oran thousa...
3,A Fire Upon the Deep,Vernor Vinge,Fiction,novel posit space around milky way divided con...
4,All Quiet on the Western Front,Erich Maria Remarque,War novel,book tell story paul b umer german soldier urg...
...,...,...,...,...
12050,The Third Lynx,Timothy Zahn,Science Fiction,story start former government agent frank comp...
12051,Remote Control,Andy McNab,Fiction,series follows character nick stone ex militar...
12052,Transfer of Power,Vince Flynn,Fiction,reader first meet rapp covert operation iran d...
12053,Decoded,Jay-Z,Autobiography,book follows rough chronological order switchi...


In [8]:
cleaned_dataset.groupby(["Genre"]).count()

Unnamed: 0_level_0,Book title,Book author,Book summary,Summary
Genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Absurdist fiction,2,2,2,2
Adventure novel,56,56,56,56
Albino bias,1,1,1,1
Alien invasion,1,1,1,1
Alternate history,17,17,17,17
...,...,...,...,...
War novel,26,26,26,26
Western,9,9,9,9
Western fiction,8,8,8,8
Wuxia,12,12,12,12


# Feature Engineering

## TF-IDF

In [53]:
def tfidf_model(data):
    tfIdf_vectorizer = TfidfVectorizer(use_idf=True, min_df=0.05)

    tfidf_model = tfIdf_vectorizer.fit_transform(data)
    tfidf_features = pd.DataFrame(tfidf_model.toarray(), columns = tfIdf_vectorizer.get_feature_names())
    return tfidf_features

In [54]:
tfidf_features = tfidf_model(cleaned_dataset['Book summary'])
tfidf_features

Unnamed: 0,ability,able,accident,across,act,action,actually,adventure,affair,age,...,work,working,world,would,written,year,yet,york,young,younger
0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.00000,0.0,0.0,...,0.016108,0.062023,0.014802,0.000000,0.0,0.012618,0.000000,0.0,0.030525,0.0
1,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.058797,0.00000,0.0,0.0,...,0.000000,0.056115,0.000000,0.000000,0.0,0.068494,0.000000,0.0,0.082851,0.0
2,0.000000,0.000000,0.0,0.000000,0.045781,0.045051,0.000000,0.00000,0.0,0.0,...,0.032764,0.042053,0.000000,0.066288,0.0,0.000000,0.045515,0.0,0.031044,0.0
3,0.056044,0.000000,0.0,0.000000,0.052866,0.000000,0.101763,0.00000,0.0,0.0,...,0.000000,0.000000,0.139067,0.000000,0.0,0.088910,0.000000,0.0,0.035849,0.0
4,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.12049,0.0,0.0,...,0.000000,0.000000,0.149440,0.000000,0.0,0.000000,0.056480,0.0,0.038523,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12050,0.000000,0.142992,0.0,0.000000,0.000000,0.000000,0.000000,0.00000,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.115878,0.0
12051,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.00000,0.0,0.0,...,0.356158,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0
12052,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.00000,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0
12053,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.00000,0.0,0.0,...,0.000000,0.000000,0.088724,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0


## Doc2Vec

In [57]:
def doc2vec_model(data):  
    documents = [doc for doc in data]
    tagged_documents = [TaggedDocument(tokens, [i]) for i, tokens in enumerate(documents)]

    doc2vec_model = Doc2Vec(vector_size=300, min_count=2, epochs=40)
    doc2vec_model.build_vocab(tagged_documents)
    doc2vec_model.train(tagged_documents, total_examples=doc2vec_model.corpus_count, epochs=doc2vec_model.epochs)

    doc2vec_features = [doc2vec_model.infer_vector(doc.split()) for doc in documents]
    doc2vec_features = np.array(doc2vec_features)
    
    return pd.DataFrame(doc2vec_features)

In [58]:
doc2vec_features = doc2vec_model(cleaned_dataset["Book summary"])
doc2vec_features

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,-0.001219,-0.000344,-0.001595,-0.001018,-0.000212,-0.001034,-0.000858,-0.001663,-0.000635,-0.000967,...,-0.001356,0.000714,-0.001008,-0.001274,-0.001204,0.001102,0.001053,-0.000895,0.001616,0.000642
1,0.041657,0.036251,0.055487,-0.013521,0.005494,0.007342,0.036562,0.020155,-0.000578,-0.081172,...,-0.048558,-0.040414,-0.003061,-0.018109,-0.041394,0.071918,0.031009,0.041447,0.101612,0.084258
2,0.001514,0.001134,0.000395,-0.001576,0.001040,0.000555,-0.000748,0.000707,-0.001652,-0.000774,...,-0.000971,0.000019,-0.001171,-0.000140,-0.000195,0.000855,-0.000662,-0.000981,-0.001416,-0.000455
3,-0.001422,0.001093,0.000828,-0.001437,-0.001368,0.000285,-0.001193,-0.000961,-0.001191,-0.000605,...,0.000548,0.001075,0.001250,-0.000935,-0.000811,-0.000094,-0.000175,0.000178,0.000294,-0.001494
4,0.001590,0.140083,0.094291,-0.116739,0.057239,0.043354,-0.000498,-0.118484,0.110672,-0.040346,...,-0.080557,0.035083,0.045583,0.042714,-0.060801,0.041770,-0.018028,-0.025729,-0.012992,0.045464
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12050,0.000341,-0.000238,-0.000872,-0.001448,0.001147,0.000312,0.000519,-0.001383,-0.001575,0.001345,...,0.001011,0.000768,0.000451,-0.000019,0.000260,-0.000237,0.000321,-0.001557,0.000811,-0.000985
12051,0.000509,-0.000774,-0.000488,0.000159,-0.000378,0.000331,0.000768,0.000332,0.000034,-0.000486,...,0.000461,0.001170,0.000076,0.001651,-0.000665,-0.000675,-0.001166,0.000065,0.001059,0.001196
12052,0.019973,0.020488,0.057112,-0.007330,-0.017867,0.016391,-0.002867,0.000841,-0.005017,-0.019947,...,0.000064,0.018505,0.010674,0.008015,-0.034253,0.011280,0.017516,0.015552,0.068332,0.017776
12053,0.142827,0.272958,0.306837,-0.102968,-0.086627,0.008060,-0.052054,-0.029959,0.019043,-0.277637,...,0.018802,0.027967,0.160203,0.132035,-0.373845,-0.024582,0.071706,0.076201,-0.157499,0.371697


## TF-IDF + Doc2Vec

In [13]:
tfidf_doc2vec_features = pd.concat([tfidf_features, doc2vec_features], axis=1)

# Clustering

In [14]:
import matplotlib.cm as cm
from scipy.spatial.distance import cosine
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from yellowbrick.cluster import KElbowVisualizer

In [15]:
model = KMeans(n_clusters = 30).fit_predict(doc2vec_features)

In [16]:
model2 = DBSCAN(eps=0.1, min_samples=3, metric="cosine").fit_predict(tfidf_doc2vec_features)

In [17]:
np.unique(model2)

array([-1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12],
      dtype=int64)

In [18]:
indices = doc2vec_features[model2 == 0].index

In [19]:
indices

Int64Index([19, 1004, 1601, 1895, 10196, 10495, 10833], dtype='int64')

In [20]:
cosine_scores = []
for ind, val in enumerate(indices):
    cosine_score = cosine(doc2vec_features[model2 == 0].iloc[0], doc2vec_features[model2 == 0].iloc[ind])
    cosine_scores.append(cosine_score)
    
print(cosine_scores)

[0, 0.005791068077087402, 0.009916186332702637, 0.015560507774353027, 0.0029700398445129395, 0.014103114604949951, 0.0036133527755737305]


In [21]:
np.argpartition(cosine_scores, 4)[:4]

array([4, 0, 6, 1], dtype=int64)

In [22]:
ya = doc2vec_features[model2 == 0].iloc[np.argpartition(cosine_scores, 4)[:4]].index

In [23]:
ya

Int64Index([10196, 19, 10833, 1004], dtype='int64')

In [24]:
cleaned_dataset.iloc[ya]

Unnamed: 0,Book title,Book author,Genre,Book summary,Summary
10196,The Last Egyptian,L. Frank Baum,Adventure novel,extensive diacritical mark appear novel publis...,The extensive diacritical marks appear in the...
19,The Trial,Franz Kafka,Fiction,thirtieth birthday chief financial officer ban...,"On his thirtieth birthday, the chief financia..."
10833,March to the Sea,John Ringo,Science Fiction,event marshad end previous book roger marine m...,After the events in Marshad (at the end of th...
1004,The Castle,Franz Kafka,Fiction,narrator k arrives village governed mysterious...,"The narrator, K., arrives in a village govern..."


In [35]:
# model = KMeans()
# visualizer = KElbowVisualizer(model, k=(20,50))

# visualizer.fit(tfidf_features_reduced)        # Fit the data to the visualizer
# visualizer.show()  

# Classification

In [47]:
classif_dataset = cleaned_dataset.drop(['Book title', 'Book author', 'Summary'], axis=1)
classif_dataset

Unnamed: 0,Genre,Book summary
0,Fiction,old major old boar manor farm call animal farm...
1,Fiction,alex teenager living near future england lead ...
2,Fiction,text plague divided five part town oran thousa...
3,Fiction,novel posit space around milky way divided con...
4,War novel,book tell story paul b umer german soldier urg...
...,...,...
12050,Science Fiction,story start former government agent frank comp...
12051,Fiction,series follows character nick stone ex militar...
12052,Fiction,reader first meet rapp covert operation iran d...
12053,Autobiography,book follows rough chronological order switchi...


In [101]:
def tfidf_model(data_train, data_test):
    tfIdf_vectorizer = TfidfVectorizer(use_idf=True, min_df=0.05)

    tfidf_model = tfIdf_vectorizer.fit_transform(data_train)
    tfidf_train = pd.DataFrame(tfidf_model.toarray(), columns = tfIdf_vectorizer.get_feature_names())
    tfidf_test = tfIdf_vectorizer.transform(data_test)
    tfidf_test = pd.DataFrame(tfidf_test.toarray(), columns = tfIdf_vectorizer.get_feature_names())
    return tfidf_train, tfidf_test

In [None]:
def doc2vec_model(data):  
    documents = [doc for doc in data]
    tagged_documents = [TaggedDocument(tokens, [i]) for i, tokens in enumerate(documents)]

    doc2vec_model = Doc2Vec(vector_size=300, min_count=2, epochs=40)
    doc2vec_model.build_vocab(tagged_documents)
    doc2vec_model.train(tagged_documents, total_examples=doc2vec_model.corpus_count, epochs=doc2vec_model.epochs)

    doc2vec_features = [doc2vec_model.infer_vector(doc.split()) for doc in documents]
    doc2vec_features = np.array(doc2vec_features)
    
    return pd.DataFrame(doc2vec_features)

In [89]:
def train_test_data(data):
    dataset_train, dataset_test = train_test_split(data, test_size = 0.3, random_state=0)
    dataset_train = dataset_train.reset_index(drop = True)
    dataset_test = dataset_test.reset_index(drop = True)
    return dataset_train, dataset_test

dataset_train, dataset_test = train_test_data(classif_dataset)

In [107]:
def model_knn(x_train, y_train, x_test, y_test):
    classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2) #choosing Euclidean distance metric
    classifier.fit(x_train, y_train)
    ypred = classifier.predict(x_test)
    accuracy = accuracy_score(y_test, ypred)*100
    print(f'K-NN: {accuracy}')
    
    return classifier, ypred, accuracy

In [116]:
def model_svm(x_train, y_train, x_test, y_test):
    cs = SVC(kernel = 'rbf', random_state = 0)
    cs.fit(x_train, y_train)
    ypred = cs.predict(x_test)
    accuracy = accuracy_score(y_test, ypred)*100
    print(f'K-NN: {accuracy}')
    return cs, ypred, accuracy

In [102]:
tfidf_train_data, tfidf_test_data = tfidf_model(dataset_train["Book summary"], dataset_test["Book summary"])

In [104]:
doc2vec_train_data = doc2vec_model(dataset_train["Book summary"])
doc2vec_test_data = doc2vec_model(dataset_test["Book summary"])

In [117]:
svm_tfidf_model, y_pred_tfidf, accuracy_tfidf = model_svm(tfidf_train_data, dataset_train['Genre'], tfidf_test_data, dataset_test['Genre'])

K-NN: 40.89024053082665


In [118]:
svm_tfidf_model, y_pred_tfidf, accuracy_tfidf = model_svm(doc2vec_train_data, dataset_train['Genre'], doc2vec_test_data, dataset_test['Genre'])

K-NN: 38.789051700304114
