In [15]:

import csv
import json
import os
from time import time

import joblib
import nltk
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC


In [16]:
# Load the tab-separated book summaries file. It has no header row, so the
# column names are supplied explicitly, in file order.
df = pd.read_csv(
    "booksummaries.txt",
    sep="\t",
    header=None,
    names=[
        "Wikipedia ID",
        "Freebase ID",
        "Book Title",
        "Book Author",
        "Pub date",
        "Genres",
        "Summary",
    ],
)
# Preview the first five rows (5 is the default for `head`).
df.head()

Unnamed: 0,Wikipedia ID,Freebase ID,Book Title,Book Author,Pub date,Genres,Summary
0,620,/m/0hhy,Animal Farm,George Orwell,1945-08-17,"{""/m/016lj8"": ""Roman \u00e0 clef"", ""/m/06nbt"":...","Old Major, the old boar on the Manor Farm, ca..."
1,843,/m/0k36,A Clockwork Orange,Anthony Burgess,1962,"{""/m/06n90"": ""Science Fiction"", ""/m/0l67h"": ""N...","Alex, a teenager living in near-future Englan..."
2,986,/m/0ldx,The Plague,Albert Camus,1947,"{""/m/02m4t"": ""Existentialism"", ""/m/02xlf"": ""Fi...",The text of The Plague is divided into five p...
3,1756,/m/0sww,An Enquiry Concerning Human Understanding,David Hume,,,The argument of the Enquiry proceeds by a ser...
4,2080,/m/0wkt,A Fire Upon the Deep,Vernor Vinge,,"{""/m/03lrw"": ""Hard science fiction"", ""/m/06n90...",The novel posits that space around the Milky ...


In [17]:
# Preprocessing: keep only rows that have both a summary and genre labels,
# then build a punctuation-free, lowercase copy of each summary.
df.dropna(subset=['Summary', 'Genres'], inplace=True)

# `regex=True` must be explicit: since pandas 2.0 `Series.str.replace` treats
# the pattern as a literal string by default, so without it no punctuation is
# removed. The raw string avoids the invalid `\w` / `\s` escape sequences.
df['clean_summary'] = (
    df['Summary']
    .str.replace(r'[^\w\s]', '', regex=True)
    .str.lower()
)

# Feature extraction: TF-IDF vectorization of the cleaned summaries, with
# scikit-learn's built-in English stop-word list removed.
# NOTE(review): the modelling cells visible in this notebook train on the raw
# `Summary` column through their own pipeline vectorizer, not on
# `clean_summary` / `tfidf_matrix` -- confirm whether these are still needed.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(df['clean_summary'])

In [21]:
# How many books carry each distinct `Genres` value.
class_distribution = df['Genres'].value_counts()

# `Genres` values that occur exactly once: a stratified split cannot place a
# single-member class in both the train and the test partition.
single_member_classes = class_distribution.loc[class_distribution.eq(1)].index

# Keep only the books whose `Genres` value occurs more than once.
is_singleton = df['Genres'].isin(single_member_classes)
df_filtered = df.loc[~is_singleton]

# 80/20 shuffled split, stratified on `Genres`, with a fixed seed.
train, test = train_test_split(
    df_filtered,
    test_size=0.2,
    shuffle=True,
    stratify=df_filtered['Genres'],
    random_state=42,
)

# Features are the summary text and targets are the `Genres` values. The test
# side, including the titles, is materialised as NumPy arrays.
train_x, train_y = train['Summary'], train['Genres']
test_x = test['Summary'].to_numpy()
test_y = test['Genres'].to_numpy()
test_titles = test['Book Title'].to_numpy()

print("Training dataset = {}".format(len(train_x)))
print("Testing dataset = {}".format(len(test_x)))




Training dataset = 9147
Testing dataset = 2287


In [31]:
# Baseline model: unigram + bigram TF-IDF features fed to a one-vs-rest
# linear SVM, fitted on the training split.
# (`joblib` and `time` are imported in the notebook's top import cell.)

# One three-slot record per model; this cell only fills slot 0 (training time).
benchmarks = {
    'SVC': [0.0, 0.0, 0.0],
    'SVC_tuned': [0.0, 0.0, 0.0],
}

t0 = time()
SVC_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2))),
    ('clf', OneVsRestClassifier(LinearSVC(), n_jobs=1)),
])
SVC_pipeline.fit(train_x, train_y)

# Elapsed wall-clock time is stored in MINUTES (seconds / 60). It covers
# building and fitting the pipeline, not writing the model to disk.
benchmarks['SVC'][0] = (time() - t0) / 60

# Persist the fitted pipeline so it can be reloaded without retraining.
filename = "./SVC_model.sav"
joblib.dump(SVC_pipeline, filename)

# The message previously said "[seconds]" although the value is in minutes.
print("Training took: {:.3f}[minutes] to complete and has been saved as {}".format(benchmarks['SVC'][0], filename))

Training took: 9.660[seconds] to complete and has been saved as ./SVC_model.sav


In [33]:
# Accuracy of the untuned baseline pipeline on the training and test splits.
print("####Before tuning:####")

train_accuracy = SVC_pipeline.score(train_x, train_y)
print('Train Accuracy : %.3f' % train_accuracy)

test_accuracy = SVC_pipeline.score(test_x, test_y)
print('Test Accuracy : %.3f' % test_accuracy)

####Before tuning:####
Train Accuracy : 0.998
Test Accuracy : 0.318


In [None]:
# Hyperparameter tuning: grid search over the TF-IDF and LinearSVC settings.
# (All names used here are imported in the notebook's top import cell.)

# Pipeline with TfidfVectorizer and a plain multi-class LinearSVC.
# NOTE(review): this rebinds `SVC_pipeline`, replacing the fitted baseline
# pipeline from the earlier cell with an unfitted one (GridSearchCV fits
# clones, not this object) -- confirm that no later cell expects the baseline.
SVC_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2))),
    ('clf', LinearSVC()),
])

# Hyperparameters to search: 3 * 2 * 2 * 3 = 36 combinations, each
# cross-validated with 5 folds.
parameters = {
    'tfidf__max_df': (0.25, 0.5, 0.75),
    'tfidf__min_df': (1, 2),
    'tfidf__sublinear_tf': (True, False),
    'clf__C': (0.1, 1, 10),
}

# Perform grid search with cross-validation on all available cores.
grid_search = GridSearchCV(SVC_pipeline, parameters, cv=5, n_jobs=-1)
grid_search.fit(train_x, train_y)

# Report the best parameter combination.
best_params = grid_search.best_params_
print("Best parameters found:")
print(best_params)

# With the default `refit=True`, GridSearchCV has already refitted the best
# configuration on the whole training set, so `best_estimator_` is ready to
# use; fitting it again would only repeat an expensive step.
best_SVC_pipeline = grid_search.best_estimator_

# Evaluate the tuned model on the held-out test set.
test_predictions = best_SVC_pipeline.predict(test_x)
print('Test Accuracy : %.3f' % accuracy_score(test_y, test_predictions))
print("Classification report (test set):")
print(classification_report(test_y, test_predictions))

