In [87]:
import nltk
from nltk.corpus import gutenberg
from nltk.corpus import stopwords
import string
import random
import requests
import pandas as pd
import re
import numpy as np

In [88]:
nltk.download('gutenberg')
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

change to lower and remove non-alphabetic character and stopwords

In [89]:
def clear_text(text):

    stop_words = set(stopwords.words('english'))
    words = re.findall(r'\b[a-zA-Z]+\b', text.lower())
    cleaned_words = [word for word in words if word not in stop_words]

    return ' '.join(cleaned_words)

function for getting random word samples from  each book

In [90]:
def get_gutenberg_book(url):
    response = requests.get(url)
    return response.text

In [91]:
def create_samples(book, label, num_samples=200, words_per_sample=100):
    book_text = get_gutenberg_book(book_url)
    clean_text = clear_text(book_text)
    words = nltk.word_tokenize(clean_text)
    partitions = [words[i:i + words_per_sample] for i in range(0, len(words), words_per_sample)]
    num_partitions = min(num_samples, len(partitions))
    random_partitions = random.sample(partitions, num_partitions)
    samples = [{'text': ' '.join(partition), 'label': label} for partition in random_partitions]
    return samples

Book list

In [92]:
book_urls = [
    ('https://www.gutenberg.org/files/158/158-0.txt', 'Austen'),
    ('https://www.gutenberg.org/files/768/768-0.txt', 'Bronte'),
    ('https://www.gutenberg.org/files/1260/1260-0.txt', 'Bronte'),
    ('https://www.gutenberg.org/files/1400/1400-0.txt', 'Dickens'),
    ('https://www.gutenberg.org/files/145/145-0.txt', 'Eliot'),
    ('https://www.gutenberg.org/cache/epub/541/pg541.txt', 'Wharton')
]

all_samples = []
for book_url, label in book_urls:
    samples = create_samples(book_url, label)
    all_samples.extend(samples)

df = pd.DataFrame(all_samples)
df.head()

Unnamed: 0,text,label
0,things assure suspected yet manners evidently ...,Austen
1,better visit help emma watched fluctuations sp...,Austen
2,colonel campbell moment emma could taken oath ...,Austen
3,piece mention coles sure followed mr elton int...,Austen
4,john lawyer inconvenient poor isabella sadly t...,Austen


In [105]:
#from google.colab import files
df.to_csv('final.csv', index=False)
#files.download('final.csv')

In [94]:
from google.colab import drive
drive.mount('/content/gdrive')

glove_file_path= '/content/gdrive/My Drive/Colab Notebooks/glove.6B/glove.6B.50d.txt'

# Function to load GloVe embeddings into a dictionary
def load_glove_embeddings(file_path):
    embeddings = {}
    with open(file_path, encoding='utf-8') as file:
        for line in file:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings[word] = vector
    return embeddings

# Load GloVe embeddings into the dictionary
glove_embeddings = load_glove_embeddings(glove_file_path)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [95]:
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
)
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multiclass import OneVsOneClassifier
from sklearn.feature_extraction.text import CountVectorizer

In [96]:
model = [
    OneVsRestClassifier(SVC(kernel="linear")),
    OneVsRestClassifier(SVC(kernel="rbf")),
    OneVsRestClassifier(SVC(kernel="poly")),
    RandomForestClassifier(),
    GaussianNB(),
    KNeighborsClassifier(),
    OneVsOneClassifier(SGDClassifier()),
    DecisionTreeClassifier(),
    AdaBoostClassifier(),
    XGBClassifier(random_state=69)
    ]


names = [
    "Linear SVC",
    "Gaussian SVC",
    "Polynomial SVC",
    "RandomForestClassifier",
    "Naive Bayes",
    "KNeighborsClassifier",
    "SGDClassifier",
    "DecisionTreeClassifier",
    "AdaBoostClassifier",
    "XGBClassifier",
]


In [97]:
import numpy as np
from sklearn.model_selection import KFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import warnings
warnings.filterwarnings('ignore')

kf = KFold(n_splits=10, shuffle=True, random_state=69)
onehot_encoder = OneHotEncoder(sparse=False, drop='first')

for m_obj, m_name in zip(model, names):
    acc_sum = 0
    precision_sum = 0
    recall_sum = 0
    f1_sum = 0
    roc_auc_sum = 0

    for train_index, test_index in kf.split(df[:600]):
        train_data = df.iloc[train_index]
        test_data = df.iloc[test_index]
        X_train = train_data['text']
        y_train = train_data['label']
        X_test = test_data['text']
        y_test = test_data['label']

        # Create document embeddings using GloVe
        X_train_glove = np.array([np.mean([glove_embeddings.get(word, np.zeros(50)) for word in sentence.split()], axis=0) for sentence in X_train])
        X_test_glove = np.array([np.mean([glove_embeddings.get(word, np.zeros(50)) for word in sentence.split()], axis=0) for sentence in X_test])

        # One-hot encoding for the label
        y_train_onehot = onehot_encoder.fit_transform(train_data[['label']])
        y_test_onehot = onehot_encoder.transform(test_data[['label']])
        y_train_df = pd.DataFrame(y_train_onehot, columns=onehot_encoder.get_feature_names_out(['label']))
        y_test_df = pd.DataFrame(y_test_onehot, columns=onehot_encoder.get_feature_names_out(['label']))

        #Model Training and Prediction
        m_obj.fit(X_train_glove, y_train_df)
        y_pred = m_obj.predict(X_test_glove)

        #Compute Evaluation Metrics
        acc = accuracy_score(y_pred, y_test_df)
        precision = precision_score(y_pred, y_test_df, average='macro')
        recall = recall_score(y_pred, y_test_df, average='macro')
        f1 = f1_score(y_pred, y_test_df, average='macro')
        roc_auc = roc_auc_score(y_test_df, y_pred, average='macro')

        acc_sum += acc
        precision_sum += precision
        recall_sum += recall
        f1_sum += f1
        roc_auc_sum += roc_auc

    print("MODEL: {}".format(m_name))
    print("Average Accuracy: {}".format(acc_sum / 10))
    print("Average Precision: {}".format(precision_sum / 10))
    print("Average Recall: {}".format(recall_sum / 10))
    print("Average F1: {}".format(f1_sum / 10))
    print("Average ROC AUC: {}".format(roc_auc_sum / 10))
    print("==========================================")


MODEL: Linear SVC
Average Accuracy: 0.8916666666666666
Average Precision: 0.8777294513466949
Average Recall: 0.8789110254650432
Average F1: 0.8768344031194317
Average ROC AUC: 0.8777294513466949
MODEL: Gaussian SVC
Average Accuracy: 0.8916666666666666
Average Precision: 0.8731645891642936
Average Recall: 0.8811870677392225
Average F1: 0.875665379299767
Average ROC AUC: 0.8731645891642934
MODEL: Polynomial SVC
Average Accuracy: 0.885
Average Precision: 0.8643075275532539
Average Recall: 0.8752210480827397
Average F1: 0.8678910653694422
Average ROC AUC: 0.8643075275532539
MODEL: RandomForestClassifier
Average Accuracy: 0.8766666666666666
Average Precision: 0.8487060419505532
Average Recall: 0.8677358582960197
Average F1: 0.8560695443071079
Average ROC AUC: 0.8487060419505532
MODEL: Naive Bayes
Average Accuracy: 0.8433333333333332
Average Precision: 0.8555691204162139
Average Recall: 0.8253526135154429
Average F1: 0.831984683017604
Average ROC AUC: 0.8555691204162137
MODEL: KNeighborsClas