In [28]:
import random
import nltk
import pandas as pd
import requests
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multiclass import OneVsOneClassifier

In [29]:
nltk.download('gutenberg')
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

change to lower and remove non-alphabetic character and stopwords

In [30]:
def clear_text(text):

    stop_words = set(stopwords.words('english'))
    words = re.findall(r'\b[a-zA-Z]+\b', text.lower())
    cleaned_words = [word for word in words if word not in stop_words]

    return ' '.join(cleaned_words)

In [31]:
def get_gutenberg_book(url):
    response = requests.get(url)
    return response.text

function for getting random word samples from  each book

In [32]:
def create_samples(book, label, num_samples=200, words_per_sample=100):
    book_text = get_gutenberg_book(book_url)
    clean_text = clear_text(book_text)
    words = nltk.word_tokenize(clean_text)
    partitions = [words[i:i + words_per_sample] for i in range(0, len(words), words_per_sample)]
    num_partitions = min(num_samples, len(partitions))
    random_partitions = random.sample(partitions, num_partitions)
    samples = [{'text': ' '.join(partition), 'label': label} for partition in random_partitions]
    return samples

Book list

In [33]:
book_urls = [
    ('https://www.gutenberg.org/files/158/158-0.txt', 'Jane'),
    ('https://www.gutenberg.org/files/768/768-0.txt', 'Emily'),
    ('https://www.gutenberg.org/files/1260/1260-0.txt', 'Charlotte'),
    ('https://www.gutenberg.org/files/1400/1400-0.txt', 'Dickens'),
    ('https://www.gutenberg.org/files/145/145-0.txt', 'Eliot'),
    ('https://www.gutenberg.org/cache/epub/541/pg541.txt', 'Wharton')
]

all_samples = []
for book_url, label in book_urls:
    samples = create_samples(book_url, label)
    all_samples.extend(samples)

df = pd.DataFrame(all_samples)
df.head()

Unnamed: 0,text,label
0,come dine whenever asked would poor fellow exp...,Jane
1,expressions gratitude always feeling towards m...,Jane
2,forward information substance open hearted mr ...,Jane
3,wrap miss emma need fears sir often heard spea...,Jane
4,make prospect ball completely satisfactory fix...,Jane


In [34]:
df.to_csv('finaldoc.csv', index=False)
#from google.colab import files
#files.download('finaldoc.csv')

Model training

In [35]:
model = [
    OneVsRestClassifier(SVC(kernel="linear")),
    OneVsRestClassifier(SVC(kernel="rbf")),
    OneVsRestClassifier(SVC(kernel="poly")),
    RandomForestClassifier(),
    GaussianNB(),
    KNeighborsClassifier(),
    OneVsOneClassifier(SGDClassifier()),
    DecisionTreeClassifier(),
    AdaBoostClassifier(),
    XGBClassifier(num_class=1, random_state=69)
    ]


names = [
    "Linear SVC",
    "Gaussian SVC",
    "Polynomial SVC",
    "RandomForestClassifier",
    "Naive Bayes",
    "KNeighborsClassifier",
    "SGDClassifier",
    "DecisionTreeClassifier",
    "AdaBoostClassifier",
    "XGBClassifier",
]


In [38]:
import numpy as np
from sklearn.model_selection import KFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pandas as pd
import warnings

warnings.filterwarnings('ignore')

num_topics = 100

kf = KFold(n_splits=10, shuffle=True, random_state=69)
onehot_encoder = OneHotEncoder(sparse=False, drop='first')

for m_obj, m_name in zip(model, names):
    acc_sum = 0
    precision_sum = 0
    recall_sum = 0
    f1_sum = 0
    roc_auc_sum = 0

    for train_index, test_index in kf.split(df[:600]):
        train_data = df.iloc[train_index]
        test_data = df.iloc[test_index]
        X_train = train_data['text']
        y_train = train_data['label']
        X_test = test_data['text']
        y_test = test_data['label']

        # Create document-topic distributions using LDA
        vectorizer = CountVectorizer()
        X_train_bow = vectorizer.fit_transform(X_train)
        lda = LatentDirichletAllocation(n_components=num_topics, random_state=69)
        X_train_lda = lda.fit_transform(X_train_bow)

        X_test_bow = vectorizer.transform(X_test)
        X_test_lda = lda.transform(X_test_bow)

        # One-hot encoding for the label
        y_train_onehot = onehot_encoder.fit_transform(train_data[['label']])
        y_test_onehot = onehot_encoder.transform(test_data[['label']])
        y_train_array = y_train_onehot[:, 1]
        y_test_array = y_test_onehot[:, 1]

        # Model Training and Prediction
        m_obj.fit(X_train_lda, y_train_array)
        y_pred = m_obj.predict(X_test_lda)

        # Compute Evaluation Metrics
        acc = accuracy_score(y_pred, y_test_array)
        precision = precision_score(y_pred, y_test_array, average='macro')
        recall = recall_score(y_pred, y_test_array, average='macro')
        f1 = f1_score(y_pred, y_test_array, average='macro')
        roc_auc = roc_auc_score(y_test_array, y_pred, average='macro')

        acc_sum += acc
        precision_sum += precision
        recall_sum += recall
        f1_sum += f1
        roc_auc_sum += roc_auc

    print("MODEL: {}".format(m_name))
    print("Average Accuracy: {}".format(acc_sum / 10))
    print("Average Precision: {}".format(precision_sum / 10))
    print("Average Recall: {}".format(recall_sum / 10))
    print("Average F1: {}".format(f1_sum / 10))
    print("Average ROC AUC: {}".format(roc_auc_sum / 10))
    print("==========================================")

MODEL: Linear SVC
Average Accuracy: 0.8316666666666667
Average Precision: 0.7600293669179119
Average Recall: 0.8940772743449535
Average F1: 0.7607112896634856
Average ROC AUC: 0.7600293669179119
MODEL: Gaussian SVC
Average Accuracy: 0.6766666666666665
Average Precision: 0.7466711840444551
Average Recall: 0.7805003088128336
Average F1: 0.6516106334696975
Average ROC AUC: 0.7466711840444551
MODEL: Polynomial SVC
Average Accuracy: 0.6833333333333333
Average Precision: 0.5233766233766233
Average Recall: 0.43920634920634927
Average F1: 0.44142813972936024
Average ROC AUC: 0.5233766233766233
MODEL: RandomForestClassifier
Average Accuracy: 0.6833333333333333
Average Precision: 0.6654898302795986
Average Recall: 0.7267125826489138
Average F1: 0.6110070321003376
Average ROC AUC: 0.6654898302795986
MODEL: Naive Bayes
Average Accuracy: 0.7633333333333333
Average Precision: 0.6631628124815454
Average Recall: 0.8067040665787589
Average F1: 0.6705416410027103
Average ROC AUC: 0.6631628124815454
MODE