In [1]:
import requests
import re
import pandas as pd
import random
from string import ascii_lowercase
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

In [2]:
# Download NLTK stop words
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Function to clean text: remove punctuation, stop words, and non-textual elements
def clean_text(text):
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Split into words
    words = text.split()
    # Remove stop words and non-textual elements
    cleaned_words = [word for word in words if word.lower() not in stop_words]
    # strip underscores
    cleaned_words = [word.strip('_') for word in cleaned_words]
    # remove numbers
    cleaned_words = [word for word in cleaned_words if not word.isnumeric()]
    return ' '.join(cleaned_words)

def find_start_end(text):
    # Find the start and end of the main text
    start_pattern = r"\*\*\* START OF THIS PROJECT GUTENBERG EBOOK .+ \*\*\*"
    end_pattern = r"\*\*\* END OF THIS PROJECT GUTENBERG EBOOK .+ \*\*\*"

    start_match = re.search(start_pattern, text)
    end_match = re.search(end_pattern, text)

    start_idx = start_match.end() if start_match else 0
    end_idx = end_match.start() if end_match else len(text)

    return text[start_idx:end_idx]

def process_book(url, label):
    # Download the book text from the URL
    response = requests.get(url)
    response.encoding = 'utf-8'
    text = response.text

    # Extract the main text between start and end markers
    main_text = find_start_end(text)

    # Clean the main text
    cleaned_text = clean_text(main_text)

    # Extract words from the cleaned text
    words = cleaned_text.split()

    # Split words into partitions of 100 and take 200 random partitions
    partitions = [words[i:i + 100] for i in range(0, len(words), 100)]
    random_partitions = random.sample(partitions, min(200, len(partitions)))

    return [(label, ' '.join(partition)) for partition in random_partitions]

# Updated list of Gutenberg book URLs (Same as before, no change needed here)
book_urls = [
    'https://www.gutenberg.org/files/1342/1342-0.txt',  # Pride and Prejudice by Jane Austen
    'https://www.gutenberg.org/files/768/768-0.txt',    # Wuthering Heights by Emily Brontë
    'https://www.gutenberg.org/files/1260/1260-0.txt',  # Jane Eyre by Charlotte Brontë
    'https://www.gutenberg.org/files/1400/1400-0.txt',  # Great Expectations by Charles Dickens
    'https://www.gutenberg.org/files/145/145-0.txt',    # Middlemarch by George Eliot
    'https://www.gutenberg.org/files/541/541.txt'       # The Age of Innocence by Edith Wharton
]

book_authors = ["Jane Austen", "Emily Bronte", "Charlotte Bronte", "Charles Dickens",  "George Eliot", "Edith Wharton"]


# Process all books
all_partitions = []

for url, label in zip(book_urls, book_authors):
    book_partitions = process_book(url, label)
    all_partitions.extend(book_partitions)

# Convert to DataFrame
partition_df = pd.DataFrame(all_partitions, columns=['Label', 'Words']).sample(frac=1)

# Serialize DataFrame to CSV
# partition_df.to_csv('book_partitions_cleaned.csv', index=False)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
partition_df

Unnamed: 0,Label,Words
744,Charles Dickens,True simple fruits earth neednt bring William ...
781,Charles Dickens,answered returned Transport said man held line...
1015,Edith Wharton,Thanks wish might happen oftener said visitor ...
1163,Edith Wharton,Mr van der Luyden seemed overwhelmed announcem...
948,George Eliot,want Lydgate felt certain would played much le...
...,...,...
864,George Eliot,habitual selfcherishing anxiety fear harassed ...
1099,Edith Wharton,May disliked move except valid reasons taking ...
623,Charles Dickens,Miss Havisham account sure address tells wants...
729,Charles Dickens,rest except fell asleep chair wholly absorbed ...


In [4]:
partition_df['Label'].value_counts()

Charles Dickens     200
Edith Wharton       200
George Eliot        200
Emily Bronte        200
Charlotte Bronte    200
Jane Austen         200
Name: Label, dtype: int64

In [5]:
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
)
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multiclass import OneVsOneClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
import numpy as np

In [6]:
model_objs = [
    OneVsRestClassifier(SVC(kernel="linear")),
    OneVsRestClassifier(SVC(kernel="rbf")),
    OneVsRestClassifier(SVC(kernel="poly")),
    RandomForestClassifier(),
    OneVsRestClassifier(GaussianNB()),
    KNeighborsClassifier(),
    OneVsRestClassifier(SGDClassifier()),
    DecisionTreeClassifier(),
    OneVsRestClassifier(AdaBoostClassifier()),
    XGBClassifier(random_state=42)
    ]


model_names = [
    "Linear SVC",
    "Gaussian SVC",
    "Polynomial SVC",
    "RandomForestClassifier",
    "Naive Bayes",
    "KNeighborsClassifier",
    "SGDClassifier",
    "DecisionTreeClassifier",
    "AdaBoostClassifier",
    "XGBClassifier",
]

In [7]:
from sklearn.model_selection import KFold
from sklearn.preprocessing import OneHotEncoder
import warnings
warnings.filterwarnings('ignore')

kf = KFold(n_splits=10, shuffle=True, random_state=42)
onehot_encoder = OneHotEncoder(sparse_output=False, drop='first')

for m_obj, m_name in zip(model_objs, model_names):
  acc_sum = 0
  precision_sum = 0
  recall_sum = 0
  f1_sum = 0
  roc_auc_sum = 0
  # train_set,validation_set,test_set = [],[],[]
  for train_index,test_index in kf.split(partition_df[:600]):
    train_set, test_set = partition_df.iloc[train_index], partition_df.iloc[test_index]

    X_train = train_set.drop(['Label'], axis=1)
    y_train = train_set['Label']
    X_test = test_set.drop(['Label'], axis=1)
    y_test = test_set['Label']

    # print(y_train)
    # Append the sets to their respective lists
    # train_set.append(train_subset)
    # validation_set.append(validation_set)
    # test_set.append(test_set)

    min_n, max_n = 1, 2  # Example: Try n-grams from 1 to 2
    ngram_range_values = [(i, j) for i in range(min_n, max_n + 1) for j in range(i, max_n + 1)]

    # print("Unique categories in training set:", set(train_subset['Label']))
    # print("Unique categories in validation set:", set(validation_set['Label']))

    # One-hot encoding for the label
    y_train_onehot = onehot_encoder.fit_transform(train_set[['Label']])
    y_test_onehot = onehot_encoder.transform(test_set[['Label']])
    # y_train_onehot = y_test_onehot[:len(train_set)]
    # y_test_onehot = y_test_onehot[len(train_set):]

    y_train_df = pd.DataFrame(y_train_onehot, columns=onehot_encoder.get_feature_names_out(['Label']))
    y_test_df = pd.DataFrame(y_test_onehot, columns=onehot_encoder.get_feature_names_out(['Label']))

    max_acc,max_precision,max_recall,max_f1,max_roc_auc = [-1,-1,-1,-1,-1]

    for ngrams in ngram_range_values:
      vec = CountVectorizer(ngram_range = ngrams)
      # print(X_train)
      X_train_vec = vec.fit_transform(X_train['Words'])
      X_train_vec = pd.DataFrame(X_train_vec.toarray(), columns=vec.get_feature_names_out())
      X_test_vec = vec.transform(X_test['Words'])
      X_test_vec = pd.DataFrame(X_test_vec.toarray(), columns=vec.get_feature_names_out())
      # print(X_train)
      # print(y_train.shape)
      # clf = Pipeline([
      #     ('vectorizer', CountVectorizer(ngram_range = ngrams)),        #using the ngram_range parameter
      #     (m_name, m_obj)
      # ])
      m_obj.fit(X_train_vec,y_train_df)
      y_pred = m_obj.predict(X_test_vec)
      acc = accuracy_score(y_pred, y_test_df)
      precision = precision_score(y_pred, y_test_df, average='macro')
      recall = recall_score(y_pred, y_test_df, average='macro')
      f1 = f1_score(y_test_df, y_pred, average='macro')
      #unique_classes = np.unique(y_valid_df)
      # print(unique_classes)
      # if len(np.unique(y_valid_df)) > 1:
      #oc_auc = roc_auc_score(y_valid_df.values, y_pred_val, average='macro')
      # else:
         # Handle the case when there's only one class
      # roc_auc = 0.5  # You can set it to 0.5 or any other suitable value

      roc_auc = roc_auc_score(y_test_df, y_pred, average='macro')

      max_acc = max(max_acc,acc)
      max_precision = max(max_precision,precision)
      max_recall = max(max_recall,recall)
      max_f1 = max(max_f1,f1)
      max_roc_auc = max(max_roc_auc,roc_auc)

    acc_sum+=max_acc
    precision_sum += max_precision
    recall_sum += max_recall
    f1_sum += max_f1
    roc_auc_sum += max_roc_auc

  print("MODEL: {}".format(m_name))
  print("Average Accuracy: {}".format(acc_sum/10))
  print("Average Precision: {}".format(precision_sum/10))
  print("Average Recall: {}".format(recall_sum/10))
  print("Average F1: {}".format(f1_sum/10))
  print("Average ROC AUC: {}".format(roc_auc_sum/10))
  print()

MODEL: Linear SVC
Average Accuracy: 0.7816666666666665
Average Precision: 0.7376202131202131
Average Recall: 0.9821580086580086
Average F1: 0.8194349175372981
Average ROC AUC: 0.8658295475991707

MODEL: Gaussian SVC
Average Accuracy: 0.48166666666666674
Average Precision: 0.3713664113664114
Average Recall: 0.7775
Average F1: 0.4830243470353849
Average ROC AUC: 0.6854791240505527

MODEL: Polynomial SVC
Average Accuracy: 0.16833333333333333
Average Precision: 0.006871794871794871
Average Recall: 0.05
Average F1: 0.011904761904761906
Average ROC AUC: 0.5030435897435896

MODEL: RandomForestClassifier
Average Accuracy: 0.6333333333333333
Average Precision: 0.5618626373626373
Average Recall: 0.9983333333333334
Average F1: 0.6856348466821612
Average ROC AUC: 0.780713927376971

MODEL: Naive Bayes
Average Accuracy: 0.5916666666666666
Average Precision: 0.533916860916861
Average Recall: 0.9174365079365078
Average F1: 0.645201044255695
Average ROC AUC: 0.760032152866409

MODEL: KNeighborsClassifi