In [1]:
import re
import pandas as pd
from urllib.request import urlopen
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import random
from sklearn.utils import shuffle

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
def find_start_end(text):
    # Find the start and end of the main text
    start_pattern = r"\*\*\* START OF THIS PROJECT GUTENBERG EBOOK .+ \*\*\*"
    end_pattern = r"\*\*\* END OF THIS PROJECT GUTENBERG EBOOK .+ \*\*\*"

    start_match = re.search(start_pattern, text)
    end_match = re.search(end_pattern, text)

    start_idx = start_match.end() if start_match else 0
    end_idx = end_match.start() if end_match else len(text)

    return text[start_idx:end_idx]

In [3]:
def split_into_fixed_partitions(text, partitions=200, words_per_partition=100):
    words = text.split()

    # Ensure that there are enough words for the specified partitions and words per partition
    # total_words_needed = partitions * words_per_partition

    # # Create the partitions
    # result = [words[i:i+words_per_partition] for i in range(0, total_words_needed, words_per_partition)]
    # return result
    partitions = [words[i:i + 100] for i in range(0, len(words), 100)]
    random_partitions = random.sample(partitions, min(200, len(partitions)))

    return random_partitions

# Example usage:
# partitions = split_into_fixed_partitions(cleaned_string, partitions=200, words_per_partition=100)
# partitions = [{"Samples" : ' '.join(part), "Book_name" : book_name} for part in partitions]

In [4]:
def process_book(url):
  raw = urlopen(url).read()

  string_txt = str(raw)
  author_name_match = re.search(r'Author:?[\w\s;:,\'\"]+',string_txt)
  tokens = word_tokenize(string_txt)
  author_name = author_name_match.group(0)[7:len(author_name_match.group(0))]


  stwrd = stopwords.words('english')
  cleaned_string = [word for word in tokens if word.lower() not in stwrd and word.isalpha()]
  cleaned_string = ' '.join(cleaned_string)
  #removing special characters and escape sequences
  cleaned_string = re.sub(r'\\[^,:;]+|[^\x20-\x7E]+','',cleaned_string)
  new_string = find_start_end(cleaned_string)
  new_string
  partitions = split_into_fixed_partitions(cleaned_string, partitions=200, words_per_partition=100)
  partitions = [{"Samples" : ' '.join(part), "Label" : author_name} for part in partitions]
  return partitions

In [5]:
book_urls = [
    'https://www.gutenberg.org/cache/epub/1342/pg1342.txt',  # Pride and Prejudice by Jane Austen
    'https://www.gutenberg.org/cache/epub/768/pg768.txt',    # Wuthering Heights by Emily Brontë
    'https://www.gutenberg.org/cache/epub/1260/pg1260.txt',    # Jane Eyre: An Autobiography by Charlotte Brontë
    'https://www.gutenberg.org/cache/epub/1400/pg1400.txt',    # Great Expectations Charles Dickens
    'https://www.gutenberg.org/cache/epub/145/pg145.txt',      # Middlemarch by George Eliot
    'https://www.gutenberg.org/cache/epub/541/pg541.txt'       # The Age of Innocence by Edith Wharton
]

all_partitions = []

for url in book_urls:
    book_partitions = process_book(url)
    all_partitions.extend(book_partitions)

In [6]:
df = pd.DataFrame(all_partitions)
df = shuffle(df, random_state=42)
df

Unnamed: 0,Samples,Label
0,well every mark regard stay Hertfordshire fair...,Jane Austen
1,learned still remained bent steps principal wo...,Jane Austen
2,think call Charles engaging heart circumstance...,Jane Austen
3,looked Bingley thought quite beautiful danced ...,Jane Austen
4,prepared meet folly conceit every room used fr...,Jane Austen
...,...,...
1195,telegram Newland luncheon rose murmur readines...,Edith Wharton
1196,Love one expected either ladies return wedding...,Edith Wharton
1197,candelabra piano stood large basket orchids de...,Edith Wharton
1198,tulle tucker fastened gardenia dropped eyes im...,Edith Wharton


In [7]:
df['Label'].value_counts()

 Jane Austen        200
 Emily Bront        200
 Charlotte Bront    200
 Charles Dickens    200
 George Eliot       200
 Edith Wharton      200
Name: Label, dtype: int64

In [8]:
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
# from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
)
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multiclass import OneVsOneClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
import numpy as np

In [9]:
model_objs = [
    OneVsRestClassifier(SVC(kernel="linear")),
    OneVsRestClassifier(SVC(kernel="rbf")),
    OneVsRestClassifier(SVC(kernel="poly")),
    RandomForestClassifier(),
    OneVsRestClassifier(MultinomialNB()),
    KNeighborsClassifier(),
    OneVsRestClassifier(SGDClassifier()), #was onevsone now one vs rest
    DecisionTreeClassifier(),
    AdaBoostClassifier(),
    XGBClassifier(random_state=69)
    ]


model_names = [
    "Linear SVC",
    "Gaussian SVC",
    "Polynomial SVC",
    "RandomForestClassifier",
    "Naive Bayes",
    "KNeighborsClassifier",
    "SGDClassifier",
    "DecisionTreeClassifier",
    "AdaBoostClassifier",
    "XGBClassifier",
]

In [13]:
from sklearn.model_selection import KFold
from sklearn.preprocessing import OneHotEncoder
import warnings
warnings.filterwarnings('ignore')

kf = KFold(n_splits=10, shuffle=True, random_state=42)
onehot_encoder = OneHotEncoder(sparse_output=False, drop='first')

for m_obj, m_name in zip(model_objs, model_names):
  acc_sum = 0
  precision_sum = 0
  recall_sum = 0
  f1_sum = 0
  roc_auc_sum = 0
  # train_set,validation_set,test_set = [],[],[]
  for train_index,test_index in kf.split(df[:600]):
    train_set, test_set = df.iloc[train_index], df.iloc[test_index]

    X_train = train_set.drop(['Label'], axis=1)
    y_train = train_set['Label']
    X_test = test_set.drop(['Label'], axis=1)
    y_test = test_set['Label']

    # print(y_train)
    # Append the sets to their respective lists
    # train_set.append(train_subset)
    # validation_set.append(validation_set)
    # test_set.append(test_set)

    min_n, max_n = 1, 2  # Example: Try n-grams from 1 to 2
    ngram_range_values = [(i, j) for i in range(min_n, max_n + 1) for j in range(i, max_n + 1)]

    # print("Unique categories in training set:", set(train_subset['Label']))
    # print("Unique categories in validation set:", set(validation_set['Label']))

    # One-hot encoding for the label
    y_train_onehot = onehot_encoder.fit_transform(train_set[['Label']])
    y_test_onehot = onehot_encoder.transform(test_set[['Label']])
    # y_train_onehot = y_test_onehot[:len(train_set)]
    # y_test_onehot = y_test_onehot[len(train_set):]

    y_train_df = pd.DataFrame(y_train_onehot, columns=onehot_encoder.get_feature_names_out(['Label']))
    y_test_df = pd.DataFrame(y_test_onehot, columns=onehot_encoder.get_feature_names_out(['Label']))

    max_acc,max_precision,max_recall,max_f1,max_roc_auc = [-1,-1,-1,-1,-1]

    for ngrams in ngram_range_values:
      vec = CountVectorizer(ngram_range = ngrams)
      # print(X_train)
      X_train_vec = vec.fit_transform(X_train['Samples'])
      X_train_vec = pd.DataFrame(X_train_vec.toarray(), columns=vec.get_feature_names_out())
      X_test_vec = vec.transform(X_test['Samples'])
      X_test_vec = pd.DataFrame(X_test_vec.toarray(), columns=vec.get_feature_names_out())
      # print(X_train)
      # print(y_train.shape)
      # clf = Pipeline([
      #     ('vectorizer', CountVectorizer(ngram_range = ngrams)),        #using the ngram_range parameter
      #     (m_name, m_obj)
      # ])
      m_obj.fit(X_train_vec,y_train_df)
      y_pred = m_obj.predict(X_test_vec)
      acc = accuracy_score(y_pred, y_test_df)
      precision = precision_score(y_pred, y_test_df, average='macro')
      recall = recall_score(y_pred, y_test_df, average='macro')
      f1 = f1_score(y_test_df, y_pred, average='macro')
      #unique_classes = np.unique(y_valid_df)
      # print(unique_classes)
      # if len(np.unique(y_valid_df)) > 1:
      #oc_auc = roc_auc_score(y_valid_df.values, y_pred_val, average='macro')
      # else:
         # Handle the case when there's only one class
      # roc_auc = 0.5  # You can set it to 0.5 or any other suitable value

      roc_auc = roc_auc_score(y_test_df, y_pred, average='macro')

      max_acc = max(max_acc,acc)
      max_precision = max(max_precision,precision)
      max_recall = max(max_recall,recall)
      max_f1 = max(max_f1,f1)
      max_roc_auc = max(max_roc_auc,roc_auc)

    acc_sum+=max_acc
    precision_sum += max_precision
    recall_sum += max_recall
    f1_sum += max_f1
    roc_auc_sum += max_roc_auc

  print("MODEL: {}".format(m_name))
  print("Average Accuracy: {}".format(acc_sum/10))
  print("Average Precision: {}".format(precision_sum/10))
  print("Average Recall: {}".format(recall_sum/10))
  print("Average F1: {}".format(f1_sum/10))
  print("Average ROC AUC: {}".format(roc_auc_sum/10))
  print()



NameError: name 'roc_auc' is not defined