In [25]:
import requests
import re
import pandas as pd
import random
from string import ascii_lowercase
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

In [26]:
# Download NLTK stop words
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Function to clean text: remove punctuation, stop words, and non-textual elements
def clean_text(text):
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Split into words
    words = text.split()
    # Remove stop words and non-textual elements
    cleaned_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(cleaned_words)

def find_start_end(text):
    # Find the start and end of the main text
    start_pattern = r"\*\*\* START OF THIS PROJECT GUTENBERG EBOOK .+ \*\*\*"
    end_pattern = r"\*\*\* END OF THIS PROJECT GUTENBERG EBOOK .+ \*\*\*"

    start_match = re.search(start_pattern, text)
    end_match = re.search(end_pattern, text)

    start_idx = start_match.end() if start_match else 0
    end_idx = end_match.start() if end_match else len(text)

    return text[start_idx:end_idx]

def process_book(url, label):
    # Download the book text from the URL
    response = requests.get(url)
    response.encoding = 'utf-8'
    text = response.text

    # Extract the main text between start and end markers
    main_text = find_start_end(text)

    # Clean the main text
    cleaned_text = clean_text(main_text)

    # Extract words from the cleaned text
    words = cleaned_text.split()

    # Split words into partitions of 100 and take 200 random partitions
    partitions = [words[i:i + 100] for i in range(0, len(words), 100)]
    random_partitions = random.sample(partitions, min(200, len(partitions)))

    return [(label, ' '.join(partition)) for partition in random_partitions]

# Updated list of Gutenberg book URLs (Same as before, no change needed here)
book_urls = [
    'https://www.gutenberg.org/files/1342/1342-0.txt',  # Pride and Prejudice by Jane Austen
    'https://www.gutenberg.org/files/768/768-0.txt',    # Wuthering Heights by Emily Brontë
    'https://www.gutenberg.org/files/1260/1260-0.txt',  # Jane Eyre by Charlotte Brontë
    'https://www.gutenberg.org/files/1400/1400-0.txt',  # Great Expectations by Charles Dickens
    'https://www.gutenberg.org/files/145/145-0.txt',    # Middlemarch by George Eliot
    'https://www.gutenberg.org/files/541/541.txt'       # The Age of Innocence by Edith Wharton
]

# Generate alphabetic labels based on the number of URLs
labels = list(ascii_lowercase)[:len(book_urls)]

# Process all books
all_partitions = []

for url, label in zip(book_urls, labels):
    book_partitions = process_book(url, label)
    all_partitions.extend(book_partitions)

# Convert to DataFrame
df = pd.DataFrame(all_partitions, columns=['Label', 'Words'])

# Splitting the DataFrame into training (60%), validation (20%), and testing (20%) sets
df_train, df_temp = train_test_split(df, test_size=0.4, random_state=42)  # 60% for training, 40% for temp
df_validation, df_test = train_test_split(df_temp, test_size=0.5, random_state=42)  # Split temp equally into validation and testing

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [28]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

# Combine training and validation datasets for cross-validation
df_combined = pd.concat([df_train, df_validation])

# Feature Extraction with Bag of Words for combined dataset
vectorizer = CountVectorizer()
X_combined = vectorizer.fit_transform(df_combined['Words'])
y_combined = LabelEncoder().fit_transform(df_combined['Label'])

# Define models
models = {
    "Naive Bayes": MultinomialNB(),
    "Random Forest": RandomForestClassifier(),
    "SVM": SVC(probability=True),  # SVM with probability estimates
    "k-NN": KNeighborsClassifier(),
    "SGD": SGDClassifier(),
    "XG-Boost": XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}

# Scoring metrics
scoring = {
    'average accuracy': make_scorer(accuracy_score),
    'average precision': make_scorer(precision_score, average='weighted'),
    'average recall': make_scorer(recall_score, average='weighted'),
    'average f1': make_scorer(f1_score, average='weighted'),
    'average roc_auc': make_scorer(roc_auc_score, needs_proba=True, average='weighted', multi_class='ovr')
}

# Perform 10-fold cross-validation and evaluate models
cv_results = {}
kfold = StratifiedKFold(n_splits=10)

for name, model in models.items():
    cv_scores = cross_validate(model, X_combined, y_combined, cv=kfold, scoring=scoring)
    cv_results[name] = cv_scores

# Output cross-validation results
for model, scores in cv_results.items():
    print(f"Cross-validation Results for {model}:")
    for metric in scoring.keys():
        average_score = scores[f'test_{metric}'].mean()
        print(f"{metric}: {average_score}")
    print()

print("------------")
print()

# Feature Extraction for test set
X_test = vectorizer.transform(df_test['Words'])
y_test = LabelEncoder().fit_transform(df_test['Label'])

# Final Evaluation on Test Set
test_results = {}
for name, model in models.items():
    model.fit(X_combined, y_combined)  # Train on the combined dataset
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test) if hasattr(model, "predict_proba") else None
    test_results[name] = {
        'average accuracy': accuracy_score(y_test, y_pred),
        'average precision': precision_score(y_test, y_pred, average='weighted'),
        'average recall': recall_score(y_test, y_pred, average='weighted'),
        'average f1': f1_score(y_test, y_pred, average='weighted'),
        'average roc_auc': roc_auc_score(y_test, y_proba, multi_class='ovr', average='weighted') if y_proba is not None else None
    }

# Output test set results
for model, scores in test_results.items():
    print(f"Test Set Results for {model}:")
    for metric, score in scores.items():
        if score is not None:
            print(f"{metric}: {score}")
        else:
            print(f"{metric}: nan")
    print()

Cross-validation Results for Naive Bayes:
average accuracy: 0.9645833333333332
average precision: 0.9675357165892672
average recall: 0.9645833333333332
average f1: 0.9647937155457205
average roc_auc: 0.988922663900935

Cross-validation Results for Random Forest:
average accuracy: 0.9416666666666667
average precision: 0.949014594986054
average recall: 0.9416666666666667
average f1: 0.9425046529173011
average roc_auc: 0.9887668239177996

Cross-validation Results for SVM:
average accuracy: 0.9135416666666668
average precision: 0.9255930767217533
average recall: 0.9135416666666668
average f1: 0.9151498869528819
average roc_auc: 0.9890234476741157

Cross-validation Results for k-NN:
average accuracy: 0.6979166666666666
average precision: 0.7709070551023229
average recall: 0.6979166666666666
average f1: 0.6843997914835847
average roc_auc: 0.9138894657612517

Cross-validation Results for SGD:
average accuracy: 0.915625
average precision: 0.9208002302409355
average recall: 0.915625
average f1: