In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
data = pd.read_csv("IMDB Dataset.csv")

X_train, X_test, y_train, y_test = train_test_split(data['review'], data['sentiment'], test_size=0.2, random_state=42)

In [None]:
vectorizer = CountVectorizer()
X_train_counts = vectorizer.fit_transform(X_train)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

# Обучение модели логистической регрессии
clf = LogisticRegression(max_iter=100)
clf.fit(X_train_tfidf, y_train)

# Преобразование тестовых данных в векторы TF-IDF
X_test_counts = vectorizer.transform(X_test)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

# Предсказание меток для тестовых данных
y_pred = clf.predict(X_test_tfidf)

# Оценка точности модели
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Вывод отчета о классификации и матрицы ошибок
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.90
              precision    recall  f1-score   support

    negative       0.91      0.89      0.90      4961
    positive       0.89      0.91      0.90      5039

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000

[[4403  558]
 [ 440 4599]]


In [None]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

data = pd.read_csv("IMDB Dataset.csv")

data['tokens'] = data['review'].apply(word_tokenize)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
stop_words = set(stopwords.words('english'))

lemmatizer = WordNetLemmatizer()

def preprocess(tokens):
    return [lemmatizer.lemmatize(word.lower()) for word in tokens if word.lower() not in stop_words]

data['lemma_no_stop'] = data['tokens'].apply(preprocess)

In [None]:
data['lemma_no_stop'] = data['lemma_no_stop'].apply(lambda tokens: ' '.join(tokens))

In [None]:
X_train, X_test, y_train, y_test = train_test_split(data['lemma_no_stop'], data['sentiment'], test_size=0.2, random_state=42)

# TF-IDF

In [None]:
from sklearn.ensemble import RandomForestClassifier

vectorizer = CountVectorizer()
X_train_counts = vectorizer.fit_transform(X_train)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

# Обучение модели логистической регрессии
clf = LogisticRegression(max_iter=100)
clf.fit(X_train_tfidf, y_train)

rfc = RandomForestClassifier(max_depth=10, random_state=42)
rfc.fit(X_train_tfidf, y_train)

In [None]:
X_test_counts = vectorizer.transform(X_test)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

# Предсказание меток для тестовых данных
y_pred = clf.predict(X_test_tfidf)

# Оценка точности модели
accuracy_clf_tfidf = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy_clf_tfidf:.2f}")

# Вывод отчета о классификации и матрицы ошибок
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.90
              precision    recall  f1-score   support

    negative       0.91      0.88      0.89      4961
    positive       0.88      0.91      0.90      5039

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000

[[4359  602]
 [ 440 4599]]


In [None]:
X_test_counts = vectorizer.transform(X_test)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

# Предсказание меток для тестовых данных
y_pred = rfc.predict(X_test_tfidf)

# Оценка точности модели
accuracy_rfc_tfidf = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy_rfc_tfidf:.2f}")

# Вывод отчета о классификации и матрицы ошибок
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.83
              precision    recall  f1-score   support

    negative       0.85      0.80      0.82      4961
    positive       0.81      0.86      0.84      5039

    accuracy                           0.83     10000
   macro avg       0.83      0.83      0.83     10000
weighted avg       0.83      0.83      0.83     10000

[[3976  985]
 [ 704 4335]]


# Word2Vec

In [None]:
from gensim.models import Word2Vec
import numpy as np

In [None]:
model = Word2Vec(data['lemma_no_stop'].apply(lambda x: x.split()), vector_size=100, window=5, min_count=1)

def get_vector(sentence):
    words = sentence.split()
    vectors = [model.wv[word] for word in words if word in model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(100)

X_train_w2v = np.array([get_vector(sentence) for sentence in X_train])
X_test_w2v = np.array([get_vector(sentence) for sentence in X_test])


In [None]:
from sklearn.ensemble import RandomForestClassifier

clf = LogisticRegression(max_iter=100)
clf.fit(X_train_w2v, y_train)

rfc = RandomForestClassifier(max_depth=10, random_state=42)
rfc.fit(X_train_w2v, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
y_pred = clf.predict(X_test_w2v)

# Оценка точности модели
accuracy_clf_w2v = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy_clf_w2v:.2f}")

# Вывод отчета о классификации и матрицы ошибок
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.85
              precision    recall  f1-score   support

    negative       0.86      0.85      0.85      4961
    positive       0.85      0.86      0.86      5039

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000

[[4202  759]
 [ 697 4342]]


In [None]:
y_pred = rfc.predict(X_test_w2v)

# Оценка точности модели
accuracy_rfc_w2v = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy_clf_w2v:.2f}")

# Вывод отчета о классификации и матрицы ошибок
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.85
              precision    recall  f1-score   support

    negative       0.80      0.79      0.79      4961
    positive       0.79      0.81      0.80      5039

    accuracy                           0.80     10000
   macro avg       0.80      0.80      0.80     10000
weighted avg       0.80      0.80      0.80     10000

[[3910 1051]
 [ 980 4059]]


# Bag of Words

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)

In [None]:
clf = LogisticRegression(max_iter=100)
clf.fit(X_train_bow, y_train)

rfc = RandomForestClassifier(max_depth=10, random_state=42)
rfc.fit(X_train_bow, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
y_pred = clf.predict(X_test_bow)

# Оценка точности модели
accuracy_clf_bow = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy_clf_bow:.2f}")

# Вывод отчета о классификации и матрицы ошибок
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.89
              precision    recall  f1-score   support

    negative       0.89      0.88      0.88      4961
    positive       0.88      0.89      0.89      5039

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000

[[4352  609]
 [ 538 4501]]


In [None]:
y_pred = rfc.predict(X_test_bow)

# Оценка точности модели
accuracy_rfc_bow = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy_rfc_bow:.2f}")

# Вывод отчета о классификации и матрицы ошибок
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.84
              precision    recall  f1-score   support

    negative       0.85      0.82      0.83      4961
    positive       0.83      0.86      0.84      5039

    accuracy                           0.84     10000
   macro avg       0.84      0.84      0.84     10000
weighted avg       0.84      0.84      0.84     10000

[[4046  915]
 [ 701 4338]]


# TF-IDF с n-граммами

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2))
X_train_tfidf_bi = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf_bi = tfidf_vectorizer.transform(X_test)

In [None]:
clf = LogisticRegression(max_iter=100)
clf.fit(X_train_tfidf_bi, y_train)

rfc = RandomForestClassifier(max_depth=10, random_state=42)
rfc.fit(X_train_tfidf_bi, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
y_pred = clf.predict(X_test_tfidf_bi)

# Оценка точности модели
accuracy_clf_bi = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy_clf_bi:.2f}")

# Вывод отчета о классификации и матрицы ошибок
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


y_pred = rfc.predict(X_test_tfidf_bi)

# Оценка точности модели
accuracy_rfc_bi = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy_rfc_bi:.2f}")

# Вывод отчета о классификации и матрицы ошибок
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.89
              precision    recall  f1-score   support

    negative       0.91      0.87      0.89      4961
    positive       0.88      0.91      0.90      5039

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000

[[4329  632]
 [ 444 4595]]
Accuracy: 0.82
              precision    recall  f1-score   support

    negative       0.83      0.80      0.81      4961
    positive       0.81      0.83      0.82      5039

    accuracy                           0.82     10000
   macro avg       0.82      0.82      0.82     10000
weighted avg       0.82      0.82      0.82     10000

[[3968  993]
 [ 839 4200]]


# FastText

In [None]:
from gensim.models import FastText

model_fasttext = FastText(data['lemma_no_stop'].apply(lambda x: x.split()), vector_size=100, window=5, min_count=1)

def get_vector_fasttext(sentence):
    words = sentence.split()
    vectors = [model_fasttext.wv[word] for word in words if word in model_fasttext.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(100)

X_train_fasttext = np.array([get_vector_fasttext(sentence) for sentence in X_train])
X_test_fasttext = np.array([get_vector_fasttext(sentence) for sentence in X_test])

In [None]:
clf = LogisticRegression(max_iter=100)
clf.fit(X_train_fasttext, y_train)

rfc = RandomForestClassifier(max_depth=10, random_state=42)
rfc.fit(X_train_fasttext, y_train)


y_pred = clf.predict(X_test_fasttext)

# Оценка точности модели
accuracy_clf_fasttext = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy_clf_fasttext:.2f}")

# Вывод отчета о классификации и матрицы ошибок
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


y_pred = rfc.predict(X_test_fasttext)

# Оценка точности модели
accuracy_rfc_fasttext = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy_rfc_fasttext:.2f}")

# Вывод отчета о классификации и матрицы ошибок
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.84
              precision    recall  f1-score   support

    negative       0.84      0.83      0.83      4961
    positive       0.83      0.84      0.84      5039

    accuracy                           0.84     10000
   macro avg       0.84      0.84      0.84     10000
weighted avg       0.84      0.84      0.84     10000

[[4118  843]
 [ 791 4248]]
Accuracy: 0.74
              precision    recall  f1-score   support

    negative       0.74      0.76      0.75      4961
    positive       0.75      0.73      0.74      5039

    accuracy                           0.74     10000
   macro avg       0.74      0.74      0.74     10000
weighted avg       0.74      0.74      0.74     10000

[[3755 1206]
 [1347 3692]]


# Universal Sentence Encoder (недостаточно оперативки)

In [None]:
import tensorflow_hub as hub
import numpy as np

embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

def embed_in_batches(texts, batch_size=1000):
    embeddings = []
    for i in range(0, len(texts), batch_size):
        batch_embeddings = embed(texts[i: i + batch_size]).numpy()
        embeddings.append(batch_embeddings)
    return np.concatenate(embeddings, axis=0)

X_train_use = embed_in_batches(X_train.tolist())
X_test_use = embed_in_batches(X_test.tolist())

In [None]:
from sklearn.ensemble import RandomForestClassifier

clf = LogisticRegression(max_iter=100)
clf.fit(X_train_use, y_train)

rfc = RandomForestClassifier(max_depth=10, random_state=42)
rfc.fit(X_train_use, y_train)


y_pred = clf.predict(X_test_use)

# Оценка точности модели
accuracy_clf_use = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy_clf_use:.2f}")

# Вывод отчета о классификации и матрицы ошибок
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


y_pred = rfc.predict(X_test_use)

# Оценка точности модели
accuracy_rfc_use = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy_rfc_use:.2f}")

# Вывод отчета о классификации и матрицы ошибок
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.83
              precision    recall  f1-score   support

    negative       0.83      0.84      0.83      4961
    positive       0.84      0.83      0.83      5039

    accuracy                           0.83     10000
   macro avg       0.83      0.83      0.83     10000
weighted avg       0.83      0.83      0.83     10000

[[4144  817]
 [ 842 4197]]
Accuracy: 0.81
              precision    recall  f1-score   support

    negative       0.80      0.81      0.81      4961
    positive       0.81      0.81      0.81      5039

    accuracy                           0.81     10000
   macro avg       0.81      0.81      0.81     10000
weighted avg       0.81      0.81      0.81     10000

[[4027  934]
 [ 981 4058]]


In [None]:
dl_compare_with_mech = {
        'TF-IDF': [accuracy_clf_tfidf, accuracy_rfc_tfidf],
        'Word2Vec': [accuracy_clf_w2v, accuracy_rfc_w2v],
        'Bag of Words': [accuracy_clf_bow, accuracy_rfc_bow],
        'TF-IDF с n-граммами': [accuracy_clf_bi, accuracy_rfc_bi],
        'FastText': [accuracy_clf_fasttext, accuracy_rfc_fasttext],
        'Universal Sentence Encoder': [accuracy_clf_use, accuracy_rfc_use]
            }
dl_compare_with_mech = pd.DataFrame(dl_compare_with_mech,index=['Logistic Regression','RandomForestClassifier'])
dl_compare_with_mech.head()

Unnamed: 0,TF-IDF,Word2Vec,Bag of Words,TF-IDF с n-граммами,FastText,Universal Sentence Encoder
Logistic Regression,0.8958,0.8544,0.8853,0.8924,0.8366,0.8341
RandomForestClassifier,0.8311,0.7969,0.8384,0.8168,0.7447,0.8085


# SPAM FILTER

In [69]:
import os
import tarfile
import urllib.request
import re
import numpy as np
import nltk
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import optuna

In [70]:
class SpamClassifier():
  def __init__(self, download_ham, download_spam, download_dir):
    self.download_ham = download_ham
    self.download_spam = download_spam
    self.download_dir = download_dir
    self.ham_dir = os.path.join(download_dir, 'ham')
    self.spam_dir = os.path.join(download_dir, 'spam')
    self.stop_words = set(stopwords.words('english'))
    self.lemmatizer = WordNetLemmatizer()
    self.vectorizer = CountVectorizer(stop_words='english')
    self.rfc_model = None
    self.tfidf_transformer = TfidfTransformer()

  def download_and_extract(self, url, extract_path):
    archive_name = os.path.join(extract_path, url.split('/')[-1])
    urllib.request.urlretrieve(url, archive_name)
    with tarfile.open(archive_name, "r:bz2") as tar:
        tar.extractall(path=extract_path)
    os.remove(archive_name)

  def setup_data(self):
    os.makedirs(self.ham_dir, exist_ok=True)
    for url in self.download_ham:
        self.download_and_extract(url, self.ham_dir)

    os.makedirs(self.spam_dir, exist_ok=True)
    for url in self.download_spam:
        self.download_and_extract(url, self.spam_dir)

  def get_data(self, path):
    data = []
    for root, dirs, files in os.walk(path):
        for file in files:
            file_path = os.path.join(root, file)
            if os.path.isfile(file_path):
                with open(file_path, encoding="ISO-8859-1") as f:
                    words_list = f.read()
                    data.append(words_list)
    return data

  def preprocess_email(self, email):
    email = BeautifulSoup(email, "html.parser").get_text()
    email = re.sub(r'^(From|To|Subject|Date|Return-Path|Received|Message-Id|X-\w+):.*', '', email, flags=re.MULTILINE)
    email = email.lower()
    email = re.sub(r'http\S+|www\S+|https\S+', '', email, flags=re.MULTILINE)
    email = re.sub(r'\S+@\S+', '', email)
    email = re.sub(r'[^a-z\s]', '', email)
    words = word_tokenize(email)
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(words)

  def prepare_data(self):
    ham_data = self.get_data(self.ham_dir)
    spam_data = self.get_data(self.spam_dir)
    np.random.shuffle(ham_data)
    np.random.shuffle(spam_data)

    processed_ham_data = [self.preprocess_email(email) for email in ham_data]
    processed_spam_data = [self.preprocess_email(email) for email in spam_data]

    y = len(processed_ham_data)*[0] + len(processed_spam_data)*[1]
    X = processed_ham_data + processed_spam_data

    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42, test_size=0.2)

    X_train_counts = self.vectorizer.fit_transform(X_train)
    X_train_tfidf = self.tfidf_transformer.fit_transform(X_train_counts).toarray()

    X_test_counts = self.vectorizer.transform(X_test)
    X_test_tfidf = self.tfidf_transformer.transform(X_test_counts).toarray()

    return X_train_tfidf, X_test_tfidf, y_train, y_test

  def optimize_hyperparameters(self, n_trials=50):
        def objective(trial):
            n_estimators = trial.suggest_int('n_estimators', 100, 1500)
            max_depth = trial.suggest_int('max_depth', 10, 50)
            min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
            min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 4)

            model = RandomForestClassifier(
                n_estimators=n_estimators,
                max_depth=max_depth,
                min_samples_split=min_samples_split,
                min_samples_leaf=min_samples_leaf,
                random_state=42
            )

            X_train, _, y_train, _ = self.prepare_data()
            scores = cross_val_score(model, X_train, y_train, cv=3, scoring='accuracy')
            return scores.mean()

        study = optuna.create_study(direction='maximize')
        study.optimize(objective, n_trials=n_trials)
        print("Best hyperparameters:", study.best_params)
        self.rfc_model = RandomForestClassifier(**study.best_params, random_state=42)

  def train_and_evaluate(self):
    if self.rfc_model is None:
            self.rfc_model = RandomForestClassifier(n_estimators=1200)
    X_train, X_test, y_train, y_test = self.prepare_data()
    self.rfc_model.fit(X_train, y_train)
    y_pred = self.rfc_model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy:.2f}")
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))

In [71]:
DOWNLOAD_HAM = [
    "https://spamassassin.apache.org/old/publiccorpus/20030228_easy_ham.tar.bz2",
    "https://spamassassin.apache.org/old/publiccorpus/20030228_easy_ham_2.tar.bz2",
    "https://spamassassin.apache.org/old/publiccorpus/20030228_hard_ham.tar.bz2"
]

DOWNLOAD_SPAM = [
    "https://spamassassin.apache.org/old/publiccorpus/20050311_spam_2.tar.bz2",
    "https://spamassassin.apache.org/old/publiccorpus/20030228_spam.tar.bz2"
]

DOWNLOAD_DIR = '/content/spam_data'

classifier = SpamClassifier(DOWNLOAD_HAM, DOWNLOAD_SPAM, DOWNLOAD_DIR)
classifier.setup_data()
classifier.optimize_hyperparameters(n_trials=15)
classifier.train_and_evaluate()

[I 2024-08-23 09:30:06,986] A new study created in memory with name: no-name-f035dc2a-8cdd-40f2-9774-0357f1cbfc60
[I 2024-08-23 09:40:33,126] Trial 0 finished with value: 0.9661157422665339 and parameters: {'n_estimators': 992, 'max_depth': 26, 'min_samples_split': 4, 'min_samples_leaf': 1}. Best is trial 0 with value: 0.9661157422665339.
[I 2024-08-23 09:45:04,815] Trial 1 finished with value: 0.9340906047082859 and parameters: {'n_estimators': 759, 'max_depth': 12, 'min_samples_split': 7, 'min_samples_leaf': 2}. Best is trial 0 with value: 0.9661157422665339.
[I 2024-08-23 09:52:39,613] Trial 2 finished with value: 0.9611565520029971 and parameters: {'n_estimators': 834, 'max_depth': 23, 'min_samples_split': 5, 'min_samples_leaf': 2}. Best is trial 0 with value: 0.9661157422665339.
[I 2024-08-23 09:56:55,598] Trial 3 finished with value: 0.9113623484119246 and parameters: {'n_estimators': 813, 'max_depth': 10, 'min_samples_split': 3, 'min_samples_leaf': 2}. Best is trial 0 with value

Best hyperparameters: {'n_estimators': 218, 'max_depth': 50, 'min_samples_split': 7, 'min_samples_leaf': 1}
Accuracy: 0.97
              precision    recall  f1-score   support

           0       0.98      0.99      0.98       831
           1       0.97      0.95      0.96       380

    accuracy                           0.97      1211
   macro avg       0.97      0.97      0.97      1211
weighted avg       0.97      0.97      0.97      1211

[[819  12]
 [ 20 360]]
