In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, classification_report
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import GradientBoostingClassifier

In [13]:
train_df = pd.read_csv("data_train_preprocessed.csv")
test_df = pd.read_csv("data_test_preprocessed.csv")
# Fill NaN values with an empty string
train_df['text'] = train_df['text'].fillna('')
train_df['processed_text_alt'] = train_df['processed_text_alt'].fillna('')
test_df['processed_text_alt'] = test_df['processed_text_alt'].fillna('')

In [14]:
# TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=5000)  # You can adjust max_features based on your dataset size
X_train = vectorizer.fit_transform(train_df['processed_text_alt'])
X_test = vectorizer.transform(test_df['processed_text_alt'])
y_train = (train_df['binary_labels'] == 1).astype(int)
y_test = (test_df['binary_labels'] == 1).astype(int)

In [15]:
# AdaBoost Classifier
adaboost_classifier = AdaBoostClassifier(n_estimators=50, random_state=42)
adaboost_classifier.fit(X_train, y_train)

In [16]:
# Predictions
y_pred_train = adaboost_classifier.predict(X_train)
y_pred_test = adaboost_classifier.predict(X_test)

# Evaluation
train_accuracy = accuracy_score(y_train, y_pred_train)
test_accuracy = accuracy_score(y_test, y_pred_test)

print(f"Training Accuracy: {train_accuracy}")
print(f"Testing Accuracy: {test_accuracy}")

# Classification Report
print("Classification Report on Test Set:")
print(classification_report(y_test, y_pred_test))

Training Accuracy: 0.7988715565881181
Testing Accuracy: 0.768361581920904
Classification Report on Test Set:
              precision    recall  f1-score   support

           0       0.81      0.84      0.82       230
           1       0.68      0.64      0.66       124

    accuracy                           0.77       354
   macro avg       0.75      0.74      0.74       354
weighted avg       0.77      0.77      0.77       354



# Decision Tree

In [17]:
train_tokens = [token for tokens_list in train_df['text'] for token in tokens_list]
preprocessed_train_tokens = [token for tokens_list in train_df['processed_text_alt'] for token in tokens_list]

vocabulary_size = len(set(train_tokens))
vocabulary_size_preprocessed = len(set(preprocessed_train_tokens))

print(f'Vocabulary Size Preprocessed: {vocabulary_size_preprocessed}')
print(f'Vocabulary Size: {vocabulary_size}')

# Split the dataset into training and validation sets
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

Vocabulary Size Preprocessed: 132
Vocabulary Size: 246


In [18]:
# Text vectorization using CountVectorizer
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_df['processed_text_alt'])
X_val = vectorizer.transform(val_df['processed_text_alt'])
X_test = vectorizer.transform(test_df['processed_text_alt'])

In [19]:
# Labels
y_train = (train_df['binary_labels'] == 1).astype(int)
y_val = (val_df['binary_labels'] == 1).astype(int)
y_test = (test_df['binary_labels'] == 1).astype(int)

In [20]:
# Hyperparameter tuning
best_accuracy = 0
best_model = None

for max_depth in [None, 5, 10, 15]:
    for min_samples_split in [2, 5, 10]:
        for min_samples_leaf in [1, 2, 4]:
            
            # Decision tree
            tree_classifier = DecisionTreeClassifier(max_depth=max_depth, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf)
            tree_classifier.fit(X_train, y_train)

            # Evaluate on the validation set
            val_predictions = tree_classifier.predict(X_val)
            val_accuracy = accuracy_score(y_val, val_predictions)

            # Check if the current configuration is the best so far
            if val_accuracy > best_accuracy:
                best_accuracy = val_accuracy
                best_model = tree_classifier

print(f'Best Max Depth: {best_model.max_depth}')
print(f'Best Min Samples Split: {best_model.min_samples_split}')
print(f'Best Min Samples Leaf: {best_model.min_samples_leaf}')

# Use the best model for evaluation on the test set
test_predictions = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, test_predictions)

print(f'Best Validation Accuracy: {best_accuracy}')
print(f'Test Accuracy with Best Model: {test_accuracy}')


# Evaluate on the validation set and print classification report
val_predictions = best_model.predict(X_val)
print("Validation Classification Report: (Validation)")
print(classification_report(y_val, val_predictions))

# Use the best model for evaluation on the test set and print classification report
test_predictions = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, test_predictions)
print(f'Test Accuracy with Best Model: {test_accuracy}')
print("Test Classification Report: (Test")
print(classification_report(y_test, test_predictions))


Best Max Depth: 10
Best Min Samples Split: 10
Best Min Samples Leaf: 1
Best Validation Accuracy: 0.7628524046434494
Test Accuracy with Best Model: 0.7824858757062146
Validation Classification Report: (Validation)
              precision    recall  f1-score   support

           0       0.77      0.89      0.83       387
           1       0.73      0.53      0.62       216

    accuracy                           0.76       603
   macro avg       0.75      0.71      0.72       603
weighted avg       0.76      0.76      0.75       603

Test Accuracy with Best Model: 0.7824858757062146
Test Classification Report: (Test
              precision    recall  f1-score   support

           0       0.82      0.86      0.84       230
           1       0.71      0.65      0.68       124

    accuracy                           0.78       354
   macro avg       0.76      0.75      0.76       354
weighted avg       0.78      0.78      0.78       354



# Gradient Boosting Classifier


In [21]:
# Gradient Boosting Classifier
gbdt_classifier = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
gbdt_classifier.fit(X_train, y_train)

In [22]:
# Predictions
y_pred_train = gbdt_classifier.predict(X_train)
y_pred_test = gbdt_classifier.predict(X_test)

# Evaluation
train_accuracy = accuracy_score(y_train, y_pred_train)
test_accuracy = accuracy_score(y_test, y_pred_test)

print(f"Training Accuracy: {train_accuracy}")
print(f"Testing Accuracy: {test_accuracy}")

# Classification Report
print("Classification Report on Test Set:")
print(classification_report(y_test, y_pred_test))

Training Accuracy: 0.8120331950207469
Testing Accuracy: 0.7909604519774012
Classification Report on Test Set:
              precision    recall  f1-score   support

           0       0.81      0.88      0.85       230
           1       0.74      0.63      0.68       124

    accuracy                           0.79       354
   macro avg       0.78      0.75      0.76       354
weighted avg       0.79      0.79      0.79       354

