# In [None]:  (Jupyter cell marker left over from the notebook export)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
import nltk

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

def snake_columns(data):
    """
    Rename the columns of ``data`` to snake_case, in place, and return it.

    Each string column label is lower-cased and its spaces are replaced
    with underscores. Labels that are not strings (e.g. integer positions)
    are left unchanged.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose ``columns`` attribute is overwritten.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, so the result can be assigned or chained.
    """
    data.columns = [
        column.lower().replace(' ', '_') if isinstance(column, str) else column
        for column in data.columns
    ]
    # Hand the frame back so the "returns" promise of the docstring holds.
    return data

# ⚙️ Settings
# No cap on the number of columns pandas shows when displaying a frame.
pd.options.display.max_columns = None
# Silence every warning for the rest of the session.
warnings.filterwarnings(action='ignore')

# Load the data
# The file is read as tab-separated and its two columns are named
# "label" and "text".
# NOTE(review): this path has no "../" prefix while the other dataset paths
# in this file do — confirm which working directory is intended.
data = pd.read_csv("dataset/training_data.csv", sep='\t', names=["label", "text"])

# Independent copy of the raw frame, used for inspection.
df = data.copy()

# Print explicitly: a bare expression statement is discarded when this runs
# as a script (or is not the last expression of a notebook cell).
print(df.shape)
print(df.head(10))

# Exploratory Data Analysis
# Check for null values and label distribution.
# The summaries are printed explicitly; as bare expressions their results
# were discarded and never shown.
print(df.isnull().sum())

# Bar chart of how many rows carry each label.
sns.countplot(x=data["label"])
plt.show()

# The same distribution as exact counts.
print(data["label"].value_counts())

# Overview
# There's no null value in the dataset. Labels are balanced.
# Move "label" to the end of the frame: keep a handle on the column,
# drop it, then append it again as the last column.
target = data["label"]
del data["label"]
data["label"] = target

# Data Preprocessing (clean the text)
# train_data is another name for `data`, not a copy, so the cleaning below
# rewrites data["text"] as well.
train_data = data

# Normalise the dtype first: missing cells become empty strings (instead of
# ending up as the literal text "nan") and every remaining cell is a str, so
# the string operations below apply to all rows.
train_data["text"] = train_data["text"].fillna("").astype(str)

# Replace every non-alphabetic character with a space
train_data["text"] = train_data["text"].replace(r"[^a-zA-Z]", " ", regex=True)

# Convert the text to lowercase
train_data["text"] = train_data["text"].str.lower()

# Bag-of-bigrams features: ngram_range=(2, 2) keeps only two-word sequences.
# The vocabulary is learned from the cleaned training text, which is encoded
# in the same call.
countvector = CountVectorizer(ngram_range=(2, 2))
train_dataset = countvector.fit_transform(train_data["text"])

# Labels the classifier is trained to predict
ground_truth = train_data["label"]

# Random forest: 200 trees split on entropy, each limited to depth 10,
# seeded for reproducibility.
random_classifier = RandomForestClassifier(
    n_estimators=200,
    criterion='entropy',
    max_depth=10,
    random_state=42,
)

# Fit on the complete training matrix; this fitted model is the one used
# for later predictions.
random_classifier.fit(X=train_dataset, y=ground_truth)

# 5-fold cross-validation over the same features and labels
scores = cross_val_score(
    estimator=random_classifier,
    X=train_dataset,
    y=ground_truth,
    cv=5,
)
print(f"Cross-validation scores: {scores}")
print(f"Mean cross-validation score: {scores.mean()}")

# Predictions on the test data
# NOTE(review): this path starts with "../" while the training file is read
# from "dataset/..." — confirm both resolve from the same working directory.
test_data = pd.read_csv("../dataset/testing_data.csv", sep='\t', names=["label", "text"])

# Mirror the training-text cleaning: str-typed cells, non-letters replaced by
# spaces, lowercase. Missing cells are filled with "" first, because the
# vectorizer does not accept NaN as a document.
test_data["text"] = test_data["text"].fillna("").astype(str)
test_data["text"] = test_data["text"].replace(r"[^a-zA-Z]", " ", regex=True)
test_data["text"] = test_data["text"].str.lower()

# Encode with the vocabulary countvector already learned (transform only,
# no refit), so the columns line up with the training matrix.
test_dataset = countvector.transform(test_data["text"])

# Predict one label per row of the encoded test matrix
predictions = random_classifier.predict(X=test_dataset)

# Bar chart of how often each label was predicted
sns.countplot(x=predictions)
plt.show()

# Store the predictions next to the test rows ...
test_data["predictions_rand_for"] = predictions

# ... and write the frame to CSV, leaving out the row index
test_data.to_csv(
    "../dataset/predictions.csv",
    index=False,
)
