In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.svm import SVC
import seaborn as sns
import matplotlib.pyplot as plt
from tensorflow.keras import layers, models
import torch
from sentence_transformers import SentenceTransformer

from src.data_preparation import load_split_data, get_datasets

In [None]:
# Train/validation/test splits; later cells read their
# "cleaned_text" and "label" columns.
(train_df, val_df, test_df) = load_split_data()

# Model inputs and targets for the classical models below,
# unpacked in train/val/test order.
(
    X_train, X_val, X_test,
    y_train, y_val, y_test,
) = get_datasets()

In [None]:
""""""
# Logistic Regression baseline: construct and fit in one step
# (iteration cap of 1000, fixed seed for repeatable fits).
lr_model = LogisticRegression(random_state=42, max_iter=1000).fit(X_train, y_train)

# Score the fitted model on the validation split.
y_val_pred = lr_model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
val_report = classification_report(y_val, y_val_pred)

print("Validation Accuracy:", val_accuracy)
print("\nClassification Report (Validation):")
print(val_report)

In [None]:
# Test-set evaluation of the fitted logistic-regression model.
y_test_pred = lr_model.predict(X_test)

print("test accuracy:", accuracy_score(y_test, y_test_pred))
print("\nclassification report:")
print(classification_report(y_test, y_test_pred))

# Confusion matrix drawn as an annotated heatmap with integer cell labels.
c_m = confusion_matrix(y_test, y_test_pred)
lr_ax = sns.heatmap(c_m, cmap="Reds", fmt="d", annot=True)
# sns.heatmap returns the Axes it drew on, so label that Axes directly.
lr_ax.set_xlabel("predicted")
lr_ax.set_ylabel("actual")
lr_ax.set_title("confusion Matrix-logistic Regression")
plt.show()

In [None]:
# Classical machine-learning models: linear-kernel SVM and Random Forest.
# SVC, RandomForestClassifier, accuracy_score and classification_report all
# come from the notebook's import cell, so nothing is imported here.

# Registry of fitted models keyed by name; the test-evaluation cells
# look the models up from here.
dic_models = {}

# --- SVM (linear kernel) ---
svm_model = SVC(kernel='linear', C=1.0, random_state=42)
svm_model.fit(X_train, y_train)

# Validation-split predictions and metrics for the SVM
y_val_pred_svm = svm_model.predict(X_val)
print("SVM Validation Accuracy:", accuracy_score(y_val, y_val_pred_svm))
print(classification_report(y_val, y_val_pred_svm))

dic_models['SVM'] = svm_model

# --- Random Forest ---
rf_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,   # no depth limit on individual trees
    random_state=42,
    n_jobs=-1         # use all available CPU cores
)
rf_model.fit(X_train, y_train)

# Validation-split predictions and metrics for the Random Forest
y_val_pred_rf = rf_model.predict(X_val)
print("Random Forest Validation Accuracy:", accuracy_score(y_val, y_val_pred_rf))
print(classification_report(y_val, y_val_pred_rf))

dic_models['RandomForest'] = rf_model

In [None]:
# Test-set evaluation of the SVM stored in the model registry.
svm_model = dic_models['SVM']
y_test_pred_svm = svm_model.predict(X_test)

print("SVM Test Evaluation")
print("Accuracy:", accuracy_score(y_test, y_test_pred_svm))
print("\nClassification Report:")
print(classification_report(y_test, y_test_pred_svm))

# Confusion matrix as an annotated heatmap on a 5x4 figure.
cm_svm = confusion_matrix(y_test, y_test_pred_svm)
plt.figure(figsize=(5, 4))
svm_ax = sns.heatmap(cm_svm, cmap='Blues', fmt='d', annot=True)
# Label the Axes returned by the heatmap call.
svm_ax.set(
    title='Confusion Matrix-SVM',
    xlabel='Predicted',
    ylabel='Actual',
)
plt.show()


In [None]:
# Test-set evaluation of the Random Forest stored in the model registry.
rf_model = dic_models['RandomForest']
y_test_pred_rf = rf_model.predict(X_test)

print("RandomForest Test Evaluation")
print("Accuracy:",accuracy_score(y_test, y_test_pred_rf))
print("\nClassification Report:")
print(classification_report(y_test, y_test_pred_rf))

# Confusion matrix as an annotated heatmap with integer cell labels.
cm_rf = confusion_matrix(y_test, y_test_pred_rf)
plt.figure(figsize=(5,4))
sns.heatmap(cm_rf, annot=True, fmt='d', cmap='Purples')
# Single title call: the title was previously set twice in a row, and only
# the second value was ever shown, so that is the one kept.
plt.title('Confusion Matrix-RandomForest')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()


In [None]:
# Pick the compute device for the sentence encoder:
# a CUDA GPU if present, otherwise Apple MPS, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
    print("Using cuda")
elif torch.backends.mps.is_available():
    device = "mps"
    print("Using mps")
else:
    device = "cpu"
    print("Using CPU")

# Pretrained multilingual sentence-embedding model, placed on the chosen device.
bert_model = SentenceTransformer(
    "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
    device=device
)


def _embed_texts(df):
    """Encode the ``cleaned_text`` column of ``df`` with ``bert_model``.

    Args:
        df: A split DataFrame that has a ``cleaned_text`` column.

    Returns:
        The embeddings as a NumPy array (``convert_to_numpy=True``),
        one row per text in ``df``.
    """
    return bert_model.encode(
        df["cleaned_text"].tolist(),
        convert_to_numpy=True,
        show_progress_bar=True
    )


# Same encoding settings for every split, applied through one helper.
X_train_emb = _embed_texts(train_df)
X_val_emb = _embed_texts(val_df)
X_test_emb = _embed_texts(test_df)

# NOTE: this rebinds y_train / y_val / y_test (first assigned from
# get_datasets()) to the "label" columns of the split DataFrames, so the
# targets are taken from the same frames as the embedded texts.
y_train = train_df["label"].values
y_val = val_df["label"].values
y_test = test_df["label"].values

print("Train embedding shape:", X_train_emb.shape)

In [None]:
# Feed-forward classifier over the sentence embeddings.
emb_dim = X_train_emb.shape[1]  # input width = second dimension of the embedding matrix

# Two ReLU hidden layers, each followed by dropout, then one sigmoid output unit.
ffnn_layers = [
    layers.Input(shape=(emb_dim,)),
    layers.Dense(256, activation="relu"),
    layers.Dropout(0.3),
    layers.Dense(128, activation="relu"),
    layers.Dropout(0.3),
    layers.Dense(1, activation="sigmoid"),  # binary classification
]
ffnn_model = models.Sequential(ffnn_layers)

# Adam optimizer with binary cross-entropy loss; accuracy is tracked as a metric.
ffnn_model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

ffnn_model.summary()