In [None]:
#1 Logistic Regression
import numpy as np
import pandas as pd
import os

# load datasets
train_df = pd.read_csv('train (2).csv', on_bad_lines='skip', sep=';')
eval_df = pd.read_csv('evaluation.csv', on_bad_lines='skip', sep=';')
test_df = pd.read_csv('test (1).csv', on_bad_lines='skip', sep=';')

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder

# combine title and text
train_df['content'] = train_df['title'].astype(str) + " " + train_df['text'].astype(str)
eval_df['content'] = eval_df['title'].astype(str) + " " + eval_df['text'].astype(str)
test_df['content'] = test_df['title'].astype(str) + " " + test_df['text'].astype(str)

# encode labels
le = LabelEncoder()
train_df['label'] = le.fit_transform(train_df['label'])
eval_df['label'] = le.transform(eval_df['label'])

# split training data
X_train, X_val, y_train, y_val = train_test_split(
    train_df['content'], train_df['label'], test_size=0.2, random_state=42
)

# pipeline: TF-IDF + Logistic Regression
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000, ngram_range=(1, 2), stop_words='english')),
    ('clf', LogisticRegression(max_iter=1000))
])

# train model
pipeline.fit(X_train, y_train)

# validation evaluation
val_pred = pipeline.predict(X_val)
print("Training Accuracy:", accuracy_score(y_val, val_pred))
print(classification_report(y_val, val_pred))
print(confusion_matrix(y_val, val_pred))

# optional: evaluation.csv performance
eval_pred = pipeline.predict(eval_df['content'])
print("Evaluation Accuracy:", accuracy_score(eval_df['label'], eval_pred))

# test evaluation or prediction
if 'label' in test_df.columns:
    y_test = le.transform(test_df['label'])
    test_pred = pipeline.predict(test_df['content'])
    print("Test Accuracy:", accuracy_score(y_test, test_pred))
    print(classification_report(y_test, test_pred))
    print(confusion_matrix(y_test, test_pred))
else:
    preds = pipeline.predict(test_df['content'])
    test_df['predicted_label'] = le.inverse_transform(preds)
    test_df.to_csv('/kaggle/working/test_predictions.csv', index=False)
    print("Saved predictions to test_predictions.csv")

Training Accuracy: 0.9667419421063437
              precision    recall  f1-score   support

           0       0.96      0.97      0.96      2215
           1       0.97      0.97      0.97      2656

    accuracy                           0.97      4871
   macro avg       0.97      0.97      0.97      4871
weighted avg       0.97      0.97      0.97      4871

[[2140   75]
 [  87 2569]]
Evaluation Accuracy: 0.9653812985093014
Test Accuracy: 0.9721572009363065
              precision    recall  f1-score   support

           0       0.97      0.97      0.97      3753
           1       0.97      0.98      0.97      4364

    accuracy                           0.97      8117
   macro avg       0.97      0.97      0.97      8117
weighted avg       0.97      0.97      0.97      8117

[[3634  119]
 [ 107 4257]]


In [None]:
#2 Linear SVM
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

train_df = pd.read_csv('train (2).csv', sep=';', on_bad_lines='skip')
eval_df  = pd.read_csv('evaluation.csv', sep=';', on_bad_lines='skip')
test_df  = pd.read_csv('test (1).csv', sep=';', on_bad_lines='skip')

train_df['content'] = train_df['title'].astype(str) + " " + train_df['text'].astype(str)
eval_df['content']  = eval_df['title'].astype(str) + " " + eval_df['text'].astype(str)
test_df['content']  = test_df['title'].astype(str) + " " + test_df['text'].astype(str)

In [11]:
le = LabelEncoder()
train_df['label'] = le.fit_transform(train_df['label'])
eval_df['label']  = le.transform(eval_df['label'])

X_train, X_val, y_train, y_val = train_test_split(
    train_df['content'], train_df['label'], test_size=0.2, random_state=42
)

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=20000, ngram_range=(1,2), stop_words='english')),
    ('svm', LinearSVC())
])

pipeline.fit(X_train, y_train)

val_pred = pipeline.predict(X_val)
print("Training Accuracy:", accuracy_score(y_val, val_pred))
print(classification_report(y_val, val_pred))

eval_pred = pipeline.predict(eval_df['content'])
print("Evaluation Accuracy:", accuracy_score(eval_df['label'], eval_pred))

if 'label' in test_df.columns:
    test_df['label'] = le.transform(test_df['label'])
    test_pred = pipeline.predict(test_df['content'])
    print("Test Accuracy:", accuracy_score(test_df['label'], test_pred))
else:
    preds = pipeline.predict(test_df['content'])
    test_df['prediction'] = le.inverse_transform(preds)
    test_df.to_csv('svm_test_predictions.csv', index=False)


Training Accuracy: 0.9757749948675837
              precision    recall  f1-score   support

           0       0.97      0.98      0.97      2215
           1       0.98      0.97      0.98      2656

    accuracy                           0.98      4871
   macro avg       0.98      0.98      0.98      4871
weighted avg       0.98      0.98      0.98      4871

Evaluation Accuracy: 0.9769619317481828
Test Accuracy: 0.9818898607860047
