In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [None]:
# Read the labelled training rows and the unlabelled test rows from the
# Colab working directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('/content/train.csv', '/content/test.csv')
)

In [None]:
# Preview the first five training rows (rich display as the cell's last expression).
train.head(n=5)

Unnamed: 0,id,topic,answer,is_cheating
0,scr_cd3edac7d322,A girl wakes from a dream and she is not sure ...,"My eyes flew open, and the air around me feels...",1
1,scr_86f1104afb28,A journalistic review piece about the top 6 ai...,Robot Butlers in the year of 2025. What are th...,0
2,scr_c461dafbe886,The influence of fictional universities in cam...,"In recent years, apparel featuring the names a...",1
3,scr_64880cce429f,Why do girls love horses,"The moment before I hit the dirt, I thought we...",0
4,scr_c7742a3b2444,"Every year, a remote mountain town elects a ne...","In the valley of Eldermist, were the mountains...",1


In [None]:

# Report the size of the training frame: row count, then column count.
print('Number of Rows', len(train))
print('Number of Columns', len(train.columns))

Number of Rows 269
Number of Columns 4


In [None]:
# Show the dtype pandas inferred for each training column.
train.dtypes

Unnamed: 0,0
id,object
topic,object
answer,object
is_cheating,int64


In [None]:
#Text Cleaning & Preprocessing

#Remove unnecessary noise:

import re

def clean_text(text):
    """Normalise one raw answer for TF-IDF vectorisation.

    Processing order:
      1. Missing values (``None`` or a float NaN, which is what
         ``pd.read_csv`` yields for empty cells) become ``''``; any other
         non-string value is converted with ``str``.
      2. Every run of whitespace (newlines, tabs, carriage returns,
         non-breaking spaces, ...) is replaced by a single space. This is
         done *before* filtering so that words separated only by a tab or
         similar are not glued together when that character is removed.
      3. Every character other than ASCII letters, digits, spaces and the
         punctuation ``. , ! ? ; : ( ) ' "`` is removed.
      4. Runs of spaces left behind by step 3 are collapsed and the result
         is stripped of leading/trailing spaces.

    Args:
        text: The raw text; normally a ``str``.

    Returns:
        The cleaned string (possibly empty).
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^A-Za-z0-9.,!?;:()\'\" ]', '', text)
    return re.sub(r' +', ' ', text).strip()

# Store a cleaned copy of every answer next to the raw text, for both the
# training and the test frame, using the same cleaning function.
train["clean_text"] = train["answer"].map(clean_text)
test["clean_text"] = test["answer"].map(clean_text)

In [None]:
#Feature Extraction (TF-IDF)

#Convert text into numerical features for ML models:


from sklearn.feature_extraction.text import TfidfVectorizer

# Word uni- and bi-gram TF-IDF, capped at 10,000 features, with the built-in
# English stop-word list.
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_features=10000)

# Target labels for the training rows.
y_train = train["is_cheating"]

# Fit the vectoriser on the training text only, then reuse it unchanged to
# encode the test text.
X_train = vectorizer.fit_transform(train["clean_text"])
X_test = vectorizer.transform(test["clean_text"])



In [None]:
#Model Training

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score, r2_score
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows for validation. The fixed seed keeps the
# split identical across runs; later cells reuse X_tr / X_val / y_tr / y_val.
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Baseline: logistic regression on the TF-IDF features, with a raised
# iteration cap.
model = LogisticRegression(max_iter=1000)
model.fit(X_tr, y_tr)

# Score the hard class predictions with classification metrics only.
# R^2 is a regression metric and is not meaningful for 0/1 class labels,
# so it is deliberately not reported here.
y_pred = model.predict(X_val)
print("Validation Accuracy:", accuracy_score(y_val, y_pred))
print("Validation F1:", f1_score(y_val, y_pred))

Validation Accuracy: 0.5370370370370371
Validation F1: 0.6575342465753424
r2 score -0.8749999999999991


In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score

# Hand-picked XGBoost baseline, grouped by what each setting controls.
xgb = XGBClassifier(
    # ensemble size / step size
    n_estimators=800,
    learning_rate=0.05,
    # tree complexity
    max_depth=6,
    min_child_weight=3,
    # row / column subsampling
    subsample=0.9,
    colsample_bytree=0.8,
    # bookkeeping
    eval_metric='logloss',
    random_state=42,
    n_jobs=-1,
)

# fit() returns the estimator itself, so training and validation
# prediction can be chained.
y_pred = xgb.fit(X_tr, y_tr).predict(X_val)

print("Validation Accuracy:", accuracy_score(y_val, y_pred))
print("Validation F1:", f1_score(y_val, y_pred))


Validation Accuracy: 0.6666666666666666
Validation F1: 0.6785714285714286


In [None]:
# Predict the test set with the baseline XGBoost model.
y_test_pred = xgb.predict(X_test)

# Fill the sample submission's label column with those predictions
# (row order is taken as-is) and write it out without the index.
submission = pd.read_csv("/content/sample_submission.csv").assign(is_cheating=y_test_pred)
submission.to_csv("submission.csv", index=False)


In [None]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.6.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.10.1-py3-none-any.whl.metadata (11 kB)
Downloading optuna-4.6.0-py3-none-any.whl (404 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m404.7/404.7 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.10.1-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.10.1 optuna-4.6.0


In [None]:
import optuna
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score

def objective(trial):
    """Optuna objective: mean 3-fold cross-validated F1 of one XGBoost candidate.

    Args:
        trial: The Optuna trial used to draw this candidate's hyperparameters.

    Returns:
        The mean of the per-fold F1 scores computed on ``X_tr`` / ``y_tr``.
    """
    # Hyperparameters drawn from the trial. The suggest calls are evaluated
    # in this order.
    tuned = {
        "n_estimators": trial.suggest_int("n_estimators", 500, 2500),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "gamma": trial.suggest_float("gamma", 0, 1.0),
    }
    # Settings held constant for every trial.
    fixed = {"eval_metric": "logloss", "random_state": 42, "n_jobs": -1}

    candidate = XGBClassifier(**tuned, **fixed)
    fold_scores = cross_val_score(candidate, X_tr, y_tr, scoring='f1', cv=3)
    return fold_scores.mean()

# Maximise cross-validated F1 over 50 trials. The TPE sampler (Optuna's
# default algorithm) is seeded explicitly so that a fresh
# Restart & Run All draws the same sequence of hyperparameter suggestions.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)
print(study.best_params)


[I 2025-11-11 22:49:39,545] A new study created in memory with name: no-name-297af97c-1ef4-426d-8171-9b29dad92496
[I 2025-11-11 22:50:10,497] Trial 0 finished with value: 0.6798093216393871 and parameters: {'n_estimators': 1896, 'max_depth': 11, 'learning_rate': 0.11032343195675567, 'subsample': 0.7030861651308451, 'colsample_bytree': 0.8239407485241775, 'min_child_weight': 4, 'gamma': 0.3629541367897491}. Best is trial 0 with value: 0.6798093216393871.
[I 2025-11-11 22:50:21,861] Trial 1 finished with value: 0.707009780856751 and parameters: {'n_estimators': 898, 'max_depth': 4, 'learning_rate': 0.10326769688864627, 'subsample': 0.7399243858743987, 'colsample_bytree': 0.7356582264616058, 'min_child_weight': 6, 'gamma': 0.12982668660768137}. Best is trial 1 with value: 0.707009780856751.
[I 2025-11-11 22:50:26,137] Trial 2 finished with value: 0.7807448688674743 and parameters: {'n_estimators': 1010, 'max_depth': 6, 'learning_rate': 0.17437032739957767, 'subsample': 0.8056743773525297,

{'n_estimators': 1392, 'max_depth': 5, 'learning_rate': 0.019790837837224864, 'subsample': 0.6734998627615472, 'colsample_bytree': 0.7175557769418476, 'min_child_weight': 1, 'gamma': 0.6841847920968599}


In [None]:


# Retrain with the best hyperparameters printed by the Optuna study.
# The fixed settings (eval_metric, random_state, n_jobs) are the same ones
# every trial in the objective used, so this model matches the configuration
# that was actually scored during the search.
xgb1 = XGBClassifier(
    n_estimators=1392,
    max_depth=5,
    learning_rate=0.019790837837224864,
    subsample=0.6734998627615472,
    colsample_bytree=0.7175557769418476,
    min_child_weight=1,
    gamma=0.6841847920968599,
    eval_metric="logloss",
    random_state=42,
    n_jobs=-1,
)
xgb1.fit(X_tr, y_tr)
y_pred = xgb1.predict(X_val)

print("Validation Accuracy:", accuracy_score(y_val, y_pred))
print("Validation F1:", f1_score(y_val, y_pred))

Validation Accuracy: 0.7592592592592593
Validation F1: 0.7719298245614035


In [None]:
# Submission: predict the test set with the tuned model (xgb1).
# NOTE(review): `prediction` is not referenced again in the visible cells; the
# next cell recomputes the same predictions as `y_test_pred`.
prediction = xgb1.predict(X_test)


In [None]:
# Predict the test set with the tuned XGBoost model.
y_test_pred = xgb1.predict(X_test)

# Fill the sample submission's label column with those predictions
# (row order is taken as-is) and write it out without the index.
submission = pd.read_csv("/content/sample_submission.csv").assign(is_cheating=y_test_pred)
submission.to_csv("submission.csv", index=False)
