In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import make_pipeline

In [3]:
# Load the data
train_essays = pd.read_csv(r'llm-detect-ai-generated-text\train_essays.csv')  
test_essays = pd.read_csv(r'llm-detect-ai-generated-text\test_essays.csv')    
train_prompts = pd.read_csv(r'llm-detect-ai-generated-text\train_prompts.csv') 

In [4]:
# Merge training essays with prompts
train_data = pd.merge(train_essays, train_prompts, on='prompt_id', how='left')

In [5]:
# Split the training data into training and validation sets
train, valid = train_test_split(train_data, test_size=0.2, random_state=42)

In [6]:
# Pipeline
model = make_pipeline(
    TfidfVectorizer(max_features=15000, stop_words='english', ngram_range=(1, 2)),
    RandomForestClassifier(random_state=42)
)

In [7]:
# Parameter grid for RandomForestClassifier
param_grid = {
    'randomforestclassifier__n_estimators': [100, 200, 300, 500],
    'randomforestclassifier__max_depth': [5, 10, 15, 20]
}

In [8]:
#GridSearchCV object
grid_search = GridSearchCV(model, param_grid, cv=3, scoring='roc_auc', n_jobs=-1)

In [9]:
# Fiting the model on the training data with hyperparameter tuning
grid_search.fit(train['text'], train['generated'])



In [11]:
# Model from the grid search
best_model = grid_search.best_estimator_
best_model

In [12]:
# Make predictions on the validation set
predictions = best_model.predict_proba(valid['text'])[:, 1]

In [13]:
# Evaluate the model using ROC AUC score
roc_auc = roc_auc_score(valid['generated'], predictions)
print(f'Best ROC AUC Score: {roc_auc}')

Best ROC AUC Score: 0.9927272727272727


In [14]:
# Make predictions on the test set
test_predictions = best_model.predict_proba(test_essays['text'])[:, 1]

In [15]:
# Create a submission file with the required name
submission_df = pd.DataFrame({'id': test_essays['id'], 'generated': test_predictions})
submission_df.to_csv('submission.csv', index=False) 