# Detect AI Generated Text
In this challenge, the goal is to determine whether a piece of text is AI generated or not.

# Import the necessary libraries
Here, we import the libraries necessary for the challenge.

In [18]:
# Libraries
import joblib
import mlflow
import numpy as np
import pandas as pd
import sentencepiece as spm
from mlflow.models import infer_signature
from sklearn.metrics import classification_report, roc_auc_score, make_scorer, recall_score
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from lightgbm import LGBMClassifier




In [19]:

# Setup MLFlow
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("detect-ai-text")

<Experiment: artifact_location='mlflow-artifacts:/318529209653114071', creation_time=1706007893086, experiment_id='318529209653114071', last_update_time=1706007893086, lifecycle_stage='active', name='detect-ai-text', tags={}>

# Obtain data
Here, we obtain the data to train.

In [5]:
# get training data
final_augmented_train = pd.read_csv(r"C:\Users\Spectra\Desktop\detect_AI_text\data\raw\final_augmented_data.csv")
#test_df = pd.read_csv("data/test_essays.csv")
new_data_one = pd.read_csv(r"C:\Users\Spectra\Desktop\detect_AI_text\data\raw\train_drcat_01.csv")
new_data_two = pd.read_csv(r"C:\Users\Spectra\Desktop\detect_AI_text\data\raw\train_drcat_02.csv")
new_data_three = pd.read_csv(r"C:\Users\Spectra\Desktop\detect_AI_text\data\raw\train_drcat_03.csv")
new_data_four = pd.read_csv(r"C:\Users\Spectra\Desktop\detect_AI_text\data\raw\train_drcat_04.csv")
augmented_test = pd.read_csv(r"C:\Users\Spectra\Desktop\detect_AI_text\data\archive (2).zip (Unzipped Files)\final_test.csv")

**Get train data ready**

In [6]:
# Rename Columns
new_data_four.rename(columns={'label': 'generated'}, inplace=True)
new_data_two.rename(columns={'label': 'generated'}, inplace=True)

In [7]:
# Concat the data
concat_train_df = pd.concat([final_augmented_train,
                             new_data_two[["text", "generated"]],
                             new_data_four[["text", "generated"]]], axis=0)

concat_train_df.head()

Unnamed: 0,text,generated
0,"Dear Mr. Senator, I have decided to express my...",0
1,Limiting car usage is advantageous for multip...,1
2,Limiting car usage is beneficial for a number...,1
3,"""The day that mankind realizes that their crea...",0
4,"Dear me. Senator, I am fed up with the elector...",0


In [8]:
# Drop duplicates
concat_train_df.drop_duplicates(subset=["text"])

Unnamed: 0,text,generated
0,"Dear Mr. Senator, I have decided to express my...",0
1,Limiting car usage is advantageous for multip...,1
2,Limiting car usage is beneficial for a number...,1
3,"""The day that mankind realizes that their crea...",0
4,"Dear me. Senator, I am fed up with the elector...",0
...,...,...
44131,Looking for some fun activities to do at the ...,1
44142,The school administration has recently announ...,1
44144,\nSeeking advice from more than one person whe...,1
44160,While the Facial Action Coding System technolo...,1


In [20]:
exp_train_data = concat_train_df.sample(2000)
exp_train_data.reset_index(drop=True, inplace=True)
exp_train_data.to_csv(r"C:\Users\Spectra\Desktop\detect_AI_text\data\interim\exp_train.csv", index=False)

In [11]:
# Save the training data
concat_train_df.to_csv(r"C:\Users\Spectra\Desktop\detect_AI_text\data\interim\train.csv", index=False)

In [12]:
# Sample training data
sampled_train_df, _ = train_test_split(concat_train_df,
                                       test_size=(len(concat_train_df) - 100000) / len(concat_train_df),
                                       stratify=concat_train_df["generated"],
                                       random_state=42)

**Sample Test Data**

In [13]:
# Rename columns
new_data_three.rename(columns={'label': 'generated'}, inplace=True)
new_data_one.rename(columns={'label': 'generated'}, inplace=True)
augmented_test.rename(columns={'label': 'generated'}, inplace=True)

In [14]:
# Concat the data
concat_eval_df = pd.concat([augmented_test,
                             new_data_one[["text", "generated"]],
                             new_data_three[["text", "generated"]]], axis=0)



In [None]:
exp_eval_data = concat_eval_df.sample(2000)
exp_eval_data.reset_index(drop=True, inplace=True)
exp_eval_data.to_csv(r"C:\Users\Spectra\Desktop\detect_AI_text\data\interim\exp_eval.csv", index=False)

In [15]:
# Drop duplicates
concat_eval_df.drop_duplicates(subset=["text"])

Unnamed: 0,text,generated
0,The Face on Mars is nothing but a natural occu...,0
1,Students have a higher chance of catching a vi...,0
2,Driverless cars have good and bad things that ...,0
3,Some people might think that traveling in a gr...,1
4,How many of us students want to be forced to d...,0
...,...,...
42190,I think our principal's idea of making us do e...,1
42191,I think it's a good idea for schools to have o...,1
42196,Students often debate whether inactivity or s...,1
42199,Advantages of Limiting Car Usage\n\nLimiting c...,1


In [17]:
# Save the data
concat_eval_df.to_csv(r"C:\Users\Spectra\Desktop\detect_AI_text\data\interim\eval.csv", index=False)

In [13]:
# Create sample evaluation set
sampled_eval_df, _ = train_test_split(concat_eval_df,
                                       test_size=(len(concat_eval_df) - 20000) / len(concat_eval_df),
                                       stratify=concat_eval_df["generated"],
                                       random_state=42)

In [14]:
# Reset its index
sampled_eval_df.reset_index(drop=True, inplace=True)

In [15]:
# Split the evaluation dataset into validation and test datasets
val_df, test_df = train_test_split(sampled_eval_df,
                                test_size=0.5,
                                stratify=sampled_eval_df["generated"],
                                random_state=42)


In [4]:
val_df

NameError: name 'val_df' is not defined

# Preprocess text

**Train Tokenizer**

In [16]:
# Create dataset to train sentence piece tokenizer
train_sentpiece_df = pd.concat([test_df["text"], concat_train_df["text"]])
train_sentpiece_df.reset_index(drop=True, inplace=True)

In [22]:
# Train sentence piece model

import sentencepiece as spm
 
# Write text data to a text file for SentencePiece training
with open('sentpiece_train.txt', 'w', encoding='utf-8') as file:
    for text in train_sentpiece_df:
        file.write(text + '\n')

# Train SentencePiece model
spm.SentencePieceTrainer.train(input='data/sentpiece_train.txt', model_prefix='data/sentpiece_model', vocab_size=30522)


In [17]:
def get_tokenized_text(df, tok_model_path):
  sp = spm.SentencePieceProcessor()
  sp.Load(tok_model_path)
  # Tokenize text using SentencePiece
  df['tokens'] = df['text'].apply(lambda x: sp.EncodeAsPieces(x.lower()))
  # Convert SentencePiece tokens to text for XGBoost
  df['text_spm'] = df['tokens'].apply(lambda x: ' '.join(x))
  return df


In [18]:
#tokenized_train_one_df = get_tokenized_text(train_set_one, "sentpiece_model.model")
#tokenized_train_two_df = get_tokenized_text(train_set_two, "sentpiece_model.model")
#tokenized_train_three_df = get_tokenized_text(train_set_three, "sentpiece_model.model")
tokenized_eval_df = get_tokenized_text(val_df, "data/sentpiece_model.model")
tokenized_test_df = get_tokenized_text(test_df, "data/sentpiece_model.model")
tokenized_main_train_df = get_tokenized_text(sampled_train_df, "data/sentpiece_model.model")

**Train Vectorizer**

In [19]:

vectorizer = TfidfVectorizer(ngram_range=(3, 5), 
                             sublinear_tf=True,
                             lowercase=False,
                             max_features=5000)


X_train_main = vectorizer.fit_transform(tokenized_main_train_df["text_spm"])
y_train_main = tokenized_main_train_df["generated"]
X_val = vectorizer.transform(tokenized_eval_df["text_spm"])
y_val = tokenized_eval_df["generated"]
X_test = vectorizer.transform(tokenized_eval_df["text_spm"])
y_test = tokenized_test_df["generated"]



In [20]:
# Save the vectorizer
joblib.dump(vectorizer, "data/vectorizer.pkl")

['data/vectorizer.pkl']

# Modelling

## Modelling with LGBM

**Hyperparameter tune LGBM**

In [None]:
X_train_hyper, _, y_train_hyper, _ = train_test_split(X_train_main, y_train_main, train_size=0.1, random_state=42)

def custom_metric(y_true, y_pred):
    # Your custom metric, for example, a combination of ROC-AUC and recall for class 1
    roc_auc = roc_auc_score(y_true, y_pred)
    recall_class_1 = recall_score(y_true, (y_pred > 0.5).astype(int), pos_label=1)

    # You can adjust the weights based on your preference
    custom_metric_value = 0.5 * roc_auc + 0.5 * recall_class_1
    
    return custom_metric_value

def custom_metric_for_lgbm(y_true, y_pred):
    roc_auc = roc_auc_score(y_true, y_pred)
    recall_class_1 = recall_score(y_true, (y_pred > 0.5).astype(int), pos_label=1)
    custom_metric_value = 0.5 * roc_auc + 0.5 * recall_class_1
    return 'custom', custom_metric_value, True

custom_scorer = make_scorer(custom_metric, needs_proba=True)

# def objective(params):
#     with mlflow.start_run():
#         mlflow.set_tag("model", "lightgbm-classifier")
#         mlflow.log_artifact("data/vectorizer.pkl")
#         mlflow.log_artifact("data/sentpiece_model.model")
#         mlflow.log_params(params)
#         params['objective'] = 'cross_entropy'
#         params['metric'] = 'custom'
#         params['n_iter'] = 2500
#         params['verbose'] = -1
#         params['min_data_in_leaf'] = int(params['min_data_in_leaf'])
#         params['max_depth'] = int(params['max_depth'])
#         params['max_bin'] = int(params['max_bin'])
        


#         cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
#         model = LGBMClassifier(**params)

#         scores = cross_val_score(estimator=model, X=X_train_hyper, y=y_train_hyper, cv=cv, scoring=custom_scorer, n_jobs=-1)

#         # Take the mean of the custom metric across folds
#         mean_score = np.mean(scores)
#         mlflow.log_metric("roc and recall", mean_score)

#         # Log the model
#         artifact_path = "model"
#         mlflow.lightgbm.log_model(model, artifact_path)
       
#     return -mean_score
#     # Define the search space
# space = {
#     'learning_rate': hp.uniform('learning_rate', 0.001, 0.1),
#     'colsample_bytree': hp.uniform('colsample_bytree', 0.7, 1.0),
#     'colsample_bynode': hp.uniform('colsample_bynode', 0.7, 1.0),
#     'lambda_l1': hp.uniform('lambda_l1', 0.1, 10),
#     'lambda_l2': hp.uniform('lambda_l2', 0.1, 10),
#     'min_data_in_leaf': hp.quniform('min_data_in_leaf', 50, 150, 1),
#     'max_depth': hp.quniform('max_depth', 10, 30, 1),
#     'max_bin': hp.quniform('max_bin', 500, 1000, 1),
# }

# # Run Hyperopt optimization
# trials = Trials()
# best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=50, trials=trials, verbose=1)

# # Print the best hyperparameters
# print("Best Hyperparameters:", best)

  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]




 52%|█████▏    | 26/50 [45:04<1:01:47, 154.49s/trial, best loss: -0.9576689687005819]

job exception: API request to http://localhost:5000/api/2.0/mlflow-artifacts/artifacts/434808567354833662/9ed5e5e21bf24c1881e420a6d399d680/artifacts/vectorizer.pkl failed with exception HTTPConnectionPool(host='localhost', port=5000): Max retries exceeded with url: /api/2.0/mlflow-artifacts/artifacts/434808567354833662/9ed5e5e21bf24c1881e420a6d399d680/artifacts/vectorizer.pkl (Caused by ResponseError('too many 500 error responses'))



 52%|█████▏    | 26/50 [47:06<43:29, 108.73s/trial, best loss: -0.9576689687005819]  


MlflowException: API request to http://localhost:5000/api/2.0/mlflow-artifacts/artifacts/434808567354833662/9ed5e5e21bf24c1881e420a6d399d680/artifacts/vectorizer.pkl failed with exception HTTPConnectionPool(host='localhost', port=5000): Max retries exceeded with url: /api/2.0/mlflow-artifacts/artifacts/434808567354833662/9ed5e5e21bf24c1881e420a6d399d680/artifacts/vectorizer.pkl (Caused by ResponseError('too many 500 error responses'))

In [33]:


with mlflow.start_run():
    params = {'colsample_bynode': 0.891492853612568,
          'colsample_bytree': 0.8652279279632024,
          'lambda_l1': 0.1497981206936176,
          'lambda_l2': 2.5379309743954743,
          'learning_rate': 0.04665576114608101,
          'max_bin': 984,
          'max_depth': 14, 
          'min_data_in_leaf': 50,
          'objective' : 'cross_entropy',
          'metric' : 'custom',
          'n_iter' : 2500,
          'verbose' : -1}
    
    mlflow.set_tag("model", "lightgbm-classifier")
    mlflow.log_artifact("data/vectorizer.pkl")
    mlflow.log_artifact("data/sentpiece_model.model")
    mlflow.log_params(params)
        
    model = LGBMClassifier(**params)
    model.fit(X_train_main, y_train_main)
    predictions = model.predict_proba(X_val)[:, 1]
    roc = roc_auc_score(y_val, predictions)
    custom = custom_metric(y_val, predictions)
    mlflow.log_metric("roc", roc)
    mlflow.log_metric("roc and recall", custom)
    # Log the model
    artifact_path = "model"
    signature = infer_signature(X_train_main, model.predict(X_train_main))
    mlflow.lightgbm.log_model(model, artifact_path, signature=signature)



**Try out optuna**

In [36]:
import optuna

In [45]:

X_train_hyper, _, y_train_hyper, _ = train_test_split(X_train_main, y_train_main, train_size=0.2, random_state=42)

def custom_metric(y_true, y_pred):
    roc_auc = roc_auc_score(y_true, y_pred)
    recall_class_1 = recall_score(y_true, (y_pred > 0.5).astype(int), pos_label=1)
    custom_metric_value = 0.5 * roc_auc + 0.5 * recall_class_1
    return custom_metric_value

def custom_metric_for_lgbm(y_true, y_pred):
    roc_auc = roc_auc_score(y_true, y_pred)
    recall_class_1 = recall_score(y_true, (y_pred > 0.5).astype(int), pos_label=1)
    custom_metric_value = 0.5 * roc_auc + 0.5 * recall_class_1
    return 'custom', custom_metric_value, True

custom_scorer = make_scorer(custom_metric, needs_proba=True)

def objective(trial):
    
    params = {
        'objective': 'cross_entropy',
        'metric': 'custom',
        'n_iter': 2500,
        'verbose': -1,
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 50, 150),
        'max_depth': trial.suggest_int('max_depth', 10, 30),
        'max_bin': trial.suggest_int('max_bin', 500, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1.0),
        'colsample_bynode': trial.suggest_float('colsample_bynode', 0.7, 1.0),
        'lambda_l1': trial.suggest_float('lambda_l1', 0.1, 10),
        'lambda_l2': trial.suggest_float('lambda_l2', 0.1, 10),
    }

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    model = LGBMClassifier(**params)

    scores = cross_val_score(estimator=model, X=X_train_hyper, y=y_train_hyper, cv=cv, scoring=custom_scorer, n_jobs=-1)

    mean_score = np.mean(scores)

    return mean_score

# Run Optuna optimization
study = optuna.create_study(direction='maximize')
with mlflow.start_run():
    mlflow.lightgbm.autolog()
    study.optimize(objective, n_trials=100)
    best_params = study.best_params
# Print the best hyperparameters
print("Best Hyperparameters:", best_params)


[I 2024-01-24 10:29:54,542] A new study created in memory with name: no-name-bfc1223f-0249-4072-a704-3ee73f30e644
[I 2024-01-24 10:36:34,240] Trial 0 finished with value: 0.945172731525088 and parameters: {'min_data_in_leaf': 115, 'max_depth': 22, 'max_bin': 683, 'learning_rate': 0.039635794241083565, 'colsample_bytree': 0.9183151453640777, 'colsample_bynode': 0.7802044674869707, 'lambda_l1': 6.696717439312544, 'lambda_l2': 6.745319601410854}. Best is trial 0 with value: 0.945172731525088.
[I 2024-01-24 10:44:26,160] Trial 1 finished with value: 0.9627546986429119 and parameters: {'min_data_in_leaf': 114, 'max_depth': 22, 'max_bin': 731, 'learning_rate': 0.08871870509146462, 'colsample_bytree': 0.8582571636012482, 'colsample_bynode': 0.7233964936426379, 'lambda_l1': 0.5583732630527912, 'lambda_l2': 3.3860155144753}. Best is trial 1 with value: 0.9627546986429119.
[I 2024-01-24 10:46:51,925] Trial 2 finished with value: 0.9309545620408372 and parameters: {'min_data_in_leaf': 144, 'max_d

In [1]:
best_params

NameError: name 'best_params' is not defined

In [22]:
mlflow.get_experiment_by_name('detect-ai-text').

<Experiment: artifact_location='mlflow-artifacts:/318529209653114071', creation_time=1706007893086, experiment_id='318529209653114071', last_update_time=1706007893086, lifecycle_stage='active', name='detect-ai-text', tags={}>

In [25]:
mlflow.get_tracking_uri()

'http://localhost:5000'

In [26]:
from mlflow.tracking import MlflowClient


client = MlflowClient("http://localhost:5000")

In [27]:
client.

AttributeError: 'MlflowClient' object has no attribute 'list_registered_models'