# Machine-Generated Text Detection

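This notebook explores the detection of machine-generated text. It is organized into three parts: a bag-of-words Naive Bayes baseline, a deep learning based model, and an LLM based model, each followed by its evaluation.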

## Baseline:

In [8]:
from datasets import Dataset
import pandas as pd
import evaluate
import numpy as np
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding, AutoTokenizer, set_seed
import os
from sklearn.model_selection import train_test_split
import torch
from scipy.special import softmax
import logging

### Baseline Model:

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
import logging
import os

# Read data and split into train/validation/test
def get_data(train_path, test_path, random_seed):
    """
    Load the train and test JSON-lines files and carve out a validation split.

    Args:
        train_path: Path to the training file (one JSON record per line).
        test_path: Path to the test file (one JSON record per line).
        random_seed: Seed passed to ``train_test_split`` so the split is
            reproducible.

    Returns:
        A ``(train_df, val_df, test_df)`` tuple. ``val_df`` is 20% of the
        rows read from ``train_path``, stratified on the ``'label'`` column;
        ``train_df`` is the remaining 80%; ``test_df`` is the test file as read.
    """
    full_train = pd.read_json(train_path, lines=True)
    held_out_test = pd.read_json(test_path, lines=True)

    # Stratify on the label so both partitions keep the same class balance.
    train_part, val_part = train_test_split(
        full_train,
        test_size=0.2,
        stratify=full_train['label'],
        random_state=random_seed,
    )
    return train_part, val_part, held_out_test

# Preprocess text data
def preprocess_function(examples):
    """Return the ``"text"`` entry of *examples* unchanged (no cleaning is applied)."""
    text_field = examples["text"]
    return text_field

# Train Naive Bayes model
def fine_tune(train_df, valid_df):
    """
    Fit the bag-of-words Naive Bayes baseline and report validation metrics.

    Args:
        train_df: Frame with ``'text'`` and ``'label'`` columns used for fitting.
        valid_df: Frame with ``'text'`` and ``'label'`` columns used for evaluation.

    Returns:
        A ``(model, vectorizer)`` tuple: the fitted ``MultinomialNB`` and the
        ``CountVectorizer`` whose vocabulary was learned from ``train_df``.

    The validation classification report is emitted through ``logging.info``.
    """
    train_texts, train_labels = train_df['text'], train_df['label']
    valid_texts, valid_labels = valid_df['text'], valid_df['label']

    # The vocabulary is learned from the training texts only; the validation
    # texts are merely projected onto it.
    vectorizer = CountVectorizer()
    train_matrix = vectorizer.fit_transform(train_texts)
    valid_matrix = vectorizer.transform(valid_texts)

    # fit() returns the estimator itself, so construction and training chain.
    model = MultinomialNB().fit(train_matrix, train_labels)

    report = classification_report(valid_labels, model.predict(valid_matrix))
    logging.info(f"Validation Classification Report:\n{report}")

    return model, vectorizer

# Test Naive Bayes model
def test(test_df, model, vectorizer):
    """
    Score a fitted model on the test set.

    Args:
        test_df: Frame with ``'text'`` and ``'label'`` columns.
        model: Fitted classifier exposing ``predict``.
        vectorizer: Fitted vectorizer exposing ``transform``.

    Returns:
        A ``(report, y_pred)`` tuple: the classification report produced by
        ``classification_report`` and the model's predictions for ``test_df``.

    The report is also emitted through ``logging.info``.
    """
    texts, gold_labels = test_df['text'], test_df['label']

    # Reuse the already-fitted vectorizer, then predict on the result.
    y_pred = model.predict(vectorizer.transform(texts))

    report = classification_report(gold_labels, y_pred)
    logging.info(f"Test Classification Report:\n{report}")

    return report, y_pred

# Main script
random_seed = 0
train_path = 'dataset/subtaskA_train_monolingual.jsonl'
test_path = 'dataset/subtaskA_monolingual_gold.jsonl'
prediction_path = 'baseline_Results.jsonl'

# The root logger defaults to WARNING, which silently drops the INFO-level
# classification reports logged during training and testing. basicConfig()
# installs a handler when none exists; the explicit setLevel() covers
# environments where a handler is already present and basicConfig() is a no-op.
logging.basicConfig(level=logging.INFO)
logging.getLogger().setLevel(logging.INFO)

# Fail early, with a clear message, if either input file is missing.
for required_path in (train_path, test_path):
    if not os.path.exists(required_path):
        logging.error(f"File doesn't exist: {required_path}")
        raise ValueError(f"File doesn't exist: {required_path}")

# Get data for train/dev/test sets
train_df, valid_df, test_df = get_data(train_path, test_path, random_seed)

# Train the Naive Bayes model (also logs the validation report)
model, vectorizer = fine_tune(train_df, valid_df)

# Test the Naive Bayes model (also logs the test report)
results, predictions = test(test_df, model, vectorizer)

# Save predictions to file as JSON lines, one {"id", "label"} record per row
predictions_df = pd.DataFrame({'id': test_df['id'], 'label': predictions})
predictions_df.to_json(prediction_path, lines=True, orient='records')


### Baseline Evaluation:

## Deep Learning:

### Deep Learning Based Model:

### Deep Learning Evaluation:

## LLM:

### LLM Based Model:

### LLM Evaluation: