In [1]:
from sklearn.utils import resample
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import joblib
import numpy as np
import pandas as pd

In [2]:
def split_data(X, y, split, trial):
    """Produce a stratified train/test partition of ``X`` and ``y``.

    Args:
        X: Feature table to split.
        y: Labels aligned with ``X``; also used as the stratification key.
        split: Fraction of rows kept for training. The complement
            ``1 - split`` is what gets passed on as ``test_size``.
        trial: Seed forwarded as ``random_state`` so a given trial number
            always yields the same partition.

    Returns:
        The result of ``train_test_split``, unpackable as
        ``X_train, X_test, y_train, y_test``.
    """
    held_out_fraction = 1 - split
    partitions = train_test_split(
        X,
        y,
        test_size=held_out_fraction,
        random_state=trial,
        stratify=y,
    )
    return partitions

def balance_training_data(X_train, y_train, trial):
    """Resample the 'Antrophic' and 'Mistral' rows up to the 'OpenAI' row count.

    Features and labels are joined column-wise, the two named classes are
    each resampled with replacement to ``len(OpenAI rows)``, the three groups
    are stacked, shuffled, and split back into features and labels. Rows
    carrying any other label are not included in the result.

    Args:
        X_train: Training features. Must contain an ``'Unnamed: 0'`` column,
            which is removed from the returned features.
        y_train: Training labels; after the column-wise join they are looked
            up as column ``'y'`` (assumes the Series is named ``'y'`` --
            verify against the caller).
        trial: Seed used for both resampling calls and for the final shuffle.

    Returns:
        Tuple ``(X_train_balanced, y_train_balanced)`` with a fresh
        0..n-1 index.
    """
    combined = pd.concat([X_train, y_train], axis=1)
    labels = combined['y']

    # 'OpenAI' rows are kept untouched and define the target size per class.
    reference_rows = combined[labels == 'OpenAI']
    target_size = len(reference_rows)

    upsampled_groups = [
        resample(
            combined[labels == class_name],
            replace=True,
            n_samples=target_size,
            random_state=trial,
        )
        for class_name in ('Antrophic', 'Mistral')
    ]

    shuffled = (
        pd.concat([reference_rows, *upsampled_groups])
        .sample(frac=1, random_state=trial)
        .reset_index(drop=True)
    )

    features = shuffled.drop(columns=['y', 'Unnamed: 0'])
    targets = shuffled['y']
    return features, targets


In [3]:
def preprocess_data(X_train, X_test, scaler_path="scaler.pkl"):
    """Standardise the float64 features of a train/test pair.

    The feature set is decided from the training frame alone and the very
    same column list is then applied to the test frame, so the two matrices
    always have matching columns in matching order. The scaler is fitted on
    the training rows only and merely applied to the test rows.

    Args:
        X_train: Training features as a DataFrame.
        X_test: Test features as a DataFrame; must contain every float64
            column present in ``X_train``.
        scaler_path: Where to persist the fitted scaler with ``joblib``.
            Defaults to ``"scaler.pkl"``; pass ``None`` to skip writing.

    Returns:
        Tuple ``(X_train_scaled, X_test_scaled, scaler)`` where the first two
        are the scaler's outputs and ``scaler`` is the fitted
        ``StandardScaler``.

    Raises:
        KeyError: If ``X_test`` lacks one of the selected training columns.
    """
    # Keep only float64 columns, chosen from the training data.
    X_train = X_train.select_dtypes(include=['float64'])
    # Reuse the training column list instead of filtering the test frame by
    # dtype independently, which could yield a different set of columns.
    X_test = X_test[X_train.columns]

    # Scale numerical features (fit on train only to avoid leakage).
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Save scaler
    if scaler_path is not None:
        joblib.dump(scaler, scaler_path)

    return X_train, X_test, scaler

In [4]:
def apply_pca(X_train, X_test, dim):
    """Optionally project both matrices with a PCA fitted on the training data.

    Args:
        X_train: Training matrix used to fit the projection.
        X_test: Test matrix, transformed with the already-fitted projection.
        dim: Passed as ``n_components`` to ``PCA``. Any value that is not
            greater than zero disables the projection.

    Returns:
        Tuple ``(X_train, X_test, pca)``. When the projection is disabled the
        inputs are returned unchanged and ``pca`` is ``None``.
    """
    # Guard clause: nothing to do unless a positive dimension was requested.
    if not dim > 0:
        return X_train, X_test, None

    reducer = PCA(n_components=dim)
    train_projected = reducer.fit_transform(X_train)
    test_projected = reducer.transform(X_test)
    return train_projected, test_projected, reducer

def train_and_evaluate(classifier, X_train, y_train, X_test, y_test):
    """Fit ``classifier`` on the training data and score it on the test data.

    Args:
        classifier: Estimator exposing ``fit`` and ``predict``. It is fitted
            in place, so the caller's object is modified.
        X_train: Training features.
        y_train: Training labels.
        X_test: Test features.
        y_test: True test labels.

    Returns:
        The value of ``accuracy_score(y_test, predictions)``.
    """
    classifier.fit(X_train, y_train)
    return accuracy_score(y_test, classifier.predict(X_test))

In [5]:
def testClassifier(classifier, X, y, dim=0, split=0.7, ntrials=100, save_best_model=True, model_filename="Best_Model.pkl"):
    """Repeatedly split, balance, preprocess, fit and score a classifier.

    Each trial uses its own number as the random seed for the split and the
    balancing step. The model, scaler and PCA belonging to the single best
    trial are tracked together so that the artefacts written to disk are
    mutually consistent.

    Args:
        classifier: Estimator with ``fit``/``predict``. It is refit in place
            on every trial, so after this call it holds the last trial's fit.
        X: Feature DataFrame; must include an ``'Unnamed: 0'`` column.
        y: Labels aligned with ``X``.
        dim: Number of PCA components; values not greater than zero skip PCA.
        split: Fraction of rows used for training in each trial.
        ntrials: Number of independent trials.
        save_best_model: When true, persist the best trial's model, scaler
            and (if used) PCA with ``joblib``.
        model_filename: Destination file for the best model.

    Returns:
        Tuple ``(mean_accuracy, std_accuracy)`` in percent over all trials.

    Side effects:
        Prints progress every tenth trial and a summary line. When
        ``save_best_model`` is true and some trial scored above zero, writes
        ``model_filename``, ``"scaler.pkl"`` and, if PCA was used,
        ``"pca.pkl"``.
    """
    # Local import: only needed to snapshot the best fitted estimator.
    import copy

    means = np.zeros(ntrials)
    best_accuracy = 0
    best_model = None
    best_scaler = None
    best_pca = None

    for trial in range(ntrials):
        # Split
        X_train, X_test, y_train, y_test = split_data(X, y, split, trial)

        # Balance (training portion only; the test set keeps its natural mix)
        X_train_balanced, y_train_balanced = balance_training_data(X_train, y_train, trial)

        X_test = X_test.drop(columns=['Unnamed: 0'])
        # For a Series target pandas ignores the `columns` argument, so this
        # line only has an effect if `y` is passed as a DataFrame.
        y_test = y_test.drop(columns=['Unnamed: 0'])

        # Preprocess
        X_train_balanced, X_test, scaler = preprocess_data(X_train_balanced, X_test)

        # PCA
        X_train_balanced, X_test, pca = apply_pca(X_train_balanced, X_test, dim)

        # Classifier
        accuracy = train_and_evaluate(classifier, X_train_balanced, y_train_balanced, X_test, y_test)
        means[trial] = accuracy * 100  # Convert to percentage

        # Remember the best trial's full pipeline.
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            # `classifier` is refit on the next trial, so keeping a plain
            # reference would silently turn "best" into "last". Snapshot it.
            best_model = copy.deepcopy(classifier)
            # Scaler and PCA are fresh objects per trial; keep the ones that
            # were fitted alongside this model.
            best_scaler = scaler
            best_pca = pca

        # Print
        if trial % 10 == 0:
            print(f"Trial {trial}: Accuracy = {accuracy * 100:.3f}%")

    mean_accuracy = np.mean(means)
    std_accuracy = np.std(means)

    print(f"Overall mean: {mean_accuracy:.3f}%, sd: {std_accuracy:.3f}%")

    # Save the best model together with the preprocessing objects it was
    # trained with, so inference applies matching transforms.
    if save_best_model and best_model is not None:
        joblib.dump(best_model, model_filename)
        joblib.dump(best_scaler, "scaler.pkl")
        if best_pca is not None:
            joblib.dump(best_pca, "pca.pkl")
        print(f"Best model: {best_accuracy * 100:.3f}%")

    return mean_accuracy, std_accuracy

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Load the labelled training set.
file_path = "TrainOnMe.csv"
data = pd.read_csv(file_path)

# Separate the target column 'y' from the remaining feature columns.
# (The train/test split itself is performed later, per trial.)
y = data['y']
X = data.drop(columns='y')

In [7]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with a fixed seed so the estimator itself is reproducible.
rf_classifier = RandomForestClassifier(random_state=42, n_estimators=100)

# Run the repeated evaluation (PCA down to 11 components, 70 % of the rows
# for training, 100 trials) and persist the best trial's model.
mean_acc, std_acc = testClassifier(
    rf_classifier, X, y,
    dim=11,
    split=0.7,
    ntrials=100,
    save_best_model=True,
    model_filename="RandomForest_BestModel.pkl",
)

# Summary of the accuracy distribution across trials, in percent.
print(f"Mean: {mean_acc:.3f}%")
print(f"sd: {std_acc:.3f}%")


Trial 0: Accuracy = 79.280%
Trial 10: Accuracy = 79.614%
Trial 20: Accuracy = 79.813%
Trial 30: Accuracy = 78.947%
Trial 40: Accuracy = 80.546%
Trial 50: Accuracy = 79.813%
Trial 60: Accuracy = 78.148%
Trial 70: Accuracy = 79.014%
Trial 80: Accuracy = 80.546%
Trial 90: Accuracy = 78.947%
Overall mean: 79.843%, sd: 1.112%
Best model: 83.411%
Mean: 79.843%
sd: 1.112%


In [9]:
import pandas as pd
import joblib

# Evaluation set; its first column is discarded before prediction
# (presumably the exported row index -- confirm against the CSV).
eval_data = pd.read_csv("EvaluateOnMe.csv")
eval_data_processed = eval_data.drop(columns=eval_data.columns[0])

# Artefacts written by the training step: fitted scaler, fitted PCA, model.
# NOTE: joblib.load unpickles its input; only load files you produced.
scaler = joblib.load("scaler.pkl")
pca = joblib.load("pca.pkl")
model = joblib.load("RandomForest_BestModel.pkl")

def preprocess_eval_data(eval_data, scaler):
    """Select the scaler's input columns from ``eval_data`` and scale them.

    When the scaler exposes ``feature_names_in_`` (the column names it saw
    while being fitted), exactly those columns are taken, in that order.
    This keeps a feature that happens to be parsed with a different dtype in
    the evaluation file (e.g. int64 instead of float64) from being silently
    dropped. Without that attribute the function falls back to keeping every
    float64 column.

    Args:
        eval_data: Evaluation features as a DataFrame.
        scaler: Fitted transformer providing ``transform`` and, optionally,
            ``feature_names_in_``.

    Returns:
        Whatever ``scaler.transform`` returns for the selected columns.

    Raises:
        KeyError: If a column listed in ``feature_names_in_`` is missing
            from ``eval_data``.
    """
    fitted_columns = getattr(scaler, "feature_names_in_", None)

    if fitted_columns is not None:
        # Mirror the training-time feature set exactly.
        eval_data = eval_data[list(fitted_columns)]
    else:
        # Keep only float64 columns
        eval_data = eval_data.select_dtypes(include=['float64'])

    # Scale numerical features
    return scaler.transform(eval_data)

# Apply the transforms in the order used for training: scale, then project.
eval_data_processed = pca.transform(
    preprocess_eval_data(eval_data_processed, scaler)
)

predictions = model.predict(eval_data_processed)

# Write one predicted label per line.
with open("prediction.txt", "w") as file:
    file.writelines(f"{pred}\n" for pred in predictions)

print("Predictions successfully saved to prediction.txt")


Predictions successfully saved to prediction.txt
