In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import pickle
import os  # Import os module for directory operations


In [12]:
# Load the training and testing datasets
train_df = pd.read_csv("train_combined_final.csv")
test_df = pd.read_csv("test_final.csv")

# Define the models to be trained
models = {
    "AB": AdaBoostClassifier(),
    "RF": RandomForestClassifier(),
    "LDA": LinearDiscriminantAnalysis(),
    "KNN": KNeighborsClassifier(),
    "SVM": SVC(probability=True),
    "DT": DecisionTreeClassifier(),
    "GNB": GaussianNB(),
    "LR": LogisticRegression(max_iter=1000)
}



In [13]:
# Columns to drop from the training and testing datasets
# We check if the columns exist before dropping them to prevent errors
drop_train_cols = [col for col in ["autism", "Class/ASD", "result", "ID", "used_app_before", "age_desc",
                                   "ethnicity", "country_of_res", "relation"] if col in train_df.columns]
drop_test_cols = [col for col in ["autism", "result", "ID", "used_app_before", "age_desc",
                                  "ethnicity", "country_of_res", "relation"] if col in test_df.columns]

# Separate features (X) and target variable (y) for training and testing sets
X_train = train_df.drop(columns=drop_train_cols)
y_train = train_df['autism']

X_test = test_df.drop(columns=drop_test_cols)
y_test = test_df['autism']

# Create a list of the original training columns for later use
template_columns = X_train.columns.tolist()

# Combine training and testing data for consistent preprocessing
combined = pd.concat([X_train, X_test]).copy()

# Normalize string formatting in the combined dataset
for col in combined.select_dtypes(include='object').columns:
    combined[col] = combined[col].str.lower().str.strip()

# Identify categorical and numerical columns in the combined dataset
categorical_cols = combined.select_dtypes(include='object').columns
numerical_cols = combined.select_dtypes(include='number').columns

# Create a ColumnTransformer to apply different preprocessing to different columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', MinMaxScaler(), numerical_cols),  # Scale numerical columns
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols)  # One-hot encode categorical columns
    ])

# Fit the preprocessor on the combined data and transform it
combined_scaled = preprocessor.fit_transform(combined)

# Get the feature names after OneHotEncoding
categorical_feature_names = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_cols)
all_feature_names = numerical_cols.tolist() + list(categorical_feature_names)

print(all_feature_names)  # Print the list of all feature names

# Convert the processed data back into a DataFrame
combined_scaled = pd.DataFrame(combined_scaled, columns=all_feature_names)

# Split the processed data back into training and testing sets
X_train_scaled = combined_scaled.iloc[:len(X_train)]
X_test_scaled = combined_scaled.iloc[len(X_train):]

['A1_Score', 'A2_Score', 'A3_Score', 'A4_Score', 'A5_Score', 'A6_Score', 'A7_Score', 'A8_Score', 'A9_Score', 'A10_Score', 'age', 'gender_f', 'gender_m', 'jaundice_no', 'jaundice_yes']


In [14]:
# --- Optional: Create an additional test split from the training data ---
# This is useful for validating model performance without relying solely on the external test set.
# The split is performed with a random state to ensure reproducibility if desired.
"""
X_train_internal, X_val_internal, y_train_internal, y_val_internal = train_test_split(
    X_train_scaled, y_train, test_size=0.2, random_state=42  # Adjust test_size as needed
)
"""

'\nX_train_internal, X_val_internal, y_train_internal, y_val_internal = train_test_split(\n    X_train_scaled, y_train, test_size=0.2, random_state=42  # Adjust test_size as needed\n)\n'

In [15]:
# ----------------------------------------------------------------------

# Train and evaluate the models
model_accuracies = {}
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    preds = model.predict(X_test_scaled)
    acc = accuracy_score(y_test, preds)
    model_accuracies[name] = acc

    # Save each trained model
    with open(f"models/{name}_model.pkl", "wb") as f:
        pickle.dump(model, f)

In [16]:
# Save the preprocessor
with open("preprocessor.pkl", "wb") as f:
    pickle.dump(preprocessor, f)

# Save the column names used for training
with open("template_columns.pkl", "wb") as f:
    pickle.dump(combined_scaled.columns, f)

In [17]:
model_accuracies

{'AB': 0.735,
 'RF': 0.705,
 'LDA': 0.71,
 'KNN': 0.675,
 'SVM': 0.7,
 'DT': 0.705,
 'GNB': 0.705,
 'LR': 0.715}