In [1]:
pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [None]:
# Import libraries
import numpy as np
import pandas as pd
import warnings
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import LabelEncoder
import pickle

warnings.filterwarnings('ignore')

# Load data: read both CSVs, then keep the raw frames untouched and hand
# working copies to the preprocessing steps.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ("/content/Dataset.csv", "/content/Dataset 1.csv")
)
df_train, df_test = train_data.copy(), test_data.copy()

def preprocess_train(df):
    """Clean the raw training frame and derive the model features.

    Steps, in order:
      1. Drop rows that are entirely NaN.
      2. Recover missing "Genetic Disorder" labels from "Disorder Subclass"
         (while that column still exists), then fill labels that are still
         missing with the most frequent label; rows that cannot be labelled
         are dropped.
      3. Drop identifier / unused columns.
      4. Impute remaining nulls per "Genetic Disorder" group (median for
         numeric columns, mode otherwise), falling back to the overall
         column statistic when a group has no observed value.
      5. Replace placeholder strings with 'Missing'.
      6. Collapse the symptom flags into "Symptom Count" and the two blood
         counts into "Total Blood Cell Count".

    Parameters
    ----------
    df : pandas.DataFrame
        Raw training data. Must contain "Genetic Disorder". Not modified.

    Returns
    -------
    pandas.DataFrame
        Preprocessed copy with a fresh RangeIndex; rows are ordered by
        "Genetic Disorder" group, original order preserved within a group.

    Raises
    ------
    KeyError
        If the "Genetic Disorder" column is absent.
    """
    target = "Genetic Disorder"

    # Copy so none of the column assignments below touch the caller's frame.
    df = df.dropna(how='all').copy()

    disorder_mapping = {
        "Leber's hereditary optic neuropathy": "Mitochondrial genetic inheritance disorders",
        "Leigh syndrome": "Mitochondrial genetic inheritance disorders",
        "Mitochondrial myopathy": "Mitochondrial genetic inheritance disorders",
        "Alzheimer's": "Multifactorial genetic inheritance disorders",
        "Cancer": "Multifactorial genetic inheritance disorders",
        "Diabetes": "Multifactorial genetic inheritance disorders",
        "Cystic fibrosis": "Single-gene inheritance diseases",
        "Hemochromatosis": "Single-gene inheritance diseases",
        "Tay-Sachs": "Single-gene inheritance diseases",
    }

    # The subclass determines the disorder, so use it to recover missing labels.
    # This must run BEFORE "Disorder Subclass" is dropped below; doing it after
    # the drop silently turns the mapping into a no-op.
    if "Disorder Subclass" in df.columns:
        inferred = df["Disorder Subclass"].map(disorder_mapping)
        df[target] = df[target].fillna(inferred)

    columns_to_drop = [
        "Patient Id", "Family Name", "Institute Name", "Patient First Name",
        "Father's name", "Location of Institute", "Parental consent",
        "Test 1", "Test 2", "Test 3", "Test 4", "Test 5",
        "Mother's age", "Father's age", "Disorder Subclass"
    ]
    df = df.drop(columns=columns_to_drop, errors='ignore')

    # Symptom flags are treated as categorical so they are mode-imputed
    # rather than median-imputed.
    symptom_cols = ['Symptom 1', 'Symptom 2', 'Symptom 3', 'Symptom 4', 'Symptom 5']
    present_symptoms = [col for col in symptom_cols if col in df.columns]
    for col in present_symptoms:
        df[col] = df[col].astype('object')

    # Simple imputation for labels the subclass could not recover. The mode is
    # empty when every label is missing; those rows are dropped just below.
    target_mode = df[target].mode()
    if not target_mode.empty:
        df[target] = df[target].fillna(target_mode.iloc[0])

    # Drop rows where Genetic Disorder is still missing
    df = df.dropna(subset=[target])

    def fill_value(series):
        """Return the median (numeric) or mode (other) of `series`, or None if undefined."""
        if pd.api.types.is_numeric_dtype(series):
            median_value = series.median()
            return None if pd.isna(median_value) else median_value
        mode_value = series.mode()
        return None if mode_value.empty else mode_value.iloc[0]

    def fill_nulls(frame):
        """Fill nulls in every non-target column of `frame` with that column's statistic."""
        for column in frame.columns:
            if column == target or not frame[column].isnull().any():
                continue
            value = fill_value(frame[column])
            if value is not None:
                frame[column] = frame[column].fillna(value)
        return frame

    # Group-wise imputation. Iterating the groups explicitly (instead of
    # groupby.apply) keeps the target column inside each group on every pandas
    # version and yields the same group-sorted row order.
    parts = [fill_nulls(group.copy()) for _, group in df.groupby(target)]
    df_filled = pd.concat(parts, ignore_index=True) if parts else df.reset_index(drop=True)

    # A column can be entirely null inside one group; fall back to the overall
    # statistic so no NaN leaks into the feature matrix.
    df_filled = fill_nulls(df_filled)

    missing_values = ["No record", "Not available", "Not applicable", "-", "Ambiguous"]
    categorical_cols = df_filled.select_dtypes(include='object').columns
    for col in categorical_cols:
        if col != target:
            df_filled[col] = df_filled[col].replace(missing_values, 'Missing')

    # Count the symptoms flagged as present (1.0), using only the symptom
    # columns that actually exist in this frame.
    if present_symptoms:
        df_filled['Symptom Count'] = df_filled[present_symptoms].eq(1.0).sum(axis=1)
        df_filled = df_filled.drop(columns=present_symptoms)

    blood_cols = ['Blood cell count (mcL)', 'White Blood cell count (thousand per microliter)']
    if all(col in df_filled.columns for col in blood_cols):
        df_filled['Total Blood Cell Count'] = df_filled[blood_cols[0]] + df_filled[blood_cols[1]]
        df_filled = df_filled.drop(columns=blood_cols)

    return df_filled

# Preprocessing function for test data
def preprocess_test(df):
    """Clean the raw (unlabelled) test frame and derive the model features.

    Mirrors the training preprocessing, except that there is no target column,
    so nulls are imputed with whole-column statistics (median for numeric
    columns, mode otherwise) instead of per-disorder statistics.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw test data. Not modified.

    Returns
    -------
    pandas.DataFrame
        Preprocessed copy. The original row index is preserved (rows that are
        entirely NaN are removed), so callers can align the result back to
        `df`, e.g. to look up "Patient Id".
    """
    # Copy so none of the column assignments below touch the caller's frame.
    df = df.dropna(how='all').copy()

    columns_to_drop = [
        "Patient Id", "Family Name", "Institute Name", "Patient First Name",
        "Father's name", "Location of Institute", "Parental consent",
        "Test 1", "Test 2", "Test 3", "Test 4", "Test 5",
        "Mother's age", "Father's age", "Disorder Subclass"
    ]
    df = df.drop(columns=columns_to_drop, errors='ignore')

    # Symptom flags are treated as categorical so they are mode-imputed
    # rather than median-imputed.
    symptom_cols = ['Symptom 1', 'Symptom 2', 'Symptom 3', 'Symptom 4', 'Symptom 5']
    present_symptoms = [col for col in symptom_cols if col in df.columns]
    for col in present_symptoms:
        df[col] = df[col].astype('object')

    # Simple imputation for all columns
    for column in df.columns:
        if not df[column].isnull().any():
            continue
        if pd.api.types.is_numeric_dtype(df[column]):
            df[column] = df[column].fillna(df[column].median())
        else:
            mode_value = df[column].mode()
            # mode() is empty when the whole column is null; leave it as is.
            if not mode_value.empty:
                df[column] = df[column].fillna(mode_value.iloc[0])

    missing_values = ["No record", "Not available", "Not applicable", "-", "Ambiguous"]
    categorical_cols = df.select_dtypes(include='object').columns
    for col in categorical_cols:
        df[col] = df[col].replace(missing_values, 'Missing')

    # Count the symptoms flagged as present (1.0), using only the symptom
    # columns that actually exist in this frame.
    if present_symptoms:
        df['Symptom Count'] = df[present_symptoms].eq(1.0).sum(axis=1)
        df = df.drop(columns=present_symptoms)

    blood_cols = ['Blood cell count (mcL)', 'White Blood cell count (thousand per microliter)']
    if all(col in df.columns for col in blood_cols):
        df['Total Blood Cell Count'] = df[blood_cols[0]] + df[blood_cols[1]]
        df = df.drop(columns=blood_cols)

    return df

# Apply preprocessing.
# The training-side function is `preprocess_train`; the previous call to
# `preprocess_data` referenced a name that is not defined anywhere in this
# notebook and failed with NameError on a fresh kernel.
df_train_preprocessed = preprocess_train(df_train)
df_test_preprocessed = preprocess_test(df_test)

# Define features and target: every preprocessed column except the label.
features = [col for col in df_train_preprocessed.columns if col != 'Genetic Disorder']
x = df_train_preprocessed[features]
y = df_train_preprocessed['Genetic Disorder']

# Encode categorical features (one-hot, first level of each feature dropped)
x = pd.get_dummies(x, drop_first=True)

# Encode target for models requiring numerical labels (e.g., XGBoost, ANN)
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Train-test split: 80/20 hold-out, stratified so each class keeps its share.
x_train, x_test, y_train, y_test = train_test_split(
    x, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

# Define scalers (each one is compared against every model below)
scalers = {
    'StandardScaler': StandardScaler(),
    'MinMaxScaler': MinMaxScaler(),
    'RobustScaler': RobustScaler()
}

# Define models and parameter grids.
# Every estimator that draws random numbers is seeded with the same value used
# for the train/test split, so the comparison table is reproducible across
# Restart & Run All. SVC is left unseeded: it is deterministic unless
# probability estimates are enabled.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'SVM': SVC(),
    'CatBoost': CatBoostClassifier(verbose=0, random_seed=42),
    'XGBoost': XGBClassifier(random_state=42),
    'LightGBM': LGBMClassifier(random_state=42)
}

# One grid per entry in `models`; the keys must match exactly because the
# search loop looks the grid up by model name.
param_grids = {
    'Logistic Regression': {
        'C': [0.01, 0.1, 1, 10],
        'solver': ['liblinear', 'saga'],
        'penalty': ['l1', 'l2']
    },
    'Random Forest': {
        'n_estimators': [50, 100, 150],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10]
    },
    'SVM': {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf'],
        'gamma': ['scale', 'auto']
    },
    'CatBoost': {
        'iterations': [100, 200],
        'depth': [4, 6, 8],
        'learning_rate': [0.01, 0.1]
    },
    'XGBoost': {
        'n_estimators': [100, 150, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'gamma': [0, 0.1, 0.2]
    },
    'LightGBM': {
        'n_estimators': [100, 150, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7]
    }
}

# ANN model (not included in GridSearch due to different training process)
def build_ann(input_dim, n_classes=None):
    """Build and compile a small feed-forward softmax classifier.

    Architecture: Dense(64, relu) -> Dropout(0.3) -> Dense(32, relu) ->
    Dropout(0.3) -> Dense(n_classes, softmax), compiled with Adam and sparse
    categorical cross-entropy (labels are integer class ids).

    Parameters
    ----------
    input_dim : int
        Number of input features.
    n_classes : int, optional
        Width of the softmax output layer. When omitted, it is taken from the
        number of distinct values in the notebook-level `y_encoded`, which is
        what this function previously always did.

    Returns
    -------
    tensorflow.keras.Model
        The compiled, untrained model.
    """
    # Imported here because the notebook's import cell only brings in Dense and
    # Dropout; an explicit Input layer replaces the legacy `input_dim` kwarg.
    from tensorflow.keras.layers import Input

    if n_classes is None:
        n_classes = len(np.unique(y_encoded))

    model = Sequential([
        Input(shape=(input_dim,)),
        Dense(64, activation='relu'),
        Dropout(0.3),
        Dense(32, activation='relu'),
        Dropout(0.3),
        Dense(n_classes, activation='softmax')
    ])
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

# Perform grid search and evaluation
grid_search_results = []
# Best fitted estimator for every (model, scaler) pair. Without this, the only
# estimator still reachable after the loop is the one fitted last, which is
# not necessarily the one that ranks first in the results table.
fitted_models = {}

for model_name, model in models.items():
    print(f'Performing Grid Search for {model_name}...')
    param_grid = param_grids[model_name]
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='f1_weighted', n_jobs=-1, verbose=1)

    for scaler_name, scaler in scalers.items():
        # Scaler statistics come from the training split only.
        x_train_scaled = scaler.fit_transform(x_train)
        x_test_scaled = scaler.transform(x_test)

        grid_search.fit(x_train_scaled, y_train)

        best_model = grid_search.best_estimator_
        best_params = grid_search.best_params_
        best_score = grid_search.best_score_
        fitted_models[(model_name, scaler_name)] = best_model

        y_train_pred = best_model.predict(x_train_scaled)
        y_test_pred = best_model.predict(x_test_scaled)

        train_f1_weighted = f1_score(y_train, y_train_pred, average='weighted')
        test_f1_weighted = f1_score(y_test, y_test_pred, average='weighted')
        train_f1_macro = f1_score(y_train, y_train_pred, average='macro')
        test_f1_macro = f1_score(y_test, y_test_pred, average='macro')

        grid_search_results.append({
            'Model': model_name,
            'Scaler': scaler_name,
            'Best Params': best_params,
            'Best CV Weighted F1 Score': best_score,
            'Train Weighted F1 Score': train_f1_weighted,
            'Test Weighted F1 Score': test_f1_weighted,
            'Train Macro F1 Score': train_f1_macro,
            'Test Macro F1 Score': test_f1_macro
        })

        print(f'Best Params for {model_name} with {scaler_name}: {best_params}')
        print(f'Best CV Weighted F1 Score: {best_score:.4f}')
        print(f'Train Weighted F1 Score: {train_f1_weighted:.4f}')
        print(f'Test Weighted F1 Score: {test_f1_weighted:.4f}')
        print(f'Train Macro F1 Score: {train_f1_macro:.4f}')
        print(f'Test Macro F1 Score: {test_f1_macro:.4f}')
        print('-' * 40)

# Train ANN separately
print("Training ANN...")
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

ann_model = build_ann(x_train.shape[1])
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
# Early stopping monitors a slice of the TRAINING data. Validating on the
# hold-out split would let the test set pick the stopping epoch and inflate
# the test score this model is ranked by.
history = ann_model.fit(
    x_train_scaled, y_train,
    validation_split=0.2,
    epochs=100,
    batch_size=32,
    callbacks=[early_stopping],
    verbose=0
)

y_train_pred = np.argmax(ann_model.predict(x_train_scaled), axis=1)
y_test_pred = np.argmax(ann_model.predict(x_test_scaled), axis=1)

train_f1_weighted = f1_score(y_train, y_train_pred, average='weighted')
test_f1_weighted = f1_score(y_test, y_test_pred, average='weighted')
train_f1_macro = f1_score(y_train, y_train_pred, average='macro')
test_f1_macro = f1_score(y_test, y_test_pred, average='macro')

grid_search_results.append({
    'Model': 'ANN',
    'Scaler': 'StandardScaler',
    'Best Params': 'N/A',
    # NaN rather than the string 'N/A' so the CV-score column stays numeric.
    'Best CV Weighted F1 Score': np.nan,
    'Train Weighted F1 Score': train_f1_weighted,
    'Test Weighted F1 Score': test_f1_weighted,
    'Train Macro F1 Score': train_f1_macro,
    'Test Macro F1 Score': test_f1_macro
})

print('ANN with StandardScaler:')
print(f'Train Weighted F1 Score: {train_f1_weighted:.4f}')
print(f'Test Weighted F1 Score: {test_f1_weighted:.4f}')
print(f'Train Macro F1 Score: {train_f1_macro:.4f}')
print(f'Test Macro F1 Score: {test_f1_macro:.4f}')
print('-' * 40)

# Display results
grid_search_df = pd.DataFrame(grid_search_results)
grid_search_df = grid_search_df.sort_values(by='Test Weighted F1 Score', ascending=False).reset_index(drop=True)
print("\nFinal Results:")
display(grid_search_df)

# Save the best model (based on Test Weighted F1 Score).
# NOTE(review): ranking on the hold-out split means the reported test score of
# the winner is optimistic; rank on the CV score if an unbiased estimate matters.
best_idx = grid_search_df['Test Weighted F1 Score'].idxmax()
best_model_name = grid_search_df.loc[best_idx, 'Model']
best_scaler_name = grid_search_df.loc[best_idx, 'Scaler']
if best_model_name != 'ANN':
    # Look up the estimator that actually produced the winning row.
    best_model = fitted_models[(best_model_name, best_scaler_name)]
    with open('best_model.pkl', 'wb') as f:
        pickle.dump(best_model, f)
else:
    ann_model.save('best_model.h5')

# Predict on test set for Kaggle submission.
# Dummies are built WITHOUT drop_first and then aligned to the training
# columns: the training frame already fixed which level of each feature is
# the dropped baseline, and letting the test frame drop its own first level
# could remove a different one.
x_test_final = pd.get_dummies(df_test_preprocessed[features])
x_test_final = x_test_final.reindex(columns=x_train.columns, fill_value=0)

# Scale with the scaler the winning model was trained with. Every entry of
# `scalers` was last fitted on x_train inside the search loop; the ANN uses
# its own StandardScaler instance.
final_scaler = scaler if best_model_name == 'ANN' else scalers[best_scaler_name]
x_test_final_scaled = final_scaler.transform(x_test_final)

if best_model_name != 'ANN':
    # ravel(): some estimators return an (n, 1) column for multiclass output.
    submission_pred = np.asarray(best_model.predict(x_test_final_scaled)).ravel()
else:
    submission_pred = np.argmax(ann_model.predict(x_test_final_scaled), axis=1)

# Decode predictions back to original labels
y_test_pred_labels = label_encoder.inverse_transform(submission_pred)

# Create submission file.
# Preprocessing may drop all-NaN rows, so ids are taken for exactly the rows
# that were predicted instead of assuming the raw frame has the same length.
submission = pd.DataFrame({
    'Patient Id': df_test.loc[df_test_preprocessed.index, 'Patient Id'].to_numpy(),
    'Genetic Disorder': y_test_pred_labels
})
submission.to_csv('submission.csv', index=False)
print("Submission file created: submission.csv")

Initial columns: ['Patient Id', 'Patient Age', "Genes in mother's side", 'Inherited from father', 'Maternal gene', 'Paternal gene', 'Blood cell count (mcL)', 'Patient First Name', 'Family Name', "Father's name", "Mother's age", "Father's age", 'Institute Name', 'Location of Institute', 'Status', 'Respiratory Rate (breaths/min)', 'Heart Rate (rates/min', 'Test 1', 'Test 2', 'Test 3', 'Test 4', 'Test 5', 'Parental consent', 'Follow-up', 'Gender', 'Birth asphyxia', 'Autopsy shows birth defect (if applicable)', 'Place of birth', 'Folic acid details (peri-conceptional)', 'H/O serious maternal illness', 'H/O radiation exposure (x-ray)', 'H/O substance abuse', 'Assisted conception IVF/ART', 'History of anomalies in previous pregnancies', 'No. of previous abortion', 'Birth defects', 'White Blood cell count (thousand per microliter)', 'Blood test result', 'Symptom 1', 'Symptom 2', 'Symptom 3', 'Symptom 4', 'Symptom 5', 'Genetic Disorder', 'Disorder Subclass']
Initial shape: (22083, 45)
Dropping