In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the competition training set, then report its column count and the
# number of null values in each column.
train_csv_path = "/kaggle/input/alzheimers-disease-risk-prediction-eu-business/train.csv"
df_train = pd.read_csv(train_csv_path)

n_train_columns = df_train.shape[1]
train_null_counts = df_train.isnull().sum()
print(f"The number of columns/features: {n_train_columns}")
print(f"The number of null:\n{train_null_counts}")

In [None]:
# Keep column positions 1..34 of the raw frame (position 0 is skipped) so that
# "DoctorInCharge" can be inspected before deciding whether to keep it.
# NOTE(review): the positional slice assumes "DoctorInCharge" sits inside
# positions 1..34 of train.csv — confirm against the CSV header.
data_train = df_train.iloc[:, 1:35]

# Print the distinct values first: the preview has to be the cell's LAST
# expression, otherwise Jupyter silently discards it.
print(data_train["DoctorInCharge"].unique())
data_train.head(5)

In [None]:
# Omit "DoctorInCharge" from list of features for data analysis.
# Positions 1..33 of the raw frame are kept (position 0 is skipped as well).
analysis_column_positions = slice(1, 34)
data_train = df_train.iloc[:, analysis_column_positions]
data_train.head(5)

## Look into Data
### Define type of columns(data)

1. Nominal Categorical - Matthews Correlation Coefficient
    1) Gender
    2) EducationLevel
    3) Ethnicity
    4) Smoking
    5) FamilyHistoryAlzheimers,
    6) CardiovascularDisease,
    7) Diabetes,
    8) Depression,
    9) HeadInjury,
    10) Hypertension,
    11) MemoryComplaints
    12) BehavioralProblems
    13) Confusion
    14) Disorientation
    15) PersonalityChanges
    16) DifficultyCompletingTasks
    17) Forgetfulness
    <br>
    <br>
2. Ordinal Categorical - Cramér's V
    1) Age
    <br>
    <br>
3. Continuous Numerical - Point-biserial Correlation
    1) AlcoholConsumption,
    2) PhysicalActivity,
    3) DietQuality,
    4) SleepQuality,
    5) MMSE: Mini-Mental State Exam score (0–30).
    6) FunctionalAssessment: Functional score (0–10).
    7) ADL: Activities of Daily Living score (0–10).
    8) BMI
    9) SystolicBP
    10) DiastolicBP
    11) CholesterolTotal
    12) CholesterolLDL
    13) CholesterolHDL
    14) CholesterolTriglycerides
    <br>
    <br>
4. Diagnosis - Nominal Categorical

#### 1) Pearson correlation (equal to MCC for pairs of binary features) across all columns/features

##### Features highly related to Diagnosis
1. FunctionalAssessment(-0.38)
2. ADL(-0.34)
3. MemoryComplaints(0.3)
4. MMSE(-0.22)
5. BehavioralProblems(0.21)

Unexpectedly, "Age" is not related to the Alzheimer's diagnosis, so we have to check the distribution of values in "Age".

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise correlation between all columns of data_train (pandas' default method).
data_train_corr = data_train.corr()

# Global figure size; it stays in effect for later plots until overridden.
plt.rcParams["figure.figsize"] = (20, 15)

# Hide the upper triangle (diagonal included) so every pair is drawn only once.
mask = np.triu(np.ones(data_train_corr.shape))
ax = sns.heatmap(
    data_train_corr,
    mask=mask,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    vmin=-1,
    vmax=1,
)

#### 2) Matthews Correlations(MCC) for Nominal data

##### Features highly related to Diagnosis
1. MemoryComplaints(0.30)
2. BehavioralProblems(0.21)

In [None]:
# Columns treated as ordinal or continuous; dropping them leaves the columns
# that this notebook analyses as nominal (plus anything else not listed here).
drop_Ord_Con = [
    "Age",
    "AlcoholConsumption", "PhysicalActivity", "DietQuality", "SleepQuality",
    "MMSE", "FunctionalAssessment", "ADL",
    "BMI", "SystolicBP", "DiastolicBP",
    "CholesterolTotal", "CholesterolLDL", "CholesterolHDL", "CholesterolTriglycerides",
]
data_train_Nom = data_train.drop(columns=drop_Ord_Con)

# Pearson correlation matrix of the remaining columns.
data_train_Nom_corr = data_train_Nom.corr(method="pearson")

In [None]:
# Smaller global figure size for the reduced correlation matrix.
plt.rcParams["figure.figsize"] = (10, 6)

# Hide the upper triangle (diagonal included) so every pair is drawn only once.
mask = np.triu(np.ones(data_train_Nom_corr.shape))
ax = sns.heatmap(
    data_train_Nom_corr,
    mask=mask,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    vmin=-1,
    vmax=1,
)

#### 3) Cramer's V for Ordinal data

Age is not related to Diagnosis at all

In [None]:
# Keep only the first and the last column of data_train.
# NOTE(review): presumably these are "Age" and "Diagnosis" — confirm against the header.
first_and_last_positions = [0, -1]
data_train_Ord = data_train.iloc[:, first_and_last_positions]
data_train_Ord.head(5)

In [None]:
# List the distinct values found in the "Age" column of the raw training frame.
df_train["Age"].unique()

#### 4) Point-biserial Correlation for Continuous data

##### Features highly related to Diagnosis
1. FunctionalAssessment(-0.38)
2. ADL(-0.34)
3. MMSE(-0.22)

In [None]:
# Columns treated as nominal or ordinal; dropping them leaves the columns that
# this notebook analyses as continuous (plus anything else not listed here).
drop_Nom_Ord = [
    "Gender", "EducationLevel", "Ethnicity", "Smoking",
    "FamilyHistoryAlzheimers", "CardiovascularDisease", "Diabetes",
    "Depression", "HeadInjury", "Hypertension",
    "MemoryComplaints", "BehavioralProblems", "Confusion", "Disorientation",
    "PersonalityChanges", "DifficultyCompletingTasks", "Forgetfulness",
    "Age",
]
data_train_Con = data_train.drop(columns=drop_Nom_Ord)
data_train_Con

### Feature Normalization
with Min-Max Scaling (0~1)

However, the scores without Min-Max scaling (V4) and with Min-Max scaling (V5) are the same, "0.74305".
Thus, what matters is the choice of model and features, rather than scaling.

In [None]:
from sklearn.preprocessing import MinMaxScaler

def minMAX(dataset, scaler=None):
    """Rescale every column of ``dataset`` with Min-Max scaling.

    Parameters
    ----------
    dataset : DataFrame or array-like of shape (n_samples, n_features)
        Numeric data to scale.
    scaler : fitted MinMaxScaler, optional
        When given, it is used as-is (no refit), so data can be scaled with
        statistics learned from another dataset (e.g. scale test data with the
        training min/max). When omitted, a new ``MinMaxScaler`` is fitted on
        ``dataset`` itself, which is the original behaviour.

    Returns
    -------
    pandas.DataFrame
        The scaled values. Columns are labelled 0..n_features-1 (unchanged from
        the original implementation, because callers select columns by
        position). The row index of ``dataset`` is kept when it has one, so the
        result stays aligned with its source; otherwise a default index is used.
    """
    if scaler is None:
        scaler = MinMaxScaler().fit(dataset)

    scaled_values = scaler.transform(dataset)
    return pd.DataFrame(scaled_values, index=getattr(dataset, "index", None))

# Scale every column of data_train (the scaler is fitted on data_train itself)
# and preview the first rows of the scaled frame.
mM_data_train = minMAX(data_train)
mM_data_train.head(5)

### Split train and test from "train.csv"

Why do I need to split off an evaluation set from "train.csv" in this project? <br>
I have to check whether my model works well, but the Diagnosis labels are not given in "test.csv".<br>
Also, submissions are limited to a maximum of 5 per participant per day.

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the scaled training data for evaluation.
# random_state makes the split (and every score computed from it) reproducible
# across "Restart & Run All"; stratify keeps the class ratio of the target
# column (position 32, the column later used as y) the same in both parts.
# NOTE: the name `eval` shadows Python's builtin; it is kept because later
# cells refer to it.
train, eval = train_test_split(
    mM_data_train,
    test_size=0.2,
    random_state=42,
    stratify=mM_data_train.iloc[:, 32],
)

In [None]:
# In the min-MAX scaled frames, column positions 0..31 are used as features and
# position 32 is used as the target (Diagnosis).
N_FEATURE_COLUMNS = 32

# Features "X_train" and target "y_train" from the training part
X_train, y_train = train.iloc[:, :N_FEATURE_COLUMNS], train.iloc[:, N_FEATURE_COLUMNS]
print(X_train.shape, y_train.shape)

# Features "X_eval" and target "y_eval" from the held-out part
X_eval, y_eval = eval.iloc[:, :N_FEATURE_COLUMNS], eval.iloc[:, N_FEATURE_COLUMNS]
print(X_eval.shape, y_eval.shape)

### Import Models

In [None]:
# import model
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

In [None]:
# Baseline estimators compared in the sections below.
# NOTE(review): LinearRegression is a regressor; its continuous output is
# thresholded at 0.5 further down. It is left unchanged so the reported score
# stays comparable.
model_LR = LinearRegression()
# random_state fixes the tree's internal randomness so the reported score is
# reproducible from run to run.
model_DT = DecisionTreeClassifier(criterion="entropy", random_state=42)
model_XGB = XGBClassifier()

### Evaluate my model with "train" and "eval" variables
X_train, y_train <br>
X_eval, y_eval

#### 1) Linear Regression (Score: 0.74305)


In [None]:
# Fit the linear model on the training part and predict the held-out part.
model_LR.fit(X_train, y_train)
y_eval_pred_LR = model_LR.predict(X_eval)

# Convert Predictions to Binary (Threshold = 0.5)
y_eval_pred_LR = (y_eval_pred_LR >= 0.5).astype(int)  # Convert to 0 or 1

from sklearn.metrics import classification_report
# classification_report expects (y_true, y_pred); passing them the other way
# round swaps precision and recall in the printed table.
print(classification_report(y_eval, y_eval_pred_LR))

#### 2) Decision Tree (Score: 0.87868)

In [None]:
# Fit the decision tree on the training part and predict the held-out part.
model_DT.fit(X_train, y_train)
y_eval_pred_DT = model_DT.predict(X_eval).astype(int)

# classification_report expects (y_true, y_pred); passing them the other way
# round swaps precision and recall in the printed table.
print(classification_report(y_eval, y_eval_pred_DT))

#### 3) XGBoost-Boosting Ensemble (Score: 0.92666)


In [None]:
# Fit XGBoost with its default settings and predict the held-out part.
model_XGB.fit(X_train, y_train)
y_eval_pred_XGB = model_XGB.predict(X_eval).astype(int)

# classification_report expects (y_true, y_pred); passing them the other way
# round swaps precision and recall in the printed table.
print(classification_report(y_eval, y_eval_pred_XGB))

#### 4) Tune the best model(XGBoost here) to improve performance (GridSearchCV Score: 0.93288)

##### (1) with balanced "sample_weight" to assign higher weights to the minority class, helping XGBoost learn better
However, it produces almost the same F1 score as the base XGBoost

In [None]:
from sklearn.utils.class_weight import compute_sample_weight

# Per-sample weights computed with the "balanced" strategy from the training labels.
sample_weights = compute_sample_weight("balanced", y_train)
# Refits the shared model_XGB object using those weights.
model_XGB.fit(X_train, y_train, sample_weight=sample_weights)

y_eval_pred_XGB_sm = model_XGB.predict(X_eval).astype(int)
# classification_report expects (y_true, y_pred); passing them the other way
# round swaps precision and recall in the printed table.
print(classification_report(y_eval, y_eval_pred_XGB_sm))

##### (2) with "SMOTE" to create synthetic samples of the minority class, balancing the dataset
However, "SMOTE" does not outperform the base XGBoost in F1 score

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the training part only; the held-out part is left untouched.
# random_state makes the synthetic samples (and therefore the score) reproducible.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Refits the shared model_XGB object on the resampled training data.
model_XGB.fit(X_train_resampled, y_train_resampled)
y_eval_pred_XGB_SMOTE = model_XGB.predict(X_eval).astype(int)
# classification_report expects (y_true, y_pred); passing them the other way
# round swaps precision and recall in the printed table.
print(classification_report(y_eval, y_eval_pred_XGB_SMOTE))

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter values explored by the grid search.
param_grid = dict(
    max_depth=[3, 5, 7],               # Control overfitting
    learning_rate=[0.01, 0.1, 0.2],    # Reduce learning rate for stability
    n_estimators=[100, 300, 500],      # Increase trees for better learning
    gamma=[0, 0.1, 0.3],               # Reduce unnecessary splits
    subsample=[0.7, 1],                # Prevent overfitting
    colsample_bytree=[0.7, 1],         # Feature selection
)

# A fresh classifier replaces the model_XGB object fitted in the cells above.
model_XGB = XGBClassifier(objective="binary:logistic", eval_metric="logloss")

# Exhaustive search over param_grid, scored by F1 with 3-fold cross-validation.
grid_search = GridSearchCV(
    estimator=model_XGB,
    param_grid=param_grid,
    scoring="f1",
    cv=3,
    verbose=1,
)
grid_search.fit(X_train, y_train)
print("Best Parameters:", grid_search.best_params_)

In [None]:
# best_estimator_ has already been refit on the whole of (X_train, y_train) by
# GridSearchCV (refit=True is its default), so fitting it a second time here
# would only repeat that work.
model_XGB_grid = grid_search.best_estimator_
y_eval_pred_XGB_grid = model_XGB_grid.predict(X_eval).astype(int)

# classification_report expects (y_true, y_pred); passing them the other way
# round swaps precision and recall in the printed table.
print(classification_report(y_eval, y_eval_pred_XGB_grid))

### Apply the best model to "test.csv"

#### 1) Pre-process "test.csv" to Dataframe

In [None]:
# Load the competition test set, then report its column count and the number
# of null values in each column.
test_csv_path = "/kaggle/input/alzheimers-disease-risk-prediction-eu-business/test.csv"
df_test = pd.read_csv(test_csv_path)

n_test_columns = df_test.shape[1]
test_null_counts = df_test.isnull().sum()
print(f"The number of columns/features: {n_test_columns}")
print(f"The number of null:\n{test_null_counts}")

In [None]:
# Omit "PatientID" and "DoctorInCharge" from columns.
# Positions 1..32 of the raw test frame are kept.
test_feature_positions = slice(1, 33)
data_test = df_test.iloc[:, test_feature_positions]
data_test.head(5)

In [None]:
# Scale the test features with the min/max learned from the TRAINING features.
# Fitting a fresh scaler on test.csv would map the test set onto its own range,
# so the same raw value would be encoded differently than it was when the model
# was trained.
train_feature_frame = data_train.iloc[:, :data_test.shape[1]]
assert list(train_feature_frame.columns) == list(data_test.columns), (
    "test.csv feature columns do not line up with the training feature columns"
)
test_scaler = MinMaxScaler().fit(train_feature_frame)

# Integer column labels (0..n-1) match the scaled training frame the models
# were fitted on.
mM_data_test = pd.DataFrame(test_scaler.transform(data_test), index=data_test.index)
mM_data_test.head(5)

In [None]:
# The scaled test frame is used as the feature matrix as-is (test.csv has no label column to split off).
X_test = mM_data_test

In [None]:
# Predict the test set with the grid-searched XGBoost model and cast the predictions to int.
y_pred = model_XGB_grid.predict(X_test).astype(int)

In [None]:
# Pair each PatientID from the raw test frame with its predicted Diagnosis.
submission_columns = {"PatientID": df_test["PatientID"], "Diagnosis": y_pred}
submission = pd.DataFrame(submission_columns)

# Write the submission file without the row index.
submission_path = "/kaggle/working/submission.csv"
submission.to_csv(submission_path, index=False)
print("✅ Submission file saved as submission.csv")

In [None]:
# Read the written submission file back and show its last 10 rows as a sanity check.
df_sub = pd.read_csv("/kaggle/working/submission.csv")
df_sub.tail(10)