In [1]:
#Importing all necessary libraries 
#Using sckit learn, Random Forest Classifier and XGBoost
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [2]:
#Reading CSV files both train and test data
df_train = pd.read_csv("Train_Data.csv")
df_test = pd.read_csv("Test_Data.csv")

In [3]:
df_train.head()

Unnamed: 0,SEQN,RIAGENDR,PAQ605,BMXBMI,LBXGLU,DIQ010,LBXGLT,LBXIN,age_group
0,73564.0,2.0,2.0,35.7,110.0,2.0,150.0,14.91,Adult
1,73568.0,2.0,2.0,20.3,89.0,2.0,80.0,3.85,Adult
2,73576.0,1.0,2.0,23.2,89.0,2.0,68.0,6.14,Adult
3,73577.0,1.0,2.0,28.9,104.0,,84.0,16.15,Adult
4,73580.0,2.0,1.0,35.9,103.0,2.0,81.0,10.92,Adult


In [4]:
df_test.head()

Unnamed: 0,SEQN,RIAGENDR,PAQ605,BMXBMI,LBXGLU,DIQ010,LBXGLT,LBXIN
0,77017.0,1.0,1.0,32.2,96.0,2.0,135.0,15.11
1,75580.0,2.0,2.0,26.3,100.0,2.0,141.0,15.26
2,73820.0,1.0,2.0,28.6,107.0,2.0,136.0,8.82
3,80489.0,2.0,1.0,22.1,93.0,2.0,111.0,12.13
4,82047.0,1.0,1.0,24.7,91.0,2.0,105.0,3.12


In [5]:
#Dropping SEQUENTIAL ID Column, using Feature engineering as it is not necessary in analyzing and predicting data.
df_train.drop(columns=["SEQN"], inplace=True)
df_test.drop(columns=["SEQN"], inplace=True)

In [6]:
#Using isnull().sum() which tells how many NaN values are there in each columns.
print(df_train.isnull().sum())
print(df_train.shape)

RIAGENDR     18
PAQ605       13
BMXBMI       18
LBXGLU       13
DIQ010       18
LBXGLT       11
LBXIN         9
age_group    14
dtype: int64
(1966, 8)


In [7]:
df_train.info()  #To have basic info

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1966 entries, 0 to 1965
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   RIAGENDR   1948 non-null   float64
 1   PAQ605     1953 non-null   float64
 2   BMXBMI     1948 non-null   float64
 3   LBXGLU     1953 non-null   float64
 4   DIQ010     1948 non-null   float64
 5   LBXGLT     1955 non-null   float64
 6   LBXIN      1957 non-null   float64
 7   age_group  1952 non-null   object 
dtypes: float64(7), object(1)
memory usage: 123.0+ KB


In [8]:
print(df_test.isnull().sum())
print(df_test.shape)

RIAGENDR    2
PAQ605      1
BMXBMI      1
LBXGLU      1
DIQ010      1
LBXGLT      2
LBXIN       1
dtype: int64
(312, 7)


In [9]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 312 entries, 0 to 311
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   RIAGENDR  310 non-null    float64
 1   PAQ605    311 non-null    float64
 2   BMXBMI    311 non-null    float64
 3   LBXGLU    311 non-null    float64
 4   DIQ010    311 non-null    float64
 5   LBXGLT    310 non-null    float64
 6   LBXIN     311 non-null    float64
dtypes: float64(7)
memory usage: 17.2 KB


In [10]:
#Mapping Adult: 0 and Senior: 1 in Training Set for age_group.
df_train["age_group"] = df_train["age_group"].map({"Adult": 0, "Senior": 1})

In [11]:
df_train = df_train.dropna(subset=["age_group"])

In [12]:
#Classifying categorical columns and numerical columns.
categorical_cols = ["RIAGENDR", "PAQ605", "DIQ010"]
numerical_cols = ["BMXBMI", "LBXGLU", "LBXGLT", "LBXIN"]

In [13]:
#Using SimpleImputer() with mode to fill categorical columns.
cat_imputer = SimpleImputer(strategy="most_frequent")
df_train[categorical_cols] = cat_imputer.fit_transform(df_train[categorical_cols])
df_test[categorical_cols] = cat_imputer.transform(df_test[categorical_cols])

In [14]:
#Using SimpleImputer() with median to fill numerical columns.
num_imputer = SimpleImputer(strategy="median")
df_train[numerical_cols] = num_imputer.fit_transform(df_train[numerical_cols])
df_test[numerical_cols] = num_imputer.transform(df_test[numerical_cols])

In [15]:
#Separate features and target
X = df_train.drop(columns=["age_group"])
y = df_train["age_group"]

In [18]:
y

0       0.0
1       0.0
2       0.0
3       0.0
4       0.0
       ... 
1961    0.0
1962    0.0
1963    0.0
1964    0.0
1965    0.0
Name: age_group, Length: 1952, dtype: float64

In [19]:
#Using Standard Scaler to scale the data which will tranform and fit the data.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(df_test)

In [20]:
#Split for validation (to evaluate performance). Even though the test data is already provided, it doesn't have the target labels (age_group) it's meant only for final prediction and submission.
X_train, X_val, y_train, y_val= train_test_split(X_scaled, y, test_size= 0.2, stratify= y, random_state= 42)

In [21]:
#Handle class imbalance using scale_pos_weight
scale = len(y_train[y_train == 0]) / len(y_train[y_train == 1])

In [22]:
#XGBoost is better at handling class imbalance(adults are many as compared to seniors) via scale_pos_weight. It uses boosting (sequential trees), which often captures minority class patterns better than Random Forest (parallel trees)
#Training XGBoost Classifier (best known settings for imbalance)
model = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    scale_pos_weight=scale,
    learning_rate=0.05,
    max_depth=4,
    n_estimators=300,
    subsample=0.9,
    colsample_bytree=0.9,
    random_state=42
)

model.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [24]:
# Predict on validation set and evaluate
val_proba = model.predict_proba(X_val)[:, 1]
threshold = 0.35  # lowered to improve Senior detection
val_preds = (val_proba > threshold).astype(int)

In [25]:
#Printing validation results obtained on validation set.
#ROC AUC Score is a threshold independent measures how well the model separates classes across all possible thresholds. It works well for imbalanced datasets better than just using accuracy.
print("\n Validation Results")
print("ROC AUC Score:", roc_auc_score(y_val, val_proba))
print("Classification Report:\n", classification_report(y_val, val_preds))


 Validation Results
ROC AUC Score: 0.6773132017034457
Classification Report:
               precision    recall  f1-score   support

         0.0       0.89      0.68      0.77       328
         1.0       0.26      0.57      0.35        63

    accuracy                           0.66       391
   macro avg       0.57      0.63      0.56       391
weighted avg       0.79      0.66      0.70       391



In [26]:
#Train on full dataset now (train + val) for best performance i.e. full training set.
#Predict on test data.
model.fit(X_scaled, y)
test_proba = model.predict_proba(X_test_scaled)[:, 1]
test_preds = (test_proba > threshold).astype(int)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [28]:
#Save to submission.csv
submission = pd.DataFrame({"age_group": test_preds})
submission.to_csv("submission.csv", index=False)
print("\n submission.csv generated successfully!")


 submission.csv generated successfully!


In [30]:
submission  #This csv has the same shape as the test data set.

Unnamed: 0,age_group
0,0
1,1
2,1
3,0
4,0
...,...
307,0
308,0
309,0
310,1


In [31]:
submission.shape

(312, 1)