In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import mean_squared_error

# Classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier

# Regressors
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import xgboost as xgb

In [3]:
def load_data(train_path, test_path, sample_sub_path):
    train = pd.read_csv(train_path)
    test  = pd.read_csv(test_path)
    sample_sub = pd.read_csv(sample_sub_path)
    return train, test, sample_sub

# Replace these paths with correct locations
TRAIN_PATH = "/content/drive/MyDrive/Other Stuffs/Summer Analytics 2025 IIT-G/Second Hackathon/Train_Data.csv"
TEST_PATH  = "/content/drive/MyDrive/Other Stuffs/Summer Analytics 2025 IIT-G/Second Hackathon/Test_Data.csv"
SUB_PATH   = "/content/drive/MyDrive/Other Stuffs/Summer Analytics 2025 IIT-G/Second Hackathon/Sample_Submission.csv"

train, test, sample_sub = load_data(TRAIN_PATH, TEST_PATH, SUB_PATH)

In [4]:
print("Train shape:", train.shape)
print("Test shape:", test.shape)
print("Columns:", train.columns.tolist())
display(train.head())
print("\nMissing values:\n", train.isnull().sum())

Train shape: (1966, 9)
Test shape: (312, 8)
Columns: ['SEQN', 'RIAGENDR', 'PAQ605', 'BMXBMI', 'LBXGLU', 'DIQ010', 'LBXGLT', 'LBXIN', 'age_group']


Unnamed: 0,SEQN,RIAGENDR,PAQ605,BMXBMI,LBXGLU,DIQ010,LBXGLT,LBXIN,age_group
0,73564.0,2.0,2.0,35.7,110.0,2.0,150.0,14.91,Adult
1,73568.0,2.0,2.0,20.3,89.0,2.0,80.0,3.85,Adult
2,73576.0,1.0,2.0,23.2,89.0,2.0,68.0,6.14,Adult
3,73577.0,1.0,2.0,28.9,104.0,,84.0,16.15,Adult
4,73580.0,2.0,1.0,35.9,103.0,2.0,81.0,10.92,Adult



Missing values:
 SEQN         12
RIAGENDR     18
PAQ605       13
BMXBMI       18
LBXGLU       13
DIQ010       18
LBXGLT       11
LBXIN         9
age_group    14
dtype: int64


In [12]:
# Target and Features
y = train["age_group"]
X = train.drop(columns=["age_group"])

# Manually define column types
categorical_cols = ["RIAGENDR", "DIQ010", "PAQ605"]
numeric_cols = ["BMXBMI", "LBXGLU", "LBXGLT", "LBXIN"]

In [None]:
def build_preprocessor(df, target_col="age_group", id_col="SEQN"):
    num_cols = df.select_dtypes(include=["int64", "float64"]).columns.tolist()
    cat_cols = df.select_dtypes(include=["object", "category"]).columns.tolist()

    # Remove ID and target column if present
    for col in [target_col, id_col]:
        if col in num_cols:
            num_cols.remove(col)
        if col in cat_cols:
            cat_cols.remove(col)

    # Pipelines
    num_pipe = Pipeline([
        ("impute", SimpleImputer(strategy="median")),
        ("scale",  StandardScaler())
    ])
    cat_pipe = Pipeline([
        ("encode", OneHotEncoder(handle_unknown="ignore"))
    ])

    preproc = ColumnTransformer([
        ("num", num_pipe, num_cols),
        ("cat", cat_pipe, cat_cols)
    ])
    return preproc, num_cols, cat_cols

preproc, num_cols, cat_cols = build_preprocessor(train)
print("Numeric columns:", num_cols)
print("Categorical columns:", cat_cols)

In [13]:
# Preprocessing
num_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

cat_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer([
    ("num", num_pipe, numeric_cols),
    ("cat", cat_pipe, categorical_cols)
])

# Final Model Pipeline
model_pipeline = Pipeline([
    ("preprocessing", preprocessor),
    ("classifier", LogisticRegression(max_iter=1000))
])

# Train/test split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Train
model_pipeline.fit(X_train, y_train)

# Predict
y_pred = model_pipeline.predict(X_val)
acc = accuracy_score(y_val, y_pred)

# Evaluate
print(f"Validation Accuracy: {acc:.4f}")
print(classification_report(y_val, y_pred))

ValueError: Input contains NaN