<h1 style="color: #000000;text-align:center;font-size:200%">Table of contents(granular)</h1>

1. [Load Data](#Load)
1. [Exploratory Data Analysis - Univariate](#EDA)
1. [Exploratory Data Analysis - Multivariate](#EDA2)
1. [Feature Engineering](#FE)
1. [Model Building](#ModelB)
    1. [LGBM Model Training & Tuning](#LGBM)
    1. [Best HyperP - Model Training](#Tuning)
1. [Plotting Shap Values](#Shap)
1. [Optimizing Thresholds](#Threshold)
1. [Making Prediction](#Pred)





**PS: I am expecting a leaderboard shake-up.**

<a id='Load'></a>

# Load the Data 

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency
from sklearn.model_selection import train_test_split

import lightgbm as lgb
from catboost import CatBoostClassifier, Pool
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import roc_auc_score, precision_score, recall_score, roc_curve, accuracy_score, f1_score, auc,classification_report
from scipy.stats import ks_2samp

from sklearn.preprocessing import label_binarize,OneHotEncoder, StandardScaler, FunctionTransformer, LabelEncoder
from itertools import cycle

from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import RandomizedSearchCV
import shap

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, PCA

import warnings
warnings.filterwarnings("ignore")

import numpy as np 
import pandas as pd

In [None]:
# Competition train/test splits, read from the Kaggle input mount.
train, test = map(
    pd.read_csv,
    (
        "/kaggle/input/playground-series-s4e2/train.csv",
        "/kaggle/input/playground-series-s4e2/test.csv",
    ),
)

In [None]:
# Extra labelled rows that are appended to the competition training set.
original_data = pd.read_csv('/kaggle/input/obesity-or-cvd-risk-classifyregressorcluster/ObesityDataSet.csv')

# Stack both sources, drop the identifier column and remove exact duplicate rows.
# ignore_index / reset_index keep the row labels unique: a plain concat would
# carry over both frames' own labels, so the same label could name two rows.
train = (
    pd.concat([train, original_data], ignore_index=True)
    .drop(columns=['id'])
    .drop_duplicates()
    .reset_index(drop=True)
)

**Creating Column List**

In [None]:
target = 'NObeyesdad'

# Split the feature columns (every column except the target) by dtype:
# object-typed columns go to cat_col, everything else to num_col.
is_categorical = {
    name: train[name].dtype == 'object'
    for name in train.columns.drop([target])
}
num_col = [name for name, flag in is_categorical.items() if not flag]
cat_col = [name for name, flag in is_categorical.items() if flag]

In [None]:
# Print each group as a heading followed by one column name per line; the
# numerical group is followed by an extra newline entry as a visual separator.
for heading, names, trailer in (
    ("Numerical Columns : ", num_col, ["\n"]),
    ("Categorical Columns : ", cat_col, []),
):
    print(heading, *names, *trailer, sep="\n")

<a id='EDA'></a>

# EDA

In [None]:
# Toggle: when True, the summary-statistics cell prints describe()/info() for both splits.
basic_EDA = True

In [None]:
if basic_EDA : 
    # Same summary for both splits: descriptive statistics, then the schema.
    for split_name, frame in (("Train", train), ("Test", test)):
        print("\n","="*25,split_name,"="*25)
        display(frame.describe().T)
        # DataFrame.info() prints its report and returns None, so it is called
        # directly; wrapping it in display() only added a stray "None" output.
        frame.info()

<a id='EDA2'></a>

# EDA - Multivariate

In [None]:
def corr_heat_map(df,scale=1) :
    """Draw a lower-triangle correlation heatmap of ``df`` on a new figure.

    Parameters
    ----------
    df : DataFrame
        Frame whose pairwise column correlations (``df.corr()``) are plotted.
    scale : int, default 1
        Divisor applied with floor division to the base 10 x 8 figure size.

    The function opens its own figure and does not call ``plt.show()``.
    """
    corr = df.corr()

    # Boolean mask covering the diagonal and upper triangle, so each pair of
    # columns is drawn only once.
    upper_triangle = np.triu(np.ones_like(corr, dtype=bool))

    fig_width, fig_height = 10 // scale, 8 // scale
    plt.figure(figsize=(fig_width, fig_height))

    heatmap_options = dict(
        mask=upper_triangle,
        cmap=sns.diverging_palette(220, 20, as_cmap=True),
        vmax=.3,   # upper anchor of the colour scale
        center=0,  # colour map is centred on zero correlation
        square=True,
        linewidths=.5,
        cbar_kws={"shrink": 0.7},
    )
    sns.heatmap(corr, **heatmap_options)

    plt.title('Correlation Heatmap')

In [None]:
# corr_heat_map() opens its own figure on every call, so no figure is created
# here: a plt.figure() call in this cell is never drawn on and only shows up
# as an extra empty canvas.
corr_heat_map(train[num_col],2)
corr_heat_map(test[num_col],2)

plt.show()

In [None]:
# Pairwise scatter/density grid of the numerical features, coloured by class.
# sns.pairplot builds and sizes its own figure grid, so no plt.figure() is
# opened beforehand (it would only produce an extra blank figure).
pairplot_columns = sorted([*num_col, target])
sns.pairplot(train[pairplot_columns],hue=target)
plt.show()

In [None]:
plt.figure(figsize=(50,4))

# One violin plot per numerical feature, split by target class, laid out in a
# single row. The number of panels follows len(num_col) instead of a fixed 8,
# so the grid stays valid if the numerical column list changes.
n_panels = len(num_col)
for panel, feature in enumerate(num_col, start=1):

    plt.subplot(1, n_panels, panel)
    sns.violinplot(
        data=train,
        x=target,
        y=feature
    )
    plt.xticks(rotation=45)

plt.show()

In [None]:
temp = train.copy()

# Weight binned by floor division by 5. It does not depend on the loop
# variable, so it is computed once instead of on every iteration.
temp['Weight2'] = temp['Weight']//5

plt.figure(figsize=(30,12))
for plot_num, feature in enumerate(['Age', 'Height', 'FCVC', 'NCP', 'FAF', 'TUE'], start=1):

    # 2-component PCA of (binned weight, CH2O, <feature>). The inputs are fed
    # to PCA as-is; no scaler is applied in this cell.
    pca = PCA(n_components=2,random_state=42)
    pca_inputs = ['Weight2','CH2O',feature]
    pc1 = 'pca_1_wt_ch20_'+feature+"_1"
    pc2 = 'pca_1_wt_ch20_'+feature+"_2"
    temp[[pc1, pc2]] = pca.fit_transform(temp[pca_inputs])

    plt.subplot(2,3,plot_num)
    plt.title(pca_inputs)
    # Points coloured by class, plus a regression line over the two components.
    sns.scatterplot(x=temp[pc1],
                    y=temp[pc2],
                    hue=temp[target],
                    palette='pastel',
                    legend='full'
    )
    sns.regplot(x=temp[pc1],
                y=temp[pc2],
                scatter=False
    )

plt.show()

<a id='FE'></a>
# Feature Engineering

<a id='OneHot'></a>
### One-Hot Encoding

In [None]:
# One-hot encode the categorical features of both splits.
train = pd.get_dummies(train,
                       columns=cat_col)
test = pd.get_dummies(test, 
                      columns=cat_col)

# The two frames are encoded independently, so a category level present in only
# one of them would leave the splits with different dummy columns. Re-index the
# test frame onto the training feature columns (zero-filled for levels unseen
# in test, dropping levels unseen in train) so the model receives the same
# features in the same order at prediction time. 'id' is kept because the
# submission cells read it from `test`.
feature_columns = train.columns.drop(target)
test = test.reindex(columns=['id', *feature_columns], fill_value=0)

train.head()

<a id='Label Encoding'></a>
### Label Encoding 

In [None]:
target = 'NObeyesdad'

# Replace the class names with integer codes. `le` stays in scope so later
# cells can map predicted codes back to class names with inverse_transform.
le = LabelEncoder()
le.fit(train[target])
train[target] = le.transform(train[target])

### Train-Test-Split

In [None]:
# Hold out 20% of the rows for validation; the seed fixes the shuffle.
features = train.drop(columns=[target])
labels = train[target]
X_train, X_val, y_train, y_val = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Sanity check: shapes of the training and validation features/labels.
X_train.shape , y_train.shape, X_val.shape, y_val.shape 

In [None]:
# Model input for the test split: every column except the row identifier.
# drop() already returns a new frame, so `test` itself keeps its 'id' column.
X_test = test.drop(columns=['id'])

<a id='ModelB'></a>
# Model building 

<a id='LGBM'></a>
## LGBM Model

In [None]:
import optuna
# Flag read by later cells: when False they skip the study report and use the
# hard-coded LightGBM parameters instead of study.best_trial.params.
ran_optuna = False 

In [None]:
def optimization_function(trial) : 
    """Optuna objective: hold-out accuracy of one LightGBM configuration.

    Samples a hyper-parameter set from ``trial``, fits an ``LGBMClassifier`` on
    the notebook-level ``X_train`` / ``y_train`` and returns its accuracy on
    ``X_val`` / ``y_val`` (higher is better).

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to draw the hyper-parameter values.

    Returns
    -------
    float
        Accuracy of the fitted model on the validation split.
    """
    lgbParams = {
        # Fixed settings shared by every trial.
        'num_class': 7,
        'random_state': 42,
        'metric': 'multi_logloss',
        "boosting_type": "gbdt",
        'objective': 'multiclass',

        # Search space. The regularisation terms are sampled on a log scale via
        # suggest_float(..., log=True), which replaces the deprecated
        # suggest_loguniform and matches the suggest_float calls used for the
        # other continuous parameters.
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.05),
        'n_estimators': trial.suggest_int('n_estimators', 400, 600),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-1, 10.0, log=True),
        'max_depth': trial.suggest_int('max_depth', 6, 20),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 0.9),
        'subsample': trial.suggest_float('subsample', 0.8, 1.0),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 50),
    }

    lgb_model=lgb.LGBMClassifier(**lgbParams)

    # Alternative scoring kept for reference: 5-fold cross-validated accuracy.
#     skf = StratifiedKFold(n_splits=5,shuffle=False, random_state=None)
#     accuracy = cross_val_score(lgb_model,X_train,y_train, cv=skf,scoring='accuracy')
#     print("="*50,'\nValidation Accuracy:', accuracy.mean())

    lgb_model.fit(X_train,y_train)

    # The single hold-out split is the trial's score.
    acc = accuracy_score(y_val,lgb_model.predict(X_val))

    return acc

In [None]:
# %%time 

# # Importing Optuna for hyperparameter optimization
# from optuna.samplers import TPESampler
# import optuna

# # Set up the sampler for Optuna optimization
# sampler = optuna.samplers.TPESampler(seed=42)  # Using Tree-structured Parzen Estimator sampler for optimization

# # Create a study object for Optuna optimization
# study = optuna.create_study(direction="maximize", sampler=sampler)

# # Run the optimization process
# study.optimize(optimization_function, n_trials=200)

# # Get the best parameters after optimization
# best_params = study.best_params

# print('='*50)
# print(best_params)

# ran_optuna = True

In [None]:
if ran_optuna : 

    print('Number of finished trials:', len(study.trials))

    print('Best trial:', study.best_trial.params)

    # Jupyter only auto-renders the last top-level expression of a cell, so
    # bare expressions inside this `if` block produce no output. The figures
    # and the trials table are therefore shown explicitly.
    optuna.visualization.plot_param_importances(study).show()

    display(study.trials_dataframe().sort_values('value',ascending=False))

    optuna.visualization.plot_slice(study).show()

In [None]:
# 100 trials 
# {'objective': 'multiclassova', 'learning_rate': 0.04641200998070569, 'n_estimators': 587, 'reg_alpha': 0.0065043557057678746, 'reg_lambda': 4.460933310544669, 'max_depth': 7, 'colsample_bytree': 0.6833315654013498, 'subsample': 0.8193986843950917, 'min_child_samples': 15}

<a id='Tuning'></a>
# Tuning & Retraining

In [None]:
# Alternative parameter set kept for reference
# (100 trials with PCA seed = None):
#     {
#         'objective': 'multiclassova',
#         'learning_rate': 0.04641200998070569,
#         'n_estimators': 587,
#         'reg_alpha': 0.0065043557057678746,
#         'reg_lambda': 4.460933310544669,
#         'max_depth': 7, 'colsample_bytree': 0.6833315654013498,
#         'subsample': 0.8193986843950917,
#         'min_child_samples': 15
#     }

# Moaz HyperParams: hard-coded set used when no Optuna study was run.
manual_params = dict(
    objective="multiclass",                # objective function for the model
    metric="multi_logloss",                # evaluation metric
    verbosity=-1,                          # -1 silences LightGBM output
    boosting_type="gbdt",                  # gradient boosting type
    random_state=42,                       # seed for reproducibility
    num_class=7,                           # number of target classes
    learning_rate=0.030962211546832760,    # boosting learning rate
    n_estimators=500,                      # number of boosting iterations
    lambda_l1=0.009667446568254372,        # L1 regularisation term
    lambda_l2=0.04018641437301800,         # L2 regularisation term
    max_depth=10,                          # maximum tree depth
    colsample_bytree=0.40977129346872643,  # fraction of features per tree
    subsample=0.9535797422450176,          # fraction of rows per iteration
    min_child_samples=26,                  # minimum number of rows in a leaf
)

# Prefer the tuned parameters when a study exists in this session.
lgbParams = study.best_trial.params if ran_optuna else manual_params

In [None]:
# Settings written into lgbParams regardless of where the other values came from.
fixed_params = dict(
    boosting_type='gbdt',
    num_class=7,
    random_state=42,
    metric='multi_logloss',
)

In [None]:
# Overwrite (or add) the fixed settings in the chosen parameter dict, in place.
lgbParams.update(fixed_params)

In [None]:
# Show the final parameter dict that is passed to LGBMClassifier.
lgbParams

In [None]:
# Build the final (unfitted) classifier from the merged parameter dict.
lgb_model_final = lgb.LGBMClassifier(**lgbParams)

In [None]:
# Fit on the training split. `verbose` is deliberately not passed: the
# `verbose` keyword of LGBMClassifier.fit() was deprecated in LightGBM 3.3 and
# removed in 4.0, where passing it raises a TypeError. Console output is
# governed by the 'verbosity' model parameter instead.
lgb_model_final = lgb_model_final.fit(X_train, y_train)

In [None]:
# Predicted (encoded) class for every validation row.
y_pred = lgb_model_final.predict(X_val)

In [None]:
# Validation accuracy of the final model.
accuracy_score(y_val, y_pred) 

In [None]:
# Per-class probabilities for the validation rows; the threshold-optimisation
# cells read this variable.
y_pred_proba = lgb_model_final.predict_proba(X_val)

In [None]:
# Confusion table of actual vs. predicted class names on the validation split.
# It is built from the two label vectors alone, so X_val is left untouched.
# Overwriting X_val with extra 'index' / 'actual' / 'guess' / 'dummy' columns
# would make any later predict(X_val) call see non-feature columns.
val_outcomes = pd.DataFrame({
    'actual': le.inverse_transform(y_val),
    'guess': le.inverse_transform(y_pred),
})
# Rows = actual class, columns = predicted class, cells = row counts
# (0 where a combination never occurs).
pd.crosstab(val_outcomes['actual'], val_outcomes['guess'])

In [None]:
def plot_lgbm_feature_importance(model, feature_names=None, top_n=10, plot=True):
    """Tabulate (and optionally plot) a fitted model's feature importances.

    Parameters
    ----------
    model : fitted estimator exposing ``feature_importances_``.
    feature_names : sequence of str, optional
        Labels for the importances. When omitted they are read from the model:
        ``feature_name_`` (the attribute exposed by the scikit-learn style
        LightGBM estimators) is tried first, then a ``feature_name()`` method.
    top_n : int, default 10
        Accepted for backward compatibility; currently not applied, so every
        feature is returned and plotted.
    plot : bool, default True
        When True, draw a horizontal bar chart of the importances.

    Returns
    -------
    pandas.DataFrame
        Columns ``Feature`` and ``Importance``, sorted by importance descending.
    """
    feature_importance = model.feature_importances_

    if feature_names is None:
        # The sklearn wrapper has no feature_name() method, only the
        # feature_name_ attribute; keep the method call as a fallback for
        # objects that do provide it.
        feature_names = getattr(model, 'feature_name_', None)
        if feature_names is None:
            feature_names = model.feature_name()

    feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})

    # Most important features first.
    feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

    if plot:
        plt.figure(figsize=(10, 10))
        sns.barplot(x='Importance', y='Feature', data=feature_importance_df, palette='viridis')
        plt.title('LightGBM Feature Importance')
        plt.show()

    return feature_importance_df

# Importance table for the final model, labelled with the training column names.
feature_importance_df = plot_lgbm_feature_importance(
    model=lgb_model_final,
    feature_names=X_train.columns,
)

<a id='Shap'></a>
# Plot Shap

In [None]:
import shap
def plot_shap_summary(model, X, plot_type='bar', plot=True):
    """Return the SHAP values of a tree model for the rows of ``X``.

    Despite its name, this function draws nothing: ``plot_type`` and ``plot``
    are accepted but unused. It builds a ``shap.TreeExplainer`` for ``model``
    and returns whatever ``explainer.shap_values(X)`` produces.
    """
    return shap.TreeExplainer(model).shap_values(X)

In [None]:
# SHAP values of the final model on the training split.
shap_values = plot_shap_summary(lgb_model_final, X_train)

In [None]:
# One SHAP summary plot per entry of shap_values, headed by the class name.
# NOTE(review): treating each entry as one class assumes the explainer returned
# a per-class sequence of arrays — confirm for the installed SHAP version.
for class_index, class_shap in enumerate(shap_values):
    print("="*50)
    print(le.inverse_transform([class_index]))
    shap.summary_plot(class_shap, X_train)

<a id='Threshold'></a>
# Optimization of Thresholds

In [None]:
# threshold_optimization: apply per-class thresholds and write the second submission.
# run_optuna_threshold: search the thresholds with Optuna; when False, the
# hard-coded best_thresholds dict is used instead.
threshold_optimization = True
run_optuna_threshold = True

In [None]:
import optuna

def objective(trial):
    """Optuna objective: validation accuracy after per-class thresholding.

    Draws one threshold in [0, 1] per class, converts the notebook-level
    ``y_pred_proba`` into labels with ``apply_thresholds`` and scores them
    against ``y_val``. ``num_classes`` is also read from notebook scope.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to draw the thresholds.

    Returns
    -------
    float
        Accuracy of the thresholded predictions (to be maximised).
    """
    # suggest_float replaces the deprecated suggest_uniform; the sampled
    # distribution (uniform on [0, 1]) is unchanged.
    thresholds = {
        f'threshold_{i}': trial.suggest_float(f'threshold_{i}', 0.0, 1.0)
        for i in range(num_classes)
    }

    y_pred = apply_thresholds(y_pred_proba, thresholds)

    return accuracy_score(y_val, y_pred)

def apply_thresholds(y_proba, thresholds):
    """Turn class probabilities into labels using per-class thresholds.

    Every row starts with its arg-max class. The classes are then visited in
    index order, and any row whose probability for class ``i`` exceeds
    ``thresholds['threshold_<i>']`` is relabelled ``i``. When a row exceeds
    several thresholds, the highest class index wins, whatever the
    probabilities are.

    Parameters
    ----------
    y_proba : array of shape (n_rows, n_classes)
    thresholds : dict mapping ``'threshold_<i>'`` to a float, one per class.

    Returns
    -------
    1-D integer array of predicted class indices.
    """
    labels = np.argmax(y_proba, axis=1)
    for class_index, class_proba in enumerate(y_proba.T):
        above_cutoff = class_proba > thresholds[f'threshold_{class_index}']
        labels[above_cutoff] = class_index
    return labels

In [None]:
if run_optuna_threshold : 
    # One threshold per probability column (read by objective()).
    num_classes = y_pred_proba.shape[1]

    # Seeded sampler so the search, and therefore best_thresholds, is
    # repeatable across kernel restarts.
    # NOTE: objective() scores on y_val and the result is later evaluated on
    # the same y_val, so the reported accuracy is an optimistic estimate.
    study = optuna.create_study(
        direction='maximize',
        sampler=optuna.samplers.TPESampler(seed=42),
    )
    study.optimize(objective, n_trials=500)

    # Get the best thresholds
    best_thresholds = study.best_params
    print("Best Thresholds:", best_thresholds)
    print("Best Value:", study.best_value)
    threshold_optimization = True


In [None]:
# Hard-coded thresholds, used when thresholding is enabled but the Optuna
# search is skipped.
if threshold_optimization and not run_optuna_threshold:
    best_thresholds = {
        'threshold_0': 0.724201213234911,
        'threshold_1': 0.6161299800571379,
        'threshold_2': 0.29138887902587174,
        'threshold_3': 0.3145837593497076,
        'threshold_4': 0.8469398340837189,
        'threshold_5': 0.6800824438387787,
        'threshold_6': 0.35886959729223455,
    }

In [None]:
# Apply the best thresholds to get final predictions
if threshold_optimization :
    best_y_pred = apply_thresholds(y_pred_proba, best_thresholds)
    accuracy = accuracy_score(y_val, best_y_pred)
    print("Best Accuracy:", accuracy)

<a id='Pred'></a>
# Make Predictions & Submission Files

In [None]:
# Encoded class predictions for the test rows (the model's default decision rule).
y_test_label = lgb_model_final.predict(X_test)

display(len(y_test_label))

# Build the submission as its own frame. Assigning into a column slice of
# `test` triggers pandas' chained-assignment warning, and writing the
# predictions into `test` would leave a target column in the frame that
# X_test is derived from.
out = pd.DataFrame({
    'id': test['id'].to_numpy(),
    'NObeyesdad': le.inverse_transform(y_test_label),  # back to class names
})
display(out.head(5))

out.to_csv("submission_non_threshold_opt.csv",index=False)

In [None]:
if threshold_optimization : 

    # Probabilities and labels get separate names; reusing one variable for
    # both made it unclear which of the two it held at any point.
    y_test_proba = lgb_model_final.predict_proba(X_test)

    y_test_label = apply_thresholds(y_test_proba, best_thresholds)

    display(len(y_test_label))

    # Build the submission as its own frame rather than assigning into a
    # column slice of `test`; this avoids the chained-assignment warning and
    # leaves `test` unmodified.
    out = pd.DataFrame({
        'id': test['id'].to_numpy(),
        'NObeyesdad': le.inverse_transform(y_test_label),  # back to class names
    })
    display(out.head(5))

    out.to_csv("submission_threshold_opt.csv",index=False)

# Misclassification Investigation

In [None]:
# Names of the five most important features (the table is sorted descending).
feature_importance_df.head(5).Feature

In [None]:
# Working copy of the encoded training frame for the investigation cells that
# follow, so the added helper columns do not end up in `train`.
temp = train.copy()

In [None]:
# Number of rows under investigation.
temp.shape[0]

In [None]:
# SHAP values of the final model for every row of the working copy
# (all columns except the target).
investigation_features = temp.drop(columns=target)
shap_values = plot_shap_summary(lgb_model_final, investigation_features)

In [None]:
# Flag rows whose largest SHAP-vector norm exceeds a fixed cut-off.
# NOTE(review): axis=2 / axis=0 assume shap_values stacks to
# (class, row, feature) — confirm for the installed SHAP version.
threshold = 6.5
shap_magnitude = np.linalg.norm(np.asarray(shap_values), axis=2)
outlier_mask = shap_magnitude.max(axis=0) > threshold

In [None]:
# Count how many rows are flagged (True) and not flagged (False).
value_counts = dict(zip(*np.unique(outlier_mask, return_counts=True)))

print(value_counts)

In [None]:
# Predictions of the final model on the working copy itself (not the test
# split, despite the variable name), stored next to the true labels.
investigation_inputs = temp.drop(columns=target)
y_test_label = lgb_model_final.predict(investigation_inputs)
temp['NObeyesdad_pred'] = y_test_label

In [None]:
# Attach the SHAP-based outlier flag to each row (assigned positionally from the array).
temp['outlier_mask'] = outlier_mask

In [None]:
# True where the predicted class differs from the actual class.
temp['miss_class'] = temp['NObeyesdad_pred'].ne(temp['NObeyesdad'])

In [None]:
# Misclassified rows only.
temp2 = temp.loc[temp['miss_class']]

In [None]:
# Inspect the outlier flag column.
temp['outlier_mask']

In [None]:
# Rows flagged as SHAP outliers only.
temp3 = temp.loc[temp['outlier_mask']]

In [None]:
# Class-wise density contours of Weight vs. Height, with the misclassified
# rows (temp2) overlaid as red markers.
fig, ax = plt.subplots(figsize=(8, 8))

sns.kdeplot(
    x=temp.Height,
    y=temp.Weight,
    hue=le.inverse_transform(temp[target]),
    palette='Set2',
    ax=ax,
)

# Shared look of the overlay markers.
overlay_style = dict(color='red', edgecolor='red', marker='o', linewidth=1, alpha=0.7, facecolors='none')
sns.scatterplot(x='Height', y='Weight', data=temp2, ax=ax, **overlay_style)

plt.show()

In [None]:
# Weight vs. Height scatter coloured by class, with the misclassified rows
# (temp2) overlaid as red markers.
fig, ax = plt.subplots(figsize=(8, 8))

sns.scatterplot(
    x=temp.Height,
    y=temp.Weight,
    hue=le.inverse_transform(temp[target]),
    palette='Set2',
    ax=ax,
)

# Shared look of the overlay markers.
overlay_style = dict(color='red', edgecolor='red', marker='o', linewidth=1, alpha=0.7, facecolors='none')
sns.scatterplot(x='Height', y='Weight', data=temp2, ax=ax, **overlay_style)

plt.show()

In [None]:
# Weight vs. Height scatter coloured by class, with the SHAP-outlier rows
# (temp3) overlaid as red markers.
fig, ax = plt.subplots(figsize=(8, 8))

sns.scatterplot(
    x=temp.Height,
    y=temp.Weight,
    hue=le.inverse_transform(temp[target]),
    palette='Set2',
    ax=ax,
)

# Shared look of the overlay markers.
overlay_style = dict(color='red', edgecolor='red', marker='o', linewidth=1, alpha=0.7, facecolors='none')
sns.scatterplot(x='Height', y='Weight', data=temp3, ax=ax, **overlay_style)

plt.show()