In [None]:
# If xgboost is not installed yet, uncomment the line below and run this cell
# !pip install xgboost

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Data Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OrdinalEncoder, OneHotEncoder
from sklearn.impute import KNNImputer, SimpleImputer

# Machine Learning Models
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC, LinearSVC
from xgboost import XGBClassifier

# Model Evaluation & Tuning
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    classification_report, confusion_matrix
)
from sklearn.inspection import DecisionBoundaryDisplay

In [None]:
# Load the raw training table.
# NOTE(review): absolute Colab-style path; adjust when running elsewhere.
df = pd.read_csv(filepath_or_buffer='/content/train.csv')

In [None]:
# Display the column labels of the loaded frame.
df.columns

In [None]:
# Per-column count of missing values (isna is the same check as isnull).
df.isna().sum()

In [None]:
# Peek at the first five rows of the raw data.
df.head(n=5)

In [None]:
# Print dtypes, non-null counts and memory usage for every column.
df.info()

Split The Data


In [None]:
# Separate the predictors from the target label column 'NObeyesdad'.
X = df.drop(columns=['NObeyesdad'])
y = df['NObeyesdad']

In [None]:
# Hold out 20% of the rows as the final test set.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=41234,
)

In [None]:
# Carve a validation set (25% of the remaining rows) out of the train/val pool.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    test_size=0.25,
    random_state=1234,
)

# The Attributes:
## Of Eating Habits:
* Frequent consumption of high caloric food (FAVC).
* Frequency of consumption of vegetables (FCVC).
* Number of main meals (NCP).
* Consumption of food between meals (CAEC).
* Consumption of water daily (CH2O).
* Consumption of alcohol (CALC).

## Of Physical Condition:
* Calories consumption monitoring (SCC).
* Physical activity frequency (FAF).
* Time using technology devices (TUE).
* Transportation used (MTRANS).



In [None]:
# First five rows of the training features.
X_train.head(n=5)

In [None]:
# Print dtypes and non-null counts for the training features.
X_train.info()

# Descriptive Statistics


In [None]:
# Summary statistics of the numeric training features, one feature per row.
X_train.describe().transpose()

## Demographic Data
* **Age:**

    Mean: 23.81 years.

    Standard Deviation: 5.65.

    Minimum Age: 14 years.

    Maximum Age: 61 years.

    25% of the population is below 20 years old.

    75% is below 26 years old.

* **Height(m):**

    Mean: 1.70 m.

    Standard Deviation: 0.087.

    Minimum Height: 1.45 m.

    Maximum Height: 1.98 m.

    Most individuals are between 1.63 m and 1.76 m.
* **Weight (kg):**

    Mean: 87.81 kg.

    Standard Deviation: 26.36 kg.

    Minimum Weight: 39 kg.

    Maximum Weight: 165.06 kg.

    75th percentile is 111.6 kg, indicating a high proportion of individuals with overweight or obesity.

## Eating Habits
* **Frequency of Vegetable Consumption (FCVC):** (Scale: 1-3)

    Mean: 2.44.

    Standard Deviation: 0.53.
   
    75th percentile = 3.00.

    *Most individuals have a moderate to high vegetable consumption.*

* **Number of Main Meals per Day (NCP)** (Scale: 1-4)

    Mean: 2.76 meals/day.

    Standard Deviation: 0.71.

    50% of the population eats 3 meals daily, with some individuals consuming only one meal per day (min) or up to four meals per day (max).

* **Daily Water Consumption (CH2O):** (Scale: 1-3)

    Mean: 2.03.

    Standard Deviation: 0.61.

    25% of individuals consume less than 1.8 liters of water per day, while the majority consume 2+ liters.


## Physical Activity & Technology Usage
* **Physical Activity Frequency (FAF):** (Scale: 0-3)

    Mean: 0.98.

    Standard Deviation: 0.84.

    Half of the population has low physical activity levels, with some individuals having zero physical activity.

* **Time Spent Using Technology (TUE):** (Scale: 0-2)

    Mean: 0.61.

    Standard Deviation: 0.60.
    
    50% of individuals spend around 0.57 hours daily on devices, while some spend up to 2 hours per day.

# EDA


## Univariate Analysis

In [None]:
# Age: histogram with a KDE overlay, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(data=X_train, x='Age', kde=True, ax=ax)
ax.set(title='Distribution of Age', xlabel='Age', ylabel='Frequency')
plt.show()



In [None]:
# Height: box plot to expose spread and outliers.
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=X_train, y='Height', ax=ax)
ax.set(title='Boxplot of Height', ylabel='Height')
plt.show()


In [None]:
# Gender: number of training rows per category.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=X_train, x='Gender', ax=ax)
ax.set(title='Countplot of Gender', xlabel='Gender', ylabel='Count')
plt.show()

In [None]:
# Weight: box plot to expose spread and outliers.
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=X_train, y='Weight', ax=ax)
ax.set(title='Boxplot of Weight', ylabel='Weight')
plt.show()


In [None]:
# family_history_with_overweight: number of training rows per category.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=X_train, x='family_history_with_overweight', ax=ax)
ax.set(
    title='Countplot of Family History with Overweight',
    xlabel='Family History with Overweight',
    ylabel='Count',
)
plt.show()

In [None]:
# FCVC: histogram with a KDE overlay.
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(data=X_train, x='FCVC', kde=True, ax=ax)
ax.set(
    title='Distribution of Frequency of Consumption of Vegetables',
    xlabel='FCVC',
    ylabel='Frequency',
)
plt.show()

In [None]:
# NCP: histogram with a KDE overlay.
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(data=X_train, x='NCP', kde=True, ax=ax)
ax.set(title='Distribution of Number of main meals', xlabel='NCP', ylabel='Frequency')
plt.show()

In [None]:
# CH2O: histogram with a KDE overlay.
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(data=X_train, x='CH2O', kde=True, ax=ax)
ax.set(title='Distribution of Daily Water Consumption', xlabel='CH2O', ylabel='Frequency')
plt.show()

In [None]:
# TUE: histogram with a KDE overlay.
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(data=X_train, x='TUE', kde=True, ax=ax)
ax.set(title='Distribution of Time using technology devices', xlabel='TUE', ylabel='Frequency')
plt.show()


In [None]:
# FAF: histogram with a KDE overlay.
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(data=X_train, x='FAF', kde=True, ax=ax)
ax.set(title='Distribution of Physical Activity Frequency', xlabel='FAF', ylabel='Frequency')
plt.show()


In [None]:
# Distribution of 'CALC'
# CALC holds string categories (it is ordinal-encoded from 'no' / 'Sometimes' /
# 'Frequently' later in the notebook), so a KDE curve over it has no meaning.
# Show per-category counts instead, as for the other categorical columns.
plt.figure(figsize=(8,6))
sns.countplot(x='CALC', data=X_train)
plt.title('Distribution of Alcohol Consumption')
plt.xlabel('CALC')
plt.ylabel('Frequency')
plt.show()


In [None]:
# One count plot per remaining categorical feature; tick labels are tilted
# so longer category names stay readable.
for col in ['FAVC', 'CAEC', 'SMOKE', 'SCC', 'CALC', 'MTRANS']:
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.countplot(data=X_train, x=col, ax=ax)
    ax.set(title=f'Countplot of {col}', xlabel=col, ylabel='Count')
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    plt.show()

In [None]:
# Number of training rows that exactly repeat an earlier row.
X_train.duplicated(keep='first').sum()

In [None]:
# Remove the identifier and the two features excluded from modelling.
X_train = X_train.drop(columns=['id', 'SCC', 'SMOKE'])

In [None]:
# Numeric sub-frame of the training features; its column labels are shown below.
num_col = X_train.select_dtypes(include='number')
num_col.columns

In [None]:
# Standardise the numeric columns: statistics are learned from the training
# split only and then applied, unchanged, to every split.
sc = StandardScaler().fit(X_train[num_col.columns])
for split in (X_train, X_test, X_val):
    split[num_col.columns] = sc.transform(split[num_col.columns])

Preprocessing on train data

In [None]:
# Object-dtype (string) sub-frame of the training features; labels shown below.
cat_cols = X_train.select_dtypes(include='object')
cat_cols.columns

In [None]:
# List the distinct values of every string-typed training column.
for col, values in X_train.select_dtypes(include='object').items():
    print(col, ':', values.unique())

In [None]:
# Snap NCP and FAF to the nearest whole number.
# NOTE(review): if NCP/FAF are among the numeric columns already standardised
# by `sc` in the earlier scaling cell, this rounds z-scores rather than the raw
# values — confirm the intended order of the two steps.
X_train[['NCP', 'FAF']] = X_train[['NCP', 'FAF']].round()

In [None]:
# Map each two-level string column to integer codes. The encoder is fitted on
# the training split and the same mapping is applied to test and validation.
binary_col = ['family_history_with_overweight', 'FAVC', 'Gender']
for col in binary_col:
    encoder = LabelEncoder().fit(X_train[col])
    X_train[col] = encoder.transform(X_train[col])
    X_test[col] = encoder.transform(X_test[col])
    X_val[col] = encoder.transform(X_val[col])

In [None]:
# Encode the ordered string features as integer ranks; each category list is
# given lowest-to-highest and pairs positionally with `ordinal_features`.
ordinal_features = ['CAEC', 'CALC']
categories = [
    ['no', 'Sometimes', 'Frequently', 'Always'],  # CAEC
    ['no', 'Sometimes', 'Frequently'],            # CALC
]
encoder_ord = OrdinalEncoder(categories=categories).fit(X_train[ordinal_features])
X_train[ordinal_features] = encoder_ord.transform(X_train[ordinal_features])
X_train[ordinal_features] = X_train[ordinal_features].astype(int)

In [None]:
# One-hot encode the listed columns (first level dropped), then swap the
# original columns for the generated indicator columns.
columns_to_encode = ['Gender', 'family_history_with_overweight', 'FAVC', 'MTRANS']

encoder = OneHotEncoder(drop="first", sparse_output=False).fit(X_train[columns_to_encode])
x_train_onehot = encoder.transform(X_train[columns_to_encode])
onehot_columns = encoder.get_feature_names_out(columns_to_encode)

# Keep the original row index so the concat below aligns row-for-row.
onehot_df = pd.DataFrame(data=x_train_onehot, index=X_train.index, columns=onehot_columns)

X_train = pd.concat([X_train.drop(columns=columns_to_encode), onehot_df], axis=1)


In [None]:
# Learn the class -> integer mapping from the training labels, then apply it.
encoder_label = LabelEncoder().fit(y_train)
y_train = encoder_label.transform(y_train)

In [None]:
# Standardise the fully encoded training matrix; `scaler` keeps the fitted
# statistics so the same transform can be applied to other splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)

Preprocessing on Validation data

In [None]:
# Drop the same three columns that were removed from the training features.
X_val = X_val.drop(columns=["id", "SMOKE", "SCC"])

In [None]:
# Snap NCP and FAF to the nearest whole number, mirroring the training step.
# NOTE(review): if these columns were already standardised by `sc` in the
# earlier scaling cell, this rounds z-scores rather than raw values — confirm.
X_val[['NCP', 'FAF']] = X_val[['NCP', 'FAF']].round()

In [None]:
# Apply the ordinal encoder fitted on the training split to the validation
# features. `categories` is re-declared here but not read by this cell.
ordinal_features = ['CAEC', 'CALC']
categories = [
    ['no', 'Sometimes', 'Frequently', 'Always'],  # CAEC
    ['no', 'Sometimes', 'Frequently'],            # CALC
]
val_ranks = encoder_ord.transform(X_val[ordinal_features])
X_val[ordinal_features] = val_ranks
X_val[ordinal_features] = X_val[ordinal_features].astype(int)


In [None]:
# Apply the one-hot encoder fitted on the training split to the validation
# features, then swap the original columns for the indicator columns.
columns_to_encode = ['Gender', 'family_history_with_overweight', 'FAVC', 'MTRANS']

X_val_onehot = encoder.transform(X_val[columns_to_encode])
onehot_columns = encoder.get_feature_names_out(columns_to_encode)

# Keep the original row index so the concat below aligns row-for-row.
onehot_df = pd.DataFrame(data=X_val_onehot, index=X_val.index, columns=onehot_columns)

X_val = pd.concat([X_val.drop(columns=columns_to_encode), onehot_df], axis=1)


In [None]:
# Encode the validation labels with the LabelEncoder that was fitted on
# y_train, so validation uses exactly the integer codes the models were
# trained on. Re-fitting a fresh encoder on y_val would silently shift the
# codes whenever a class is missing from the validation split, and would
# overwrite the training-fitted `encoder_label`.
# An unseen label now raises ValueError instead of being re-numbered.
y_val = encoder_label.transform(y_val)

In [None]:
# Apply the scaler fitted on the encoded training matrix (no re-fitting on validation data).
X_val = scaler.transform(X_val)

# **Modeling**


**Logistic Regression**

In [None]:
# Logistic-regression baseline: fit on train, report validation accuracy in percent.
clf = LogisticRegression(random_state=0, max_iter=10000)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_val)
acc = 100 * accuracy_score(y_val, y_pred)
print("Accuracy on valdation=", acc)

**Support Vector Machine**

In [None]:
# RBF-kernel SVM: fit on the training split, then report accuracy on both the
# training and validation splits, followed by a per-class report.
svm = SVC(kernel="rbf", gamma=0.5, C=1.0)
svm.fit(X_train,y_train)
y_pred = svm.predict(X_val)
score=svm.score(X_train, y_train)
print("Training score: ",score)

score=svm.score(X_val, y_val)
print("Valdiation: ",score)

# classification_report takes (y_true, y_pred) in that order; passing them
# the other way round exchanges precision and recall for every class.
print(classification_report(y_val, y_pred))

**Decision Tree**

In [None]:
# Depth-limited decision tree: compare training accuracy with validation accuracy.
DT = DecisionTreeClassifier(random_state=42, max_depth=9)
DT.fit(X_train, y_train)
y_pred = DT.predict(X_val)

score = DT.score(X_train, y_train)
print("Accuracy onTrain: ", score)

accuracy = accuracy_score(y_val, y_pred)
print("Accuracy on Valdation:", accuracy)

**Random Forest**

In [None]:
# Random forest with 100 trees: fit on train, report validation accuracy.
classifier = RandomForestClassifier(random_state=42, n_estimators=100)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_val)

accuracy = accuracy_score(y_val, y_pred)
print(f'Accuracy on Valdation: {accuracy * 100:.2f}%')

**GradientBoosting**

In [None]:
# Gradient-boosted trees: fit on train, print training accuracy and a
# per-class report on the validation split.
# random_state is pinned, as for the other seeded models in this notebook, so
# repeated runs produce the same fitted model and the same reported numbers.
gb = GradientBoostingClassifier(n_estimators = 150, random_state=42)
gb.fit(X_train, y_train)
y_pred = gb.predict(X_val)
score=gb.score(X_train, y_train)
print("Training score: ",score)

report=classification_report(y_val, y_pred)
print("\nClassification Report:")
print(report)


**XGBoost**

In [None]:
# XGBoost baseline with 100 boosting rounds: fit on train, report validation accuracy.
xgb = XGBClassifier(random_state=42, n_estimators=100)
xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_val)

accuracy = accuracy_score(y_val, y_pred)
print(f'Accuracy on Valdation: {accuracy * 100:.2f}%')

**Generalization**

In [None]:
# Hyper-parameter grid (3 x 3 x 3 combinations) and the metric used to rank them.
params = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.1, 0.01, 0.001],
    max_depth=[3, 6, 9],
)

score = 'accuracy'

In [None]:
# Exhaustive cross-validated search over `params` for the XGBoost estimator,
# using all available cores; prints the winning setting and its mean CV score.
model_xgb = GridSearchCV(
    estimator=xgb,
    param_grid=params,
    scoring=score,
    n_jobs=-1,
)
model_xgb.fit(X_train, y_train)
print(model_xgb.best_params_)
print(model_xgb.best_score_)