# Stage-1 : Data Exploration and Cleaning

### Importing Required Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import scipy.stats as ss
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

### Import Training and Testing Data "df_train" and "df_test"

In [2]:
df_train = pd.read_csv('/kaggle/input/ml-project-data/train.csv')
df_test = pd.read_csv('/kaggle/input/ml-project-data/test.csv')

### Checking null values by counting missing numbers

In [3]:
print(df_train.isnull().sum())
print(df_test.isnull().sum())

id                                0
Gender                            0
Age                               0
Height                            0
Weight                            0
family_history_with_overweight    0
FAVC                              0
FCVC                              0
NCP                               0
CAEC                              0
SMOKE                             0
CH2O                              0
SCC                               0
FAF                               0
TUE                               0
CALC                              0
MTRANS                            0
WeightCategory                    0
dtype: int64
id                                0
Gender                            0
Age                               0
Height                            0
Weight                            0
family_history_with_overweight    0
FAVC                              0
FCVC                              0
NCP                               0
CAEC           

### Checking duplicate rows it can bias model training so remove if it exists

In [4]:
print("Train duplicates:", df_train.duplicated().sum())
print("Test duplicates:", df_test.duplicated().sum())

Train duplicates: 0
Test duplicates: 0


### Inspecting unique values in categorical columns so that we can understand where to apply one-hot encoding 

In [5]:
for col in ['Gender', 'MTRANS', 'FAVC', 'FCVC', 'CAEC', 'SMOKE', 'SCC', 'CALC']:
    print(f"{col} unique values:", df_train[col].unique())

Gender unique values: ['Male' 'Female']
MTRANS unique values: ['Public_Transportation' 'Automobile' 'Walking' 'Motorbike' 'Bike']
FAVC unique values: ['yes' 'no']
FCVC unique values: [2.         1.880534   3.         2.679664   2.919751   1.99124
 1.397468   2.636719   1.         1.392665   2.203962   2.971588
 2.668949   1.98989905 2.417635   2.219186   2.919526   2.263245
 2.649406   1.754401   2.303656   2.020785   2.068834   2.689929
 2.979383   2.225731   2.843456   2.312528   2.962415   2.945967
 2.108638   1.826885   2.200588   2.598051   2.984425   1.387489
 2.76533    2.941627   2.490776   2.801514   2.336044   1.270448
 2.9673     2.325623   2.722161   2.680375   2.938801   2.431346
 1.994679   2.393837   1.428289   2.341999   2.967853   1.899116
 1.906194   2.859097   2.997951   2.499388   1.4925     2.239634
 2.587789   2.795086   2.805512   2.048962   2.319776   2.823179
 1.188089   2.671238   1.882235   2.61939    2.191429   2.995599
 2.594653   1.369529   2.457548   2.73

### Calculate descriptie statistics for numerical columns so we can identify outliers present in particular column 

In [6]:
df_train[['Height','Weight','Age']].describe()

Unnamed: 0,Height,Weight,Age
count,15533.0,15533.0,15533.0
mean,1.699918,87.785225,23.816308
std,0.08767,26.369144,5.663167
min,1.45,39.0,14.0
25%,1.630927,66.0,20.0
50%,1.7,84.0,22.771612
75%,1.762921,111.600553,26.0
max,1.975663,165.057269,61.0
