### ML Code 

Estimation of obesity levels based on eating habits and physical condition in individuals

In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, accuracy_score
import numpy as np

In [2]:
from sklearn.preprocessing import LabelEncoder

In [3]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("sahilislam007/college-student-placement-factors-dataset")

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
df = pd.read_csv('college_student_placement_dataset.csv')

In [11]:
df.head()

Unnamed: 0,College_ID,IQ,Prev_Sem_Result,CGPA,Academic_Performance,Internship_Experience,Extra_Curricular_Score,Communication_Skills,Projects_Completed,Placement
0,CLG0030,107,6.61,6.28,8,No,8,8,4,No
1,CLG0061,97,5.52,5.37,8,No,7,8,0,No
2,CLG0036,109,5.36,5.83,9,No,3,1,1,No
3,CLG0055,122,5.47,5.75,6,Yes,1,6,1,No
4,CLG0004,96,7.91,7.69,7,No,8,10,2,No


In [12]:
df.describe()

Unnamed: 0,IQ,Prev_Sem_Result,CGPA,Academic_Performance,Extra_Curricular_Score,Communication_Skills,Projects_Completed
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,99.4718,7.535673,7.532379,5.5464,4.9709,5.5618,2.5134
std,15.053101,1.447519,1.470141,2.873477,3.160103,2.900866,1.715959
min,41.0,5.0,4.54,1.0,0.0,1.0,0.0
25%,89.0,6.29,6.29,3.0,2.0,3.0,1.0
50%,99.0,7.56,7.55,6.0,5.0,6.0,3.0
75%,110.0,8.79,8.77,8.0,8.0,8.0,4.0
max,158.0,10.0,10.46,10.0,10.0,10.0,5.0


In [13]:
df.isnull().sum() #no missibg values

College_ID                0
IQ                        0
Prev_Sem_Result           0
CGPA                      0
Academic_Performance      0
Internship_Experience     0
Extra_Curricular_Score    0
Communication_Skills      0
Projects_Completed        0
Placement                 0
dtype: int64

In [14]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   College_ID              10000 non-null  object 
 1   IQ                      10000 non-null  int64  
 2   Prev_Sem_Result         10000 non-null  float64
 3   CGPA                    10000 non-null  float64
 4   Academic_Performance    10000 non-null  int64  
 5   Internship_Experience   10000 non-null  object 
 6   Extra_Curricular_Score  10000 non-null  int64  
 7   Communication_Skills    10000 non-null  int64  
 8   Projects_Completed      10000 non-null  int64  
 9   Placement               10000 non-null  object 
dtypes: float64(2), int64(5), object(3)
memory usage: 781.4+ KB


In [15]:
print("IQ Feature Range: ",  df['IQ'].min(), "to",df['IQ'].max())

IQ Feature Range:  41 to 158


In [16]:
df_feat= df.copy()

Feature 1: Categorize IQ

In [17]:
#Feature 1: Categorize IQ 
def iq_group(IQ):
    if IQ < 70:
        return "Low"
    elif IQ < 90:
        return "Below Average"
    elif IQ < 110:
        return "Average"
    elif IQ < 130:
        return "Above Average"
    else:
        return "High"

In [18]:
df_feat['IQ_group'] = df_feat["IQ"].apply(iq_group)

Feature 2: Categorize extra-curricular score

In [19]:
def extra_curr(score):
    if score == 0:
        return "None"
    elif score < 4:
        return "Low"
    elif score < 8:
        return "Moderate"
    else:
        return "High"

In [20]:
df_feat['extra_curr_score'] = df_feat['Extra_Curricular_Score'].apply(extra_curr)

Feature 3: Categorize Communication Score

In [21]:
def comm_score(score):
    if score < 4:
        return "Poor"
    elif score < 7:
        return "Fair"
    elif score < 9:
        return "Good"
    else:
        return "Excellent"

In [22]:
df_feat["Comm_score"] = df_feat["Communication_Skills"].apply(comm_score)

In [23]:
df_feat.sample(1)

Unnamed: 0,College_ID,IQ,Prev_Sem_Result,CGPA,Academic_Performance,Internship_Experience,Extra_Curricular_Score,Communication_Skills,Projects_Completed,Placement,IQ_group,extra_curr_score,Comm_score
271,CLG0064,78,6.43,6.34,5,No,7,4,0,No,Below Average,Moderate,Fair


Drop Old Irrelevant Columns

In [24]:
df_feat.drop(columns=['College_ID', 'Extra_Curricular_Score', 'Communication_Skills', 'IQ'])

Unnamed: 0,Prev_Sem_Result,CGPA,Academic_Performance,Internship_Experience,Projects_Completed,Placement,IQ_group,extra_curr_score,Comm_score
0,6.61,6.28,8,No,4,No,Average,High,Good
1,5.52,5.37,8,No,0,No,Average,Moderate,Good
2,5.36,5.83,9,No,1,No,Average,Low,Poor
3,5.47,5.75,6,Yes,1,No,Above Average,Low,Fair
4,7.91,7.69,7,No,2,No,Average,High,Excellent
...,...,...,...,...,...,...,...,...,...
9995,8.41,8.29,4,No,0,Yes,Above Average,Low,Good
9996,9.25,9.34,7,No,2,No,Below Average,,Good
9997,6.08,6.25,3,Yes,5,No,Below Average,Low,Excellent
9998,8.77,8.92,3,No,1,No,Average,Moderate,Fair


Select Features and Target

In [25]:
X = df_feat[['Prev_Sem_Result', 'CGPA', 'Academic_Performance', 'Internship_Experience', 'Projects_Completed', 'IQ_group', 'extra_curr_score', 'Comm_score']]
Y = df_feat['Placement']

In [26]:
X

Unnamed: 0,Prev_Sem_Result,CGPA,Academic_Performance,Internship_Experience,Projects_Completed,IQ_group,extra_curr_score,Comm_score
0,6.61,6.28,8,No,4,Average,High,Good
1,5.52,5.37,8,No,0,Average,Moderate,Good
2,5.36,5.83,9,No,1,Average,Low,Poor
3,5.47,5.75,6,Yes,1,Above Average,Low,Fair
4,7.91,7.69,7,No,2,Average,High,Excellent
...,...,...,...,...,...,...,...,...
9995,8.41,8.29,4,No,0,Above Average,Low,Good
9996,9.25,9.34,7,No,2,Below Average,,Good
9997,6.08,6.25,3,Yes,5,Below Average,Low,Excellent
9998,8.77,8.92,3,No,1,Average,Moderate,Fair


In [27]:
Y

0        No
1        No
2        No
3        No
4        No
       ... 
9995    Yes
9996     No
9997     No
9998     No
9999     No
Name: Placement, Length: 10000, dtype: object

In [28]:
X.sample(1)

Unnamed: 0,Prev_Sem_Result,CGPA,Academic_Performance,Internship_Experience,Projects_Completed,IQ_group,extra_curr_score,Comm_score
5090,5.6,5.93,3,No,3,Below Average,,Poor


Define Numerical and Categorical Featurea

In [29]:
categorical_features = ['IQ_group', 'extra_curr_score', 'Comm_score', 'Internship_Experience']
numerical_features = ['Prev_Sem_Result', 'CGPA', 'Academic_Performance', 'Projects_Completed']
target = ['Placement']

Column Transformer

In [30]:
preprocessor = ColumnTransformer(
    transformers = [
        ("cat", OneHotEncoder(), categorical_features),
        ("num", "passthrough", numerical_features),
    ]
)

In [31]:
le = LabelEncoder()
y = le.fit_transform(Y)

In [33]:
import pickle

In [34]:
# Save the fitted LabelEncoder
with open("label_encoder.pkl", "wb") as f:
    pickle.dump(le, f)

In [35]:
#Creating a pipeline
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(random_state = 42))
])

In [36]:
# Split data and train model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [37]:
pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [38]:
# Predict and evaluate
y_pred = pipeline.predict(X_test)
accuracy_score(y_test, y_pred)

0.9625

Save the Model

In [39]:
import pickle
pickle_model_path = "model.pkl"
with open (pickle_model_path, "wb") as f:
    pickle.dump(pipeline, f)