Random Forest - Classification

In [1]:
#Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
#Importing the dataset
data = pd.read_csv("/content/drive/MyDrive/4th SEM/ML/Datasets/ObesityDataSet.csv")
data.head()

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,21.0,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight
1,Female,21.0,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,Male,23.0,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal_Weight
3,Male,27.0,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Overweight_Level_I
4,Male,22.0,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II


In [3]:
#Checking missing values
data.isna().sum()

Gender                            0
Age                               0
Height                            0
Weight                            0
family_history_with_overweight    0
FAVC                              0
FCVC                              0
NCP                               0
CAEC                              0
SMOKE                             0
CH2O                              0
SCC                               0
FAF                               0
TUE                               0
CALC                              0
MTRANS                            0
NObeyesdad                        0
dtype: int64

In [4]:
#Encoding categorical variables
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
cols = ['Gender', 'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE', 'SCC', 'CALC', 'MTRANS', 'NObeyesdad']
for i in cols:
  data[i] = le.fit_transform(data[i])

In [5]:
#Splitting target and features
x = data.iloc[:,:-1].values
y = data.iloc[:, [-1]].values

In [6]:
#Splitting train and test data
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(x,y,test_size=0.2,random_state=5)

In [7]:
#Training the model
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100, random_state=60)
clf.fit(xtrain, ytrain)

  clf.fit(xtrain, ytrain)


In [8]:
y_pred=clf.predict(xtest)

In [9]:
#Confusion matrix
from sklearn.metrics import confusion_matrix
cm=confusion_matrix(ytest, y_pred)
cm

array([[50,  4,  0,  0,  0,  0,  0],
       [ 1, 64,  0,  0,  0,  2,  1],
       [ 0,  2, 73,  0,  0,  0,  1],
       [ 0,  0,  0, 46,  0,  0,  0],
       [ 0,  0,  0,  0, 68,  0,  0],
       [ 0,  5,  0,  0,  0, 48,  1],
       [ 0,  0,  1,  0,  0,  1, 55]])

In [10]:
#Classification Report
from sklearn.metrics import classification_report
print(classification_report(ytest,y_pred))

              precision    recall  f1-score   support

           0       0.98      0.93      0.95        54
           1       0.85      0.94      0.90        68
           2       0.99      0.96      0.97        76
           3       1.00      1.00      1.00        46
           4       1.00      1.00      1.00        68
           5       0.94      0.89      0.91        54
           6       0.95      0.96      0.96        57

    accuracy                           0.96       423
   macro avg       0.96      0.95      0.96       423
weighted avg       0.96      0.96      0.96       423



In [11]:
#Evaluation metrics
from sklearn.metrics import accuracy_score
print("Accuracy:", accuracy_score(ytest, y_pred))

Accuracy: 0.9550827423167849


Random Forest - Regression

In [12]:
#Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [13]:
#Importing the dataset
data = pd.read_csv("/content/drive/MyDrive/4th SEM/ML/Datasets/Salary_Data.csv")
data.head()

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary
0,32.0,Male,Bachelor's,Software Engineer,5.0,90000.0
1,28.0,Female,Master's,Data Analyst,3.0,65000.0
2,45.0,Male,PhD,Senior Manager,15.0,150000.0
3,36.0,Female,Bachelor's,Sales Associate,7.0,60000.0
4,52.0,Male,Master's,Director,20.0,200000.0


In [14]:
#Checking missing values
data.isna().sum()

Age                    2
Gender                 2
Education Level        3
Job Title              2
Years of Experience    3
Salary                 5
dtype: int64

In [15]:
#Handling missing values
data = data.dropna()

In [16]:
#Encoding categorical variables
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
data['Gender'] = le.fit_transform(data['Gender'])
data['Education Level'] = le.fit_transform(data['Education Level'])
data['Job Title'] = le.fit_transform(data['Job Title'])

In [17]:
#Splitting target and features
x = data.iloc[:,:-1].values
y = data.iloc[:, [-1]].values

In [18]:
#Splitting train and test data
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(x,y,test_size=0.2,random_state=5)

In [19]:
#Fitting the model
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(n_estimators=100, max_depth=5)
model.fit(xtrain, ytrain)

  model.fit(xtrain, ytrain)


In [20]:
ypred = model.predict(xtest)

In [21]:
from sklearn.metrics import r2_score
print("R2 Score:",r2_score(ytest,ypred))

R2 Score: 0.8937772275345373
