# Random Forest
random forest model is a type of ensemble learning method, which is an alternative to the bagging method. It is a meta learning model that `combines the predictions of multiple decision trees`. It is also known as an ensemble of decision trees.  

In [48]:
# Import Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, classification_report, accuracy_score, confusion_matrix
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [49]:
#load the dataset of tips
df=sns.load_dataset('tips')
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [50]:
# check for null values
df.isnull().sum()

total_bill    0
tip           0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64

In [51]:
# check for duplicate values
df.duplicated().sum()

1

In [52]:
# drop duplicate values
df.drop_duplicates(inplace=True)
df.duplicated().sum()

0

In [53]:
# check for outliers
df.describe()

Unnamed: 0,total_bill,tip,size
count,243.0,243.0,243.0
mean,19.813868,3.002387,2.572016
std,8.910071,1.385002,0.952356
min,3.07,1.0,1.0
25%,13.38,2.0,2.0
50%,17.81,2.92,2.0
75%,24.175,3.575,3.0
max,50.81,10.0,6.0


In [54]:
# check for categorical columns
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 243 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  243 non-null    float64 
 1   tip         243 non-null    float64 
 2   sex         243 non-null    category
 3   smoker      243 non-null    category
 4   day         243 non-null    category
 5   time        243 non-null    category
 6   size        243 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 9.1 KB


In [55]:
# check for categorical columns
df.select_dtypes(include='category').columns

Index(['sex', 'smoker', 'day', 'time'], dtype='object')

In [56]:
# check for numerical columns
df.select_dtypes(exclude='object').columns

Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object')

In [57]:
# encode the catagorical and object columns using for loop
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
for col in df.columns:
    if df[col].dtypes=='object' or df[col].dtypes=='category':
        df[col]=le.fit_transform(df[col])
df.head()


Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,0,0,2,0,2
1,10.34,1.66,1,0,2,0,3
2,21.01,3.5,1,0,2,0,3
3,23.68,3.31,1,0,2,0,2
4,24.59,3.61,0,0,2,0,4


## For Classification

In [58]:
# split the data into features(x) and target(y) for classification
X=df.drop('sex', axis=1)
y=df['sex']


In [59]:
# split the data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [60]:
# train the model
from sklearn.ensemble import RandomForestClassifier
model=RandomForestClassifier(n_estimators=200, random_state=42)
model.fit(X_train, y_train)

# predict the model
y_pred=model.predict(X_test)

#Evaluate the model
print('accuracy_score', accuracy_score(y_test, y_pred))
print('Classification_Report:\n', classification_report(y_test, y_pred))
print('confusion_matrix', confusion_matrix(y_test, y_pred))


accuracy_score 0.6122448979591837
Classification_Report:
               precision    recall  f1-score   support

           0       0.33      0.36      0.34        14
           1       0.74      0.71      0.72        35

    accuracy                           0.61        49
   macro avg       0.53      0.54      0.53        49
weighted avg       0.62      0.61      0.62        49

confusion_matrix [[ 5  9]
 [10 25]]


## For Regression

In [64]:
# load the dataset of tips
df=sns.load_dataset('tips')
df.head()


Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [68]:
# encode the catagorical and object columns using for loop
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
for col in df.columns:
    if df[col].dtypes=='object' or df[col].dtypes=='continous':
        df[col]=le.fit_transform(df[col])
df.head()


Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,0,0,2,0,2
1,10.34,1.66,1,0,2,0,3
2,21.01,3.5,1,0,2,0,3
3,23.68,3.31,1,0,2,0,2
4,24.59,3.61,0,0,2,0,4


In [69]:
X=df.drop('tip', axis=1)
y=df['tip']

# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [71]:
# Create and traing the model
mdoel = RandomForestRegressor(n_estimators=200, random_state=42)
model.fit(X_train, y_train)
y_pred=model.predict(X_test)

# Evaluate the model
print('mean squared error', mean_squared_error(y_test, y_pred))
print('mean absolute error', mean_absolute_error(y_test, y_pred))
print('r2_score', r2_score(y_test, y_pred))



ValueError: Unknown label type: continuous. Maybe you are trying to fit a classifier, which expects discrete classes on a regression target with continuous values.