## Random Forest Regressor with Pipelines and Hyperparameter Tuning
Here we want to predict the 'total_bill' feature for the 'tips' dataset

In [1]:
import seaborn as sns

In [2]:
df = sns.load_dataset('tips')

In [3]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.4 KB


In [5]:
## Independent and Dependent Features

x = df.drop(labels='total_bill', axis=1)
y = df.total_bill

In [6]:
x.head()

Unnamed: 0,tip,sex,smoker,day,time,size
0,1.01,Female,No,Sun,Dinner,2
1,1.66,Male,No,Sun,Dinner,3
2,3.5,Male,No,Sun,Dinner,3
3,3.31,Male,No,Sun,Dinner,2
4,3.61,Female,No,Sun,Dinner,4


In [7]:
y.head()

0    16.99
1    10.34
2    21.01
3    23.68
4    24.59
Name: total_bill, dtype: float64

In [8]:
## Train-Test Split
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

In [9]:
from sklearn.impute import SimpleImputer # Handling missing values 
from sklearn.preprocessing import StandardScaler # Feature scaling 
from sklearn.preprocessing import OneHotEncoder # Handling categorical feature
from sklearn.pipeline import Pipeline # Automation
from sklearn.compose import ColumnTransformer # Connecting to the pipeline

In [10]:
categorical_features = ['sex', 'smoker', 'day', 'time']
numerical_features = ['tip', 'size']

### Feature Engineering Automation using Pipelines

In [11]:
numerical_pipeline = Pipeline(
    steps = [
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ]
)


categorical_pipeline = Pipeline(
    steps = [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehotencoder', OneHotEncoder())
    ]
)

In [12]:
preposessor = ColumnTransformer([
    ('numerical_pipeline', numerical_pipeline, numerical_features),
    ('categorical_pipeline', categorical_pipeline, categorical_features)

])

In [13]:
x_train = preposessor.fit_transform(x_train)
x_test = preposessor.transform(x_test)

In [14]:
x_train

array([[-0.2580329 , -0.61214068,  0.        , ...,  0.        ,
         1.        ,  0.        ],
       [-0.74211442, -0.61214068,  0.        , ...,  0.        ,
         1.        ,  0.        ],
       [ 0.6399734 , -0.61214068,  0.        , ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [-1.46472887, -0.61214068,  1.        , ...,  0.        ,
         1.        ,  0.        ],
       [ 0.32426806, -0.61214068,  0.        , ...,  0.        ,
         1.        ,  0.        ],
       [-0.41237773,  0.45363997,  1.        , ...,  0.        ,
         1.        ,  0.        ]])

In [15]:
x_test

array([[ 0.06468811, -0.61214068,  0.        ,  1.        ,  1.        ,
         0.        ,  0.        ,  1.        ,  0.        ,  0.        ,
         1.        ,  0.        ],
       [-0.76316144, -0.61214068,  0.        ,  1.        ,  1.        ,
         0.        ,  0.        ,  0.        ,  1.        ,  0.        ,
         1.        ,  0.        ],
       [-0.76316144,  1.51942062,  0.        ,  1.        ,  1.        ,
         0.        ,  0.        ,  0.        ,  1.        ,  0.        ,
         1.        ,  0.        ],
       [ 1.45379161,  1.51942062,  0.        ,  1.        ,  0.        ,
         1.        ,  0.        ,  1.        ,  0.        ,  0.        ,
         1.        ,  0.        ],
       [-0.76316144, -0.61214068,  1.        ,  0.        ,  0.        ,
         1.        ,  0.        ,  0.        ,  0.        ,  1.        ,
         0.        ,  1.        ],
       [-0.76316144, -0.61214068,  0.        ,  1.        ,  0.        ,
         1.        ,  

### Model Training

In [16]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR

In [17]:
models = {
    'Random Forest' : RandomForestRegressor(),
    'Decison Tree' : DecisionTreeRegressor(),
    'SVR' : SVR

}

In [33]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [21]:
def evaluate_model(x_train, x_test, y_train, y_test, models):
    
    report = {}
    
    for i in range(len(models)):
        model = list(models.values())[i]
        
        model.fit(x_train, y_train)
        
        y_pred = model.predict(x_test)
        
        model_accuracy = accuracy_score(y_test, y_pred)
        
        report[list(models.keys())[i]] = model_accuracy
        
    return report  

In [22]:
evaluate_model(x_train, x_test, y_train, y_test, models)

ValueError: continuous is not supported

In [36]:
def evaluate_model(x_train, y_train, x_test, y_test, models):
    report = {}

    for i in range(len(models)):
        model = list(models.values())[i]

        model.fit(x_train, y_train)

        y_pred = model.predict(x_test)

        model_mse = mean_squared_error(y_test, y_pred) 
        model_r2 = r2_score(y_test, y_pred)
        report[list(models.keys())[i]] = model_r2

    return report


In [37]:
evaluate_model(x_train, y_train, x_test, y_test, models)

TypeError: BaseLibSVM.fit() missing 1 required positional argument: 'y'