In [30]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split , GridSearchCV
from sklearn.preprocessing import OneHotEncoder , StandardScaler , MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline 
from sklearn.compose import ColumnTransformer
from sklearn.metrics import r2_score

from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor,GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

In [7]:
# Read the raw dataset; the path is resolved relative to the working directory.
df = pd.read_csv(filepath_or_buffer="data.csv")

In [8]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,price,crime_rate,resid_area,air_qual,room_num,age,teachers,poor_prop,airport,n_hos_beds,n_hot_rooms,waterbody,rainfall,parks
0,24.0,0.00632,32.31,0.538,6.575,65.2,24.7,4.98,YES,5.48,11.192,River,23,0.049347
1,21.6,0.02731,37.07,0.469,6.421,78.9,22.2,9.14,NO,7.332,12.1728,Lake,42,0.046146
2,34.7,0.02729,37.07,0.469,7.185,61.1,22.2,4.03,NO,7.394,101.12,,38,0.045764
3,33.4,0.03237,32.18,0.458,6.998,45.8,21.3,2.94,YES,9.268,11.2672,Lake,45,0.047151
4,36.2,0.06905,32.18,0.458,7.147,54.2,21.3,5.33,NO,8.824,11.2896,Lake,55,0.039474


In [9]:
# Column labels grouped by dtype on the *full* frame: everything that is not
# object-typed vs. everything that is. Because this runs on `df`, every
# non-object column of the raw data is part of `num_feature` here.
num_feature, cat_feature = (
    df.select_dtypes(exclude='O').columns,
    df.select_dtypes(include='O').columns,
)

In [10]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
train_set, test_set = train_test_split(
    df,
    test_size=0.2,
    random_state=30,
)

In [51]:
# Sanity check: number of target values in the training split.
train_set.loc[:, 'price'].shape

(404,)

In [54]:
# Sanity check: number of target values in the test split.
# Read the column straight from `test_set`, which already exists at this point
# of a top-to-bottom run, instead of a variable that is only created further down.
test_set['price'].shape

In [55]:
# Name of the label column. It is defined here so the cell no longer relies on
# a `target_feature` variable left over in the kernel from a cell that is not
# part of the notebook.
target_feature = 'price'

# Predictors = every column except the label; target = the label column alone.
x_train_df = train_set.drop(columns=[target_feature])
y_train_df = train_set[target_feature]

x_test_df = test_set.drop(columns=[target_feature])
y_test_df = test_set[target_feature]

In [14]:
# Feature groups derived from the training predictors (target already removed):
# non-object columns feed the numeric pipeline, object columns the categorical one.
num_feature = x_train_df.select_dtypes(exclude='O').columns
cat_feature = x_train_df.select_dtypes(include='O').columns

In [17]:
# Numeric columns: fill missing values with the column mean, then rescale with
# MinMaxScaler.
num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("scale", MinMaxScaler()),
])

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode.
# handle_unknown="ignore" makes a category that was not present when the
# encoder was fitted encode as all zeros at transform time instead of raising.
cat_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown="ignore")),
])

In [18]:
# Route each feature group through its own preprocessing pipeline and
# concatenate the results into a single feature matrix.
transformer = ColumnTransformer(
    transformers=[
        ('num_pipeline', num_pipeline, num_feature),
        ("cat_pipeline", cat_pipeline, cat_feature),
    ]
)

In [20]:
# Learn the preprocessing statistics on the training predictors only ...
x_train_trans = transformer.fit_transform(x_train_df)
# ... and re-use them unchanged on the test predictors.
x_test_trans = transformer.transform(x_test_df)

In [59]:
# Candidate regressors, keyed by the name used in `params` and in the report.
# Every estimator that draws random numbers gets a fixed seed (the same value
# as the train/test split) so repeated runs produce the same scores.
seed = 30

models = {
    "LinearRegression": LinearRegression(),
    "SVR": SVR(),
    "RandomForest": RandomForestRegressor(random_state=seed),
    "DecisionTree": DecisionTreeRegressor(random_state=seed),
    "AdaBoostRegressor": AdaBoostRegressor(random_state=seed),
    "GradientBoost": GradientBoostingRegressor(random_state=seed),
    "Xgboost": XGBRegressor(random_state=seed),
}

# Hyper-parameter grids, one per entry in `models` (same keys).
# Each inner mapping is "estimator argument -> list of values to try".
params = {
    "LinearRegression": dict(
        fit_intercept=[True, False],
        positive=[True, False],
    ),
    "SVR": dict(
        kernel=["linear", "rbf"],
        C=[0.1, 1, 10],
        epsilon=[0.01, 0.1],
        gamma=["scale", "auto"],
    ),
    "RandomForest": dict(
        n_estimators=[100, 200],
        max_depth=[None, 10, 20],
        min_samples_split=[2, 5],
        min_samples_leaf=[1, 2],
    ),
    "DecisionTree": dict(
        max_depth=[None, 5, 10, 20],
        min_samples_split=[2, 5, 10],
        min_samples_leaf=[1, 2, 5],
    ),
    "AdaBoostRegressor": dict(
        n_estimators=[50, 100, 200],
        learning_rate=[0.01, 0.1, 1.0],
    ),
    "GradientBoost": dict(
        n_estimators=[100, 200],
        learning_rate=[0.05, 0.1],
        max_depth=[3, 5],
        subsample=[0.8, 1.0],
    ),
    "Xgboost": dict(
        n_estimators=[100, 300],
        learning_rate=[0.01, 0.1],
        max_depth=[3, 5, 7],
        subsample=[0.8, 1.0],
        colsample_bytree=[0.8, 1.0],
    ),
}


In [60]:
# Sanity check: the training target should be one-dimensional.
y_train_df.shape

(404,)

In [61]:
# Tune every candidate with a grid search on the training data, then score the
# tuned model on the held-out test set.
report = {}  # model name -> R^2 on the test set
for model_name, model in models.items():
    # `params` uses the same keys as `models`, so look the grid up by name
    # rather than by list position.
    param = params[model_name]

    # cv and scoring are left at the GridSearchCV defaults; n_jobs=-1 runs the
    # candidate fits in parallel on all available cores.
    grid = GridSearchCV(model, param, n_jobs=-1)
    grid.fit(x_train_trans, y_train_df)

    # With the default refit=True, predict() uses the best parameter
    # combination refit on the whole training set.
    y_pred = grid.predict(x_test_trans)
    r2 = r2_score(y_test_df, y_pred)
    report[model_name] = r2
    print(model_name)  # progress marker: one line per finished model

LinearRegression
SVR
RandomForest
DecisionTree
AdaBoostRegressor
GradientBoost
Xgboost


In [67]:
# Highest test-set R^2 achieved by any model in the report.
maximum = max(report.values())

In [85]:
# Name of the model with the highest test-set R^2.
# max() over the dict iterates its keys and compares them by their score, so no
# positional reverse lookup (and no value computed in another cell) is needed.
# On a tie the first model in insertion order wins, as before.
best_model = max(report, key=report.get)