In [69]:
import pandas as pd
import numpy as np

In [70]:
train_df = pd.read_csv("train.csv")

In [71]:
train_df.head()

Unnamed: 0,ID,X0,X1,X2,X3,X4,X5,X6,X8,X10,...,X376,X377,X378,X379,X380,X382,X383,X384,X385,y
0,0,k,v,at,a,d,u,j,o,0,...,0,1,0,0,0,0,0,0,0,130.81
1,6,k,t,av,e,d,y,l,o,0,...,0,0,0,0,0,0,0,0,0,88.53
2,7,az,w,n,c,d,x,j,x,0,...,0,0,0,0,0,1,0,0,0,76.26
3,9,az,t,n,f,d,x,l,e,0,...,0,0,0,0,0,0,0,0,0,80.62
4,13,az,v,n,f,d,h,d,n,0,...,0,0,0,0,0,0,0,0,0,78.02


In [72]:
train_df.drop("ID", axis=1, inplace=True)

In [73]:
train_df.shape

(4209, 377)

In [74]:
test_df = pd.read_csv("test.csv")

### Preprocessing

In [75]:
# checking null values
train_df.isnull().sum()[train_df.isnull().sum() > 0]

Series([], dtype: int64)

In [76]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4209 entries, 0 to 4208
Columns: 377 entries, X0 to y
dtypes: float64(1), int64(368), object(8)
memory usage: 12.1+ MB


In [77]:
# Removing duplicated columns
def get_duplicate_columns(df):

    duplicate_columns = {}
    seen_columns = {}

    for column in df.columns:
        current_column = df[column]

        # Convert column data to bytes
        try:
            current_column_hash = current_column.values.tobytes()
        except AttributeError:
            current_column_hash = current_column.to_string().encode()

        if current_column_hash in seen_columns:
            if seen_columns[current_column_hash] in duplicate_columns:
                duplicate_columns[seen_columns[current_column_hash]].append(
                    column)
            else:
                duplicate_columns[seen_columns[current_column_hash]] = [column]
        else:
            seen_columns[current_column_hash] = column

    return duplicate_columns

In [78]:
duplicate_columns = get_duplicate_columns(train_df)

In [79]:
duplicate_columns

{'X31': ['X35', 'X37'],
 'X33': ['X39'],
 'X54': ['X76'],
 'X71': ['X84', 'X244'],
 'X11': ['X93',
  'X107',
  'X233',
  'X235',
  'X268',
  'X289',
  'X290',
  'X293',
  'X297',
  'X330',
  'X347'],
 'X90': ['X94', 'X242'],
 'X53': ['X102', 'X214', 'X239'],
 'X48': ['X113', 'X134', 'X147', 'X222'],
 'X118': ['X119'],
 'X88': ['X122', 'X243', 'X320'],
 'X138': ['X146'],
 'X62': ['X172', 'X216'],
 'X112': ['X199'],
 'X67': ['X213'],
 'X152': ['X226', 'X326'],
 'X125': ['X227'],
 'X29': ['X232', 'X279'],
 'X89': ['X245'],
 'X202': ['X247'],
 'X60': ['X248', 'X253', 'X385'],
 'X230': ['X254'],
 'X184': ['X262', 'X266'],
 'X295': ['X296'],
 'X298': ['X299'],
 'X44': ['X302'],
 'X58': ['X324'],
 'X155': ['X360'],
 'X240': ['X364', 'X365'],
 'X17': ['X382']}

In [80]:
for one_list in duplicate_columns.values():
    train_df.drop(columns=one_list, inplace=True)
    test_df.drop(columns=one_list, inplace=True)

In [81]:
print("after dropping duplicate columns")
print(train_df.shape)

after dropping duplicate columns
(4209, 321)


In [82]:
y = train_df["y"].values
X = train_df.drop("y", axis=1)

In [83]:
print("Before One Hot encoding shape of X:", X.shape)

Before One Hot encoding shape of X: (4209, 320)


In [84]:
X = pd.get_dummies(data=X, drop_first=True)

In [85]:

print("After One Hot encoding shape of X:", X.shape)

After One Hot encoding shape of X: (4209, 499)


In [86]:
# finding categerical columns

In [87]:
# preparing train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0)

In [88]:
# scaling X_train,X_test
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train_scaled = sc.fit_transform(X_train)
X_test_scaled = sc.transform(X_test)

# Random Forest Regressor

In [89]:

from sklearn.ensemble import RandomForestRegressor

In [90]:
print(X_train.shape)
X_train.head()

(3367, 499)


Unnamed: 0,X10,X11,X12,X13,X14,X15,X16,X17,X18,X19,...,X8_p,X8_q,X8_r,X8_s,X8_t,X8_u,X8_v,X8_w,X8_x,X8_y
3540,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3748,0,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1287,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2856,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1380,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [91]:
y_train

array([101.9 , 114.08, 106.31, ..., 102.77, 113.94, 110.1 ])

In [92]:
print(X_test.shape)
X_test.head()

(842, 499)


Unnamed: 0,X10,X11,X12,X13,X14,X15,X16,X17,X18,X19,...,X8_p,X8_q,X8_r,X8_s,X8_t,X8_u,X8_v,X8_w,X8_x,X8_y
3431,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2131,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2680,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
195,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3032,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [93]:
# RandomForest Regressor
model_rf = RandomForestRegressor()
model_rf.fit(X_train, y_train)
y_pred = model_rf.predict(X_test)

### Defining functions to get evaluation metric's scores and feature importance

In [94]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [95]:
def regression_results(actual_y, pred_y, X):
    '''This function takes actual and predicted value of target transform tem back and return scores of evaluation metrics'''

    # Reverse transforming the predicted output
    # y_true = np.expm1(actual_y)
    # y_pred = np.expm1(pred_y)
    y_true = actual_y
    y_pred = pred_y

    # Calculating regression metrics

    MAE = mean_absolute_error(y_true, y_pred)

    MSE = mean_squared_error(y_true, y_pred)

    RMSE = np.sqrt(MSE)

    r2 = r2_score(y_true, y_pred)

    adj_r2 = 1-(1-r2)*((X.shape[0]-1)/(X.shape[0]-X.shape[1]-1))

    return (round(MAE, 2), round(MSE, 2), round(RMSE, 2), round(r2, 2), round(adj_r2, 2))

In [96]:
def performance(model, X_train, X_test, y_train, y_test):
    '''This function takes model as input and return different evaluation metrics' score as dataframe'''

    # Doing prediction
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Calling the function regression_results
    train = regression_results(y_train, y_train_pred, X_train)
    test = regression_results(y_test, y_test_pred, X_test)

    # Storing the scores
    score = {'Metric': ['MAE', 'MSE', 'RMSE', 'r2', "adj_r2"],
             'Train Score': [train[0], train[1], train[2], train[3], train[4]],
             "Test Score": [test[0], test[1], test[2], test[3], test[4]]}

    # Create DataFrame
    df = pd.DataFrame(score)
    df.name = model
    return (df)

In [97]:
print("Scores of Random Forest")
performance(model_rf, X_train, X_test, y_train, y_test)

Scores of Random Forest


  df.name = model


Unnamed: 0,Metric,Train Score,Test Score
0,MAE,2.26,5.87
1,MSE,12.78,105.71
2,RMSE,3.57,10.28
3,r2,0.92,0.43
4,adj_r2,0.9,-0.4


In [98]:
from sklearn.decomposition import PCA
pca = PCA(None)
X_train_trf = pca.fit_transform(X_train_scaled)
X_test_trf = pca.transform(X_test_scaled)

In [99]:
pca.explained_variance_ratio_

array([4.69361662e-02, 4.01994759e-02, 3.44037561e-02, 2.66091492e-02,
       2.44469047e-02, 2.08936143e-02, 1.86324586e-02, 1.57457697e-02,
       1.31509596e-02, 1.28808094e-02, 1.24655126e-02, 1.20598248e-02,
       1.19164138e-02, 1.11662100e-02, 1.07597448e-02, 9.81991383e-03,
       9.31954921e-03, 9.07808701e-03, 8.44191027e-03, 8.28137951e-03,
       7.98513955e-03, 7.78959090e-03, 7.71114559e-03, 7.39393170e-03,
       7.14574331e-03, 6.73222411e-03, 6.71507436e-03, 6.62019785e-03,
       6.42888510e-03, 6.29910920e-03, 6.07109575e-03, 6.01054094e-03,
       5.83443117e-03, 5.73931275e-03, 5.64896791e-03, 5.61613082e-03,
       5.45867606e-03, 5.24994272e-03, 5.18383429e-03, 5.09303988e-03,
       5.02354760e-03, 5.01877234e-03, 4.94796743e-03, 4.85295751e-03,
       4.81181739e-03, 4.73491330e-03, 4.67934535e-03, 4.64007512e-03,
       4.57203997e-03, 4.50574381e-03, 4.45818367e-03, 4.41643404e-03,
       4.35735787e-03, 4.33754476e-03, 4.27899918e-03, 4.25342861e-03,
      

In [100]:
import numpy as np
cum_sum = np.cumsum(pca.explained_variance_ratio_)

In [101]:
cum_sum[cum_sum > .9]

array([0.90102602, 0.90252665, 0.90401846, 0.90549594, 0.90696559,
       0.90842579, 0.90986699, 0.9112927 , 0.91270734, 0.91410794,
       0.91550062, 0.91689058, 0.91827461, 0.91963872, 0.92100137,
       0.92235288, 0.92369442, 0.92501766, 0.9263356 , 0.92763335,
       0.92892354, 0.93019543, 0.93145957, 0.93271602, 0.9339517 ,
       0.93518317, 0.93640772, 0.93761528, 0.93881111, 0.93999966,
       0.94117158, 0.94233167, 0.94348871, 0.94462719, 0.94575588,
       0.946872  , 0.9479736 , 0.94906405, 0.95015128, 0.95122622,
       0.95229752, 0.95334647, 0.95438036, 0.95541358, 0.9564125 ,
       0.95740448, 0.95837443, 0.95933331, 0.96027823, 0.96120242,
       0.96211468, 0.9630109 , 0.96389169, 0.96476465, 0.96561605,
       0.96646169, 0.96729026, 0.96811029, 0.9689227 , 0.9697198 ,
       0.97049063, 0.9712516 , 0.97200106, 0.97272133, 0.97342988,
       0.97413306, 0.97482377, 0.97550955, 0.97617451, 0.9768254 ,
       0.97744962, 0.97806365, 0.97867142, 0.97927162, 0.97984

In [103]:
for idx, val in np.ndenumerate(cum_sum):
    if val > .95:
        print(idx, val)
        break

(236,) 0.9501512814642059


In [104]:
cum_sum[236:]

array([0.95015128, 0.95122622, 0.95229752, 0.95334647, 0.95438036,
       0.95541358, 0.9564125 , 0.95740448, 0.95837443, 0.95933331,
       0.96027823, 0.96120242, 0.96211468, 0.9630109 , 0.96389169,
       0.96476465, 0.96561605, 0.96646169, 0.96729026, 0.96811029,
       0.9689227 , 0.9697198 , 0.97049063, 0.9712516 , 0.97200106,
       0.97272133, 0.97342988, 0.97413306, 0.97482377, 0.97550955,
       0.97617451, 0.9768254 , 0.97744962, 0.97806365, 0.97867142,
       0.97927162, 0.97984799, 0.98041895, 0.98097217, 0.98152526,
       0.98206763, 0.98260271, 0.98312132, 0.98363566, 0.98414277,
       0.98463556, 0.98510023, 0.98556261, 0.98601494, 0.98645984,
       0.9868918 , 0.98731158, 0.98770676, 0.988098  , 0.9884806 ,
       0.98886225, 0.98923363, 0.98959264, 0.98994649, 0.99029354,
       0.99063107, 0.99096146, 0.9912746 , 0.99158538, 0.99189241,
       0.99219067, 0.99248321, 0.99276568, 0.99304687, 0.99331631,
       0.9935801 , 0.99384081, 0.99408983, 0.99432765, 0.99456

In [105]:
pca = PCA(n_components=236)
X_train_trf = pca.fit_transform(X_train_scaled)
X_test_trf = pca.transform(X_test_scaled)

In [106]:
rf_reg = RandomForestRegressor()

rf_reg.fit(X_train_trf, y_train)

In [107]:
print("Scores of Random Forest After PCA")
performance(rf_reg, X_train_trf, X_test_trf, y_train, y_test)

Scores of Random Forest After PCA


  df.name = model


Unnamed: 0,Metric,Train Score,Test Score
0,MAE,2.45,6.36
1,MSE,13.99,112.92
2,RMSE,3.74,10.63
3,r2,0.91,0.39
4,adj_r2,0.9,0.15


KeyboardInterrupt: 

### Break

In [168]:
cat_variables = train_df.select_dtypes("O").columns

In [169]:
cat_variables

Index(['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8'], dtype='object')

In [170]:
train_df[cat_variables]

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,X8
0,k,v,at,a,d,u,j,o
1,k,t,av,e,d,y,l,o
2,az,w,n,c,d,x,j,x
3,az,t,n,f,d,x,l,e
4,az,v,n,f,d,h,d,n
...,...,...,...,...,...,...,...,...
4204,ak,s,as,c,d,aa,d,q
4205,j,o,t,d,d,aa,h,h
4206,ak,v,r,a,d,aa,g,e
4207,al,r,e,f,d,aa,l,u


In [9]:
print(train_df[cat_variables].nunique())

NameError: name 'cat_variables' is not defined

In [172]:
# Removing duplicated columns
def get_duplicate_columns(df):

    duplicate_columns = {}
    seen_columns = {}

    for column in df.columns:
        current_column = df[column]

        # Convert column data to bytes
        try:
            current_column_hash = current_column.values.tobytes()
        except AttributeError:
            current_column_hash = current_column.to_string().encode()

        if current_column_hash in seen_columns:
            if seen_columns[current_column_hash] in duplicate_columns:
                duplicate_columns[seen_columns[current_column_hash]].append(
                    column)
            else:
                duplicate_columns[seen_columns[current_column_hash]] = [column]
        else:
            seen_columns[current_column_hash] = column

    return duplicate_columns

In [177]:
# finding correlated columns
corr_matrix = train_df.corr()

  corr_matrix = train_df.corr()


In [178]:
# Get the column names of the DataFrame
columns = corr_matrix.columns

# Create an empty list to keep track of columns to drop
columns_to_drop = []

# Loop over the columns
for i in range(len(columns)):
    for j in range(i + 1, len(columns)):
        # Access the cell of the DataFrame
        if corr_matrix.loc[columns[i], columns[j]] > 0.95:
            columns_to_drop.append(columns[j])

print(len(columns_to_drop))

33


In [179]:
try:
    for one_list in columns_to_drop:
        train_df.drop(columns=one_list, inplace=True)
        test_df.drop(columns=one_list, inplace=True)
except Exception as ex:
    print(ex)

"['X162'] not found in axis"


In [180]:
train_df.shape

(4209, 315)

In [181]:
test_df.shape

(4209, 315)

In [182]:
train_df.head()

Unnamed: 0,ID,X0,X1,X2,X3,X4,X5,X6,X8,X10,...,X374,X375,X376,X377,X378,X379,X380,X383,X384,y
0,0,k,v,at,a,d,u,j,o,0,...,0,0,0,1,0,0,0,0,0,130.81
1,6,k,t,av,e,d,y,l,o,0,...,0,1,0,0,0,0,0,0,0,88.53
2,7,az,w,n,c,d,x,j,x,0,...,0,0,0,0,0,0,0,0,0,76.26
3,9,az,t,n,f,d,x,l,e,0,...,0,0,0,0,0,0,0,0,0,80.62
4,13,az,v,n,f,d,h,d,n,0,...,0,0,0,0,0,0,0,0,0,78.02


In [183]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt

In [184]:
def regression_results(actual_y, pred_y, X):
    '''This function takes actual and predicted value of target transform tem back and return scores of evaluation metrics'''

    # Reverse transforming the predicted output
    # y_true = np.expm1(actual_y)
    # y_pred = np.expm1(pred_y)
    y_true = actual_y
    y_pred = pred_y

    # Calculating regression metrics

    MAE = mean_absolute_error(y_true, y_pred)

    MSE = mean_squared_error(y_true, y_pred)

    RMSE = np.sqrt(MSE)

    r2 = r2_score(y_true, y_pred)

    adj_r2 = 1-(1-r2)*((X.shape[0]-1)/(X.shape[0]-X.shape[1]-1))

    return (round(MAE, 2), round(MSE, 2), round(RMSE, 2), round(r2, 2), round(adj_r2, 2))

In [185]:
def performance(model, X_train, X_test, y_train, y_test):
    '''This function takes model as input and return different evaluation metrics' score as dataframe'''

    # Doing prediction
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Calling the function regression_results
    train = regression_results(y_train, y_train_pred, X_train)
    test = regression_results(y_test, y_test_pred, X_test)

    # Storing the scores
    score = {'Metric': ['MAE', 'MSE', 'RMSE', 'r2', "adj_r2"],
             'Train Score': [train[0], train[1], train[2], train[3], train[4]],
             "Test Score": [test[0], test[1], test[2], test[3], test[4]]}

    # Create DataFrame
    df = pd.DataFrame(score)
    df.name = model
    return (df)

In [186]:
def feature_importance_linear(model, independent_var):
    '''This function takes instance of linear model as input and and return feature importance graph '''
    # Store the coefficients of the model into a dataframe
    coefs = pd.DataFrame(
        model.coef_,
        columns=['Coefficients'], index=independent_var)
    # Plot importance graph
    coefs.plot(kind='barh', figsize=(9, 7))
    plt.title('Model')
    plt.axvline(x=0, color='.5')
    plt.subplots_adjust(left=.3)

    plt.show()

In [187]:
def feature_importance_ensemble(model, independent_var):
    '''This function takes instance of ensemble model as input and returns the feature importance'''
    # Listing features
    features = independent_var
    # storing feature importance
    importances = model.feature_importances_
    indices = np.argsort(importances)
    # Plot the feature importance
    plt.title('Feature Importance')
    plt.barh(range(len(indices)),
             importances[indices], color='red', align='center')
    plt.yticks(range(len(indices)), [features[i] for i in indices])
    plt.xlabel('Relative Importance')

    plt.show()

### Splitting data into train and test

In [188]:
# checking for null values
train_df.isnull().sum()[train_df.isnull().sum() > 0]

Series([], dtype: int64)

In [189]:
train_df[cat_variables]

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,X8
0,k,v,at,a,d,u,j,o
1,k,t,av,e,d,y,l,o
2,az,w,n,c,d,x,j,x
3,az,t,n,f,d,x,l,e
4,az,v,n,f,d,h,d,n
...,...,...,...,...,...,...,...,...
4204,ak,s,as,c,d,aa,d,q
4205,j,o,t,d,d,aa,h,h
4206,ak,v,r,a,d,aa,g,e
4207,al,r,e,f,d,aa,l,u


In [190]:
train_df = pd.get_dummies(data=train_df, drop_first=True)

In [191]:
test_df = pd.get_dummies(data=test_df, drop_first=True)

In [192]:
X = train_df.drop(["y", "ID"], axis=1)
y = train_df["y"].values

In [193]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0)

# Random Forest Regressor

In [194]:

from sklearn.ensemble import RandomForestRegressor

In [195]:
print(X_train.shape)
X_train.head()

(3367, 492)


Unnamed: 0,X10,X11,X12,X13,X14,X15,X16,X17,X18,X19,...,X8_p,X8_q,X8_r,X8_s,X8_t,X8_u,X8_v,X8_w,X8_x,X8_y
3540,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3748,0,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1287,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2856,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1380,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [196]:
y_train

array([101.9 , 114.08, 106.31, ..., 102.77, 113.94, 110.1 ])

In [197]:
print(X_test.shape)
X_test.head()

(842, 492)


Unnamed: 0,X10,X11,X12,X13,X14,X15,X16,X17,X18,X19,...,X8_p,X8_q,X8_r,X8_s,X8_t,X8_u,X8_v,X8_w,X8_x,X8_y
3431,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2131,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2680,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
195,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3032,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [198]:
# RandomForest Regressor
model_rf = RandomForestRegressor()
model_rf.fit(X_train, y_train)
y_pred = model_rf.predict(X_test)

In [199]:
print("Scores of Random Forest")
performance(model_rf, X_train, X_test, y_train, y_test)

Scores of Random Forest


  df.name = model


Unnamed: 0,Metric,Train Score,Test Score
0,MAE,2.28,5.82
1,MSE,12.7,104.74
2,RMSE,3.56,10.23
3,r2,0.92,0.43
4,adj_r2,0.9,-0.36


In [200]:
from sklearn.ensemble import RandomForestRegressor


# Extract feature importance scores for label encoded data
fi_df2 = pd.DataFrame({
    'feature': X.columns,
    'rf_importance': model_rf.feature_importances_
}).sort_values(by='rf_importance', ascending=False)

fi_df2

Unnamed: 0,feature,rf_importance
246,X314,0.396049
247,X315,0.066176
95,X118,0.056117
18,X29,0.025584
206,X263,0.022234
...,...,...
456,X5_y,0.000000
332,X0_g,0.000000
307,X0_ac,0.000000
170,X207,0.000000


In [201]:
selected_50_columns = fi_df2.iloc[:50, :]

In [202]:
X = X[selected_50_columns["feature"].values]

In [203]:
id_columns = test_df["ID"]

In [204]:
test_df = test_df[selected_50_columns["feature"].values]

In [205]:
# train_test_split with new selected colum
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.2, random_state=0)

In [206]:
# RandomForest Regressor
model_rf = RandomForestRegressor()
model_rf.fit(X_train, y_train)
y_pred = model_rf.predict(X_test)

In [207]:
print("Scores of Random Forest")
performance(model_rf, X_train, X_test, y_train, y_test)

Scores of Random Forest


  df.name = model


Unnamed: 0,Metric,Train Score,Test Score
0,MAE,2.28,5.89
1,MSE,12.2,85.88
2,RMSE,3.49,9.27
3,r2,0.92,0.47
4,adj_r2,0.91,0.47


### Trying to predict test dataset

In [208]:
test_pred = model_rf.predict(test_df)

In [209]:
test_df["y"] = test_pred

In [211]:
test_df["ID"] = id_columns

In [212]:
predicted_df = test_df[["ID", "y"]]

In [None]:
predicted_df

Unnamed: 0,ID,y
0,1,77.5206
1,2,91.9538
2,3,78.4345
3,4,78.2195
4,5,110.1608
...,...,...
4204,8410,107.4476
4205,8411,92.2216
4206,8413,89.2165
4207,8414,109.1032


In [None]:
predicted_df.to_csv("predicted_results.csv")