In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.linear_model import LinearRegression

df = pd.read_csv ('/content/drive/MyDrive/Colab Notebooks/ebw_data.csv')

In [2]:
X = df.drop(["Width", "Depth"], axis=1)
y = df[["Width", "Depth"]].copy()

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=0)

In [4]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [5]:
rf = RandomForestRegressor()

In [6]:
n_estimators = [int(x) for x in np.linspace(start=10, stop=100, num=3)]

max_depth = [int(x) for x in np.linspace(5, 80, num=4)]

max_depth.append(None)
min_samples_leaf = [1, 3]

param_grid = {
    'n_estimators': n_estimators,
    'max_depth': max_depth,
    'min_samples_leaf': min_samples_leaf
              }
param_grid

{'n_estimators': [10, 55, 100],
 'max_depth': [5, 30, 55, 80, None],
 'min_samples_leaf': [1, 3]}

In [7]:
rf_random = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, verbose=3)
rf_random.fit(X_train, y_train)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV 1/5] END max_depth=5, min_samples_leaf=1, n_estimators=10;, score=0.918 total time=   0.0s
[CV 2/5] END max_depth=5, min_samples_leaf=1, n_estimators=10;, score=0.915 total time=   0.0s
[CV 3/5] END max_depth=5, min_samples_leaf=1, n_estimators=10;, score=0.885 total time=   0.0s
[CV 4/5] END max_depth=5, min_samples_leaf=1, n_estimators=10;, score=0.938 total time=   0.0s
[CV 5/5] END max_depth=5, min_samples_leaf=1, n_estimators=10;, score=0.927 total time=   0.0s
[CV 1/5] END max_depth=5, min_samples_leaf=1, n_estimators=55;, score=0.915 total time=   0.1s
[CV 2/5] END max_depth=5, min_samples_leaf=1, n_estimators=55;, score=0.921 total time=   0.1s
[CV 3/5] END max_depth=5, min_samples_leaf=1, n_estimators=55;, score=0.905 total time=   0.1s
[CV 4/5] END max_depth=5, min_samples_leaf=1, n_estimators=55;, score=0.909 total time=   0.1s
[CV 5/5] END max_depth=5, min_samples_leaf=1, n_estimators=55;, score=0.948 total t

In [8]:
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

m = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
m

0.07258246627952546

In [11]:
np.array(y_pred.mean(axis=0))

array([1.96805832, 1.20324185])

In [12]:
import pickle
filename = 'finalized_model.sav'
pickle.dump(rf_random, open(filename, 'wb'))

In [13]:
loaded_model = pickle.load(open(filename, 'rb'))
loaded_model.predict(X_test)

array([[1.9789009 , 0.96647697],
       [1.9789009 , 0.96647697],
       [1.77869521, 1.22402294],
       [1.75382893, 1.11798585],
       [2.52094212, 1.50910138],
       [1.77869521, 1.22402294],
       [2.49638706, 1.69240651],
       [1.77869521, 1.22402294],
       [2.09972095, 0.83645976],
       [1.76887753, 1.28214691],
       [1.77869521, 1.22402294]])

In [14]:
np.array(y_pred)

array([[1.97429   , 0.95981667],
       [1.97429   , 0.95981667],
       [1.78165727, 1.21807539],
       [1.70687306, 1.09583413],
       [2.52035558, 1.5087785 ],
       [1.78165727, 1.21807539],
       [2.49391905, 1.70155714],
       [1.78165727, 1.21807539],
       [2.09075143, 0.83465381],
       [1.76153333, 1.3029019 ],
       [1.78165727, 1.21807539]])

In [15]:
import joblib

joblib.dump(rf_random.best_estimator_, 'final_test.pkl', compress = 1)

['final_test.pkl']

In [16]:
joblib.load("final_test.pkl")