In [1]:
# Import dependencies:

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [2]:
# Read csv file consisting of cleaned data and store it in a dataframe:

data = "Machine_Learning/Data/cleaned_data.csv"
df = pd.read_csv(data)
df.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_dmConLI,SaleType_dmConLw,SaleType_dmNew,SaleType_dmOth,SaleType_dmWD,SaleCondition_dmAdjLand,SaleCondition_dmAlloca,SaleCondition_dmFamily,SaleCondition_dmNormal,SaleCondition_dmPartial
0,60,65.0,8450,7,5,2003,2003,196.0,706,0,...,0,0,0,0,1,0,0,0,1,0
1,20,80.0,9600,6,8,1976,1976,0.0,978,0,...,0,0,0,0,1,0,0,0,1,0
2,60,68.0,11250,7,5,2001,2002,162.0,486,0,...,0,0,0,0,1,0,0,0,1,0
3,70,60.0,9550,7,5,1915,1970,0.0,216,0,...,0,0,0,0,1,0,0,0,0,0
4,60,84.0,14260,8,5,2000,2000,350.0,655,0,...,0,0,0,0,1,0,0,0,1,0


In [3]:
# Keep only the features that we want to use:

df = df[['OverallQual',
        'GrLivArea',
        '2ndFlrSF',
        'TotalBsmtSF',
        '1stFlrSF',
        'LotArea',
        'YearBuilt',
        'GarageCars',
        'SalePrice']]

df.head()

Unnamed: 0,OverallQual,GrLivArea,2ndFlrSF,TotalBsmtSF,1stFlrSF,LotArea,YearBuilt,GarageCars,SalePrice
0,7,1710,854,856,856,8450,2003,2,208500
1,6,1262,0,1262,1262,9600,1976,2,181500
2,7,1786,866,920,920,11250,2001,2,223500
3,7,1717,756,756,961,9550,1915,3,140000
4,8,2198,1053,1145,1145,14260,2000,3,250000


In [4]:
# Extract the target variable from dataset:

X = df.drop(columns = ["SalePrice"])
Y = df["SalePrice"].values.reshape(-1, 1)

In [5]:
# Split data into train and test datasets:

X_train, X_test, y_train, y_test = train_test_split(X,Y,random_state=1)

In [14]:
from sklearn.preprocessing import StandardScaler

# Create a StandardScater model and fit it to the training data

### BEGIN SOLUTION
X_scaler = StandardScaler().fit(X_train)
y_scaler = StandardScaler().fit(y_train)

joblib.dump(X_scaler,"X_scaler.joblib")
joblib.dump(y_scaler,"y_scaler.joblib")



### END SOLUTION

['y_scaler.joblib']

In [7]:
# Transform the training and testing data using the X_scaler and y_scaler models

### BEGIN SOLUTION
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)
y_train_scaled = y_scaler.transform(y_train)
y_test_scaled = y_scaler.transform(y_test)
### END SOLUTION

In [8]:
# Create and fit linear regression model to train dataset:

model = RandomForestRegressor()
model.fit(X_train_scaled, y_train_scaled)

  after removing the cwd from sys.path.


RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [9]:
# Make predictions using the model:

predictions = model.predict(X_test_scaled)

In [10]:
# Determine mean_squared_error and model accuracy score:

MSE = mean_squared_error(y_test_scaled, predictions)
r2 = model.score(X_test_scaled, y_test_scaled)

In [11]:
# Print mean squared error and model accuracy score:

print(f"MSE: {MSE}, R2: {r2}")

MSE: 0.14183508902390055, R2: 0.8492745145260413


In [12]:
# predictions = model.predict(X_train_scaled)

predictions = predictions.reshape(-1, 1)
# predictions

predictions.shape

(274, 1)

In [13]:
# Save model as a pkl file:

import joblib
joblib.dump(model,"model.joblib")

['model.joblib']

# Hyperparameter Tuning

In [None]:
model_2 = RandomForestRegressor(max_leaf_nodes=100, random_state=1)
model_2.fit(X_train_scaled, y_train_scaled)
predictions = model_2.predict(X_test_scaled)

MSE = mean_squared_error(y_test_scaled, predictions)
r2 = model_2.score(X_test_scaled, y_test_scaled)
print(f"MSE: {MSE} r2: {r2}")

In [None]:
model_3 = RandomForestRegressor(max_leaf_nodes=27000, random_state=1)
model_3.fit(X_train_scaled, y_train_scaled)
predictions = model_3.predict(X_test_scaled)

MSE = mean_squared_error(y_test_scaled, predictions)
r2 = model_3.score(X_test_scaled, y_test_scaled)
print(f"MSE: {MSE} r2: {r2}")

In [None]:
from sklearn.model_selection import GridSearchCV
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}
# Create a based model
rf = RandomForestRegressor()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

In [None]:
# Fit the grid search to the data
grid_search.fit(X_train_scaled, y_train_scaled)
grid_search.best_params_
{'bootstrap': True,
 'max_depth': 80,
 'max_features': 3,
 'min_samples_leaf': 5,
 'min_samples_split': 12,
 'n_estimators': 100}
best_grid = grid_search.best_estimator_
grid_accuracy = evaluate(best_grid, test_features, test_labels)
# Model Performance

print('Improvement of {:0.2f}%.'.format( 100 * (grid_accuracy - base_accuracy) / base_accuracy))