In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler

### Loading, cleaning, and preprocessing the data for supervised machine learning (SML)

In [2]:
# Load the raw training table; the path is resolved relative to the
# notebook's working directory.
train_csv_path = 'train.csv'
df = pd.read_csv(train_csv_path)

FileNotFoundError: [Errno 2] File train.csv does not exist: 'train.csv'

In [None]:
# Impute missing numeric values with each column's mean.
# numeric_only=True is required on pandas >= 2.0, where DataFrame.mean()
# raises a TypeError if the frame still contains string columns; older
# versions silently skipped those columns, which is the behavior kept here.
df = df.fillna(df.mean(numeric_only=True))

In [None]:
# Discard every column that still contains at least one missing value
# (how="any" is the pandas default, spelled out here for the reader).
df.dropna(axis="columns", how="any", inplace=True)

In [None]:
#df

In [None]:
# Remove the "Id" column from the frame in place.
df.drop(columns=["Id"], inplace=True)

In [None]:
#df

#### Encoding string (categorical) columns as numbers for machine learning

In [None]:
# Label-encode only the non-numeric feature columns.
# Encoding every column (as before) also replaced genuine numeric features
# with their rank index, throwing away the real magnitudes that the linear
# models further down rely on.
columns_to_be_encoded = list(
    df.drop(columns=["SalePrice"]).select_dtypes(exclude="number").columns
)
for column in columns_to_be_encoded:
    # A fresh encoder per column: each column gets its own label -> integer map.
    df[column] = preprocessing.LabelEncoder().fit_transform(df[column])

In [None]:
#df

In [None]:
#list(zip(df.dtypes.index,df.dtypes.values))

###  Splitting data into train and test

In [None]:
# Target vector: the SalePrice column.
y_df = df["SalePrice"]
# Feature matrix: every remaining column.
X_df = df.drop(columns=["SalePrice"])

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
split_parts = train_test_split(X_df, y_df, test_size=0.20, random_state=42)
X_train, X_test, y_train, y_test = split_parts

####  Different Applications of Supervised Machine Learning Models:

#####  1.a  RandomForestRegressor

In [None]:
# Seed the forest so that scores, feature importances and the downstream
# feature selection are reproducible across kernel restarts.
reg = RandomForestRegressor(random_state=42)

In [None]:
# Display the feature names the model will be trained on.
X_train.columns.tolist()

In [None]:
# Train the forest on the training split; the fitted estimator is the cell output.
reg.fit(X=X_train, y=y_train)

In [None]:
# Predict on the held-out rows and keep the result as a plain Python list.
test_set_predictions = reg.predict(X_test)
predictions = list(test_set_predictions)

In [None]:
# For a regressor, .score() reports the coefficient of determination (R^2)
# of the predictions against the given targets.
testing_score = reg.score(X_test, y_test)
training_score = reg.score(X_train, y_train)

In [None]:
# Report both scores, training first, one per line.
for split_label, split_score in (("Training", training_score), ("Testing", testing_score)):
    print(f"{split_label} Score: {split_score}")

In [None]:
# Standardize the features using statistics from the training split only.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit a separate forest on the scaled features and report ITS scores.
# Previously this cell re-printed training_score / testing_score from the
# unscaled model, so the scaled data was never actually evaluated.
# A new name is used so that `reg` (needed by the feature-selection cells)
# is left untouched.
reg_scaled = RandomForestRegressor(random_state=42).fit(X_train_scaled, y_train)
print(f"Training Score: {reg_scaled.score(X_train_scaled, y_train)}")
print(f"Testing Score: {reg_scaled.score(X_test_scaled, y_test)}")

#### 1.b Feature selection applied to improve RandomForestRegressor

In [None]:
import matplotlib.pyplot as plt

# Overlay the model's predictions (markers) on the actual test targets (line),
# save the figure to disk, then display it.
fig, ax = plt.subplots(figsize=(16, 3))
ax.plot(list(y_test))
ax.plot(predictions, 'o')
ax.legend(["actual value", "model's pred"], loc='upper right')
fig.savefig("../images/RandForestPred.png")
plt.show()

In [None]:
# Importances of the fitted forest, largest first. Sorting discards the
# link between a value and its feature name.
features = sorted(reg.feature_importances_, reverse=True)


In [None]:
# Bar chart of the sorted importances; the x position is simply the rank.
rank_positions = range(len(features))
plt.bar(rank_positions, features)
plt.show()

In [None]:
from sklearn.feature_selection import SelectFromModel

# Wrap the forest in a model-based feature selector; the bare name on the
# last line displays the selector as the cell output.
sel = SelectFromModel(estimator=reg)
sel

In [None]:
# Fit the selector on the training split and display the boolean mask of
# retained features (fit() returns the selector itself, so the calls chain).
sel.fit(X=X_train, y=y_train).get_support()

In [None]:
# Count how many features the already-fitted selector keeps.
# The selector is fitted in the preceding cell; refitting it here only
# repeated the training and, when the forest is unseeded, could change the
# selection that was just displayed. Summing the boolean mask replaces the
# manual `== True` counting loop.
count = int(sel.get_support().sum())
count

In [None]:
# Reuse the existing train/test split instead of drawing a new one.
# The selector was fitted on X_train; re-splitting X_df with a different seed
# and test size put rows the selector had already seen into the test set
# (leakage) and made the scores incomparable with the full-feature model.
# y_train / y_test are left as they are, so they stay aligned with these rows.
X_selected_train = sel.transform(X_train)
X_selected_test = sel.transform(X_test)

# Standardize using statistics from the training rows only.
scaler = StandardScaler().fit(X_selected_train)
X_selected_train_scaled = scaler.transform(X_selected_train)
X_selected_test_scaled = scaler.transform(X_selected_test)


In [None]:
# Train a forest on the selected, standardized features.
# The seed keeps the reported scores, and the model saved to disk later in
# the notebook, reproducible across kernel restarts.
reg_sel = RandomForestRegressor(random_state=42).fit(X_selected_train_scaled, y_train)
print(f'Training Score: {reg_sel.score(X_selected_train_scaled, y_train)}')
print(f'Testing Score: {reg_sel.score(X_selected_test_scaled, y_test)}')

In [None]:
# Importances of the reduced model, largest first; the bare name on the last
# line displays the list as the cell output.
features = sorted(reg_sel.feature_importances_, reverse=True)
features

In [None]:
# Bar chart of the reduced model's sorted importances, indexed by rank.
rank_positions = range(len(features))
plt.bar(rank_positions, features)
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Predict with the reduced model, then overlay the predictions (markers) on
# the actual test targets (line), save the figure and display it.
selected_model_predictions = reg_sel.predict(X_selected_test_scaled)
predictions = list(selected_model_predictions)
fig, ax = plt.subplots(figsize=(16, 3))
ax.plot(list(y_test))
ax.plot(predictions, 'o')
ax.legend(["actual value", "model's pred"], loc='upper right')
fig.savefig("../images/RandForestPredFeature.png")
plt.show()

In [None]:
import joblib

In [None]:
# Persist the reduced forest next to the notebook; the list of written
# file names returned by joblib.dump is the cell output.
joblib.dump(value=reg_sel, filename="trained_rand_forest_selected.joblib")

### 2. a. Linear Regression model before scaling:

In [None]:
# Recreate the 80/20 split of the full feature matrix with the same seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_df,
    y_df,
    test_size=0.20,
    random_state=42,
)

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least squares on the unscaled features.
reg = LinearRegression()
reg.fit(X_train, y_train)
print('Model: Linear Regression')
print(f'Score: {reg.score(X_test, y_test)}\n')

# One bar per fitted coefficient, in feature-column order.
coefficient_positions = np.arange(len(reg.coef_))
plt.bar(coefficient_positions, reg.coef_)
plt.title('Linear Regression coefficient plot')
plt.savefig("../images/LinearRegCoef.png")
plt.show()

#### 2. b. Linear Regression model on scaled features (testing for improvement):

In [None]:
# Split first, then fit the scaler on the training rows only, so that
# test-set statistics never leak into the features the model is trained on.
# (Previously the scaler was fitted on the whole of X_df before splitting.)
X_train, X_test, y_train, y_test = train_test_split(X_df, y_df, test_size=0.2, random_state=42)
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Kept so the name stays available: the full matrix, scaled with the
# training-split statistics.
X_df_scaled = scaler.transform(X_df)

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least squares on the standardized features.
reg = LinearRegression()
reg.fit(X_train_scaled, y_train)
print('Model: Linear Regression Scaled')
print(f'Score: {reg.score(X_test_scaled, y_test)}\n')

# One bar per fitted coefficient, in feature-column order.
coefficient_positions = np.arange(len(reg.coef_))
plt.bar(coefficient_positions, reg.coef_)
plt.title('Linear Regression Scaled coefficient plot')
plt.savefig("../images/LinearRegCoefScaled.png")
plt.show()

### 2. c. LASSO Regression model:

In [None]:
from sklearn.linear_model import Lasso

# Fit on the standardized features. The L1 penalty acts on raw coefficient
# magnitudes, so with unscaled inputs it penalizes features unevenly
# depending on their units and makes coordinate descent slower to converge.
reg = Lasso(max_iter=2000).fit(X_train_scaled, y_train)
print('Model: LASSO Regression')
print(f'Score: {reg.score(X_test_scaled, y_test)}\n')

# One bar per fitted coefficient, in feature-column order.
plt.bar(np.arange(len(reg.coef_)), reg.coef_)
plt.title('LASSO coefficient plot')
plt.savefig("../images/LassoRegCoef.png")
plt.show()

### 2. d. Ridge Regression model:

In [None]:
from sklearn.linear_model import Ridge

# Fit on the standardized features. The L2 penalty acts on raw coefficient
# magnitudes, so with unscaled inputs the amount of shrinkage each feature
# receives depends on its units rather than on its usefulness.
reg = Ridge().fit(X_train_scaled, y_train)
print('Model: Ridge Regression')
print(f'Score: {reg.score(X_test_scaled, y_test)}\n')

# One bar per fitted coefficient, in feature-column order.
plt.bar(np.arange(len(reg.coef_)), reg.coef_)
plt.title('Ridge coefficient plot')
plt.savefig("../images/RidgeRegCoef.png")
plt.show()

### 2. e. ElasticNet Regression model:

In [None]:
from sklearn.linear_model import ElasticNet

# Fit on the standardized features. The combined L1/L2 penalty acts on raw
# coefficient magnitudes, so with unscaled inputs it penalizes features
# unevenly depending on their units.
reg = ElasticNet(max_iter=2000).fit(X_train_scaled, y_train)
print('Model: ElasticNet Regression')
print(f'Score: {reg.score(X_test_scaled, y_test)}\n')

# One bar per fitted coefficient, in feature-column order.
plt.bar(np.arange(len(reg.coef_)), reg.coef_)
plt.title('ElasticNet coefficient plot')
plt.savefig("../images/ElasticNetCoef.png")
plt.show()