In [None]:
import os
import sys
import warnings
warnings.filterwarnings('ignore')

## Import required modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
## Load the data
housingData = pd.read_csv('housing.csv')

# Show a caption, a blank separator line, and then the first rows of the frame.
print('Print first few rows of this data - ', '', housingData.head(), sep='\n')

# Separate the target column from the remaining columns. Selecting with a
# list keeps y two-dimensional (one column).
targetColumn = ['median_house_value']
X = housingData.drop(columns=targetColumn).values
y = housingData[targetColumn].values

In [None]:
## Handle missing values
# `Imputer` is no longer available in sklearn.preprocessing on current
# scikit-learn releases; `SimpleImputer` (sklearn.impute) is its replacement
# and, like `Imputer`, fills missing entries with the column mean by default.
# The fallback keeps the cell working on old installations that predate
# sklearn.impute.
# The stderr redirection that used to wrap this import is not carried over:
# it left an open handle to os.devnull and reset sys.stderr to
# sys.__stderr__, which is not the stream a notebook kernel installs.
try:
    from sklearn.impute import SimpleImputer
except ImportError:
    from sklearn.preprocessing import Imputer as SimpleImputer
missingValueImputer = SimpleImputer()

# ocean_proximity is not considered as it is categorical data
# X also holds that non-numeric column, so the numeric slice is cast to float
# explicitly before the column means are computed.
X[:, :-1] = missingValueImputer.fit_transform(X[:, :-1].astype(float))

# The same imputer object is refit on the target here, so after this cell it
# holds the statistics of y, not of the feature columns.
y = missingValueImputer.fit_transform(y)

In [None]:
## Encode categorical data
from sklearn.preprocessing import LabelEncoder

# Learn the distinct categories in the last column of X, then overwrite that
# column with their integer codes.
# NOTE(review): integer codes impose an ordering on the categories; confirm
# that is acceptable for the models fitted later.
X_labelencoder = LabelEncoder()
categoryColumn = X[:, -1]
X_labelencoder.fit(categoryColumn)
X[:, -1] = X_labelencoder.transform(categoryColumn)

In [None]:
## Split the dataset
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable across runs.
splitOptions = {'test_size': 0.2, 'random_state': 0}
splitArrays = train_test_split(X, y, **splitOptions)
X_train, X_test, y_train, y_test = splitArrays

In [None]:
## Standardize data
from sklearn.preprocessing import StandardScaler

# Features and target each get their own scaler. Refitting a single scaler on
# the target would discard the feature means/variances, making it impossible
# to scale new feature rows the same way afterwards.
# Each scaler is fit on the training split only and then applied to the test
# split.
X_scaler = StandardScaler()
X_train = X_scaler.fit_transform(X_train)
X_test = X_scaler.transform(X_test)

y_scaler = StandardScaler()
y_train = y_scaler.fit_transform(y_train)
y_test = y_scaler.transform(y_test)

# Keep the name `scaler` bound to the target-fitted scaler so any code that
# relies on that name (e.g. scaler.inverse_transform on predictions) still
# sees a scaler fit on y_train.
scaler = y_scaler

In [None]:
## Perform Linear Regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Fit an ordinary least-squares model on all training features.
linearRegression = LinearRegression()
linearRegression.fit(X_train, y_train)

# Predict output for test dataset using this model
predictionLinear = linearRegression.predict(X_test)

# Print root mean squared error (RMSE) from Linear Regression
# mean_squared_error returns the MSE, so the square root is taken to report
# the RMSE that the message announces. mseLinear is kept for any code that
# reads it. Both values are in the units of y_test as passed in.
mseLinear = mean_squared_error(y_test, predictionLinear)
rmseLinear = np.sqrt(mseLinear)
print('Root mean squared error (RMSE) from Linear Regression = ')
print(rmseLinear)

In [None]:
## Perform Decision Tree Regression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

# A fixed random_state makes the fitted tree, and therefore the reported
# error, reproducible from run to run (0 matches the seed used for the
# train/test split).
DTregressor = DecisionTreeRegressor(random_state=0)
DTregressor.fit(X_train, y_train)

# Predict output for test dataset using this model
predictionDT = DTregressor.predict(X_test)

# Print root mean squared error (RMSE) from Decision Tree Regression
# mean_squared_error returns the MSE, so the square root is taken to report
# the RMSE that the message announces. mseDT is kept for any code that
# reads it.
mseDT = mean_squared_error(y_test, predictionDT)
rmseDT = np.sqrt(mseDT)
print('Root mean squared error (RMSE) from Decision Tree Regression = ')
print(rmseDT)

In [None]:
## Perform Random Forest Regression
import contextlib
from sklearn.metrics import mean_squared_error

# Silence anything written to stderr while the ensemble module is imported.
# redirect_stderr puts back whatever stream was installed before (in a
# notebook that is the kernel's stream, not sys.__stderr__), and the `with`
# block closes the devnull handle instead of leaking it.
with open(os.devnull, "w") as devnull, contextlib.redirect_stderr(devnull):
    from sklearn.ensemble import RandomForestRegressor

# A fixed random_state makes the forest, and therefore the reported error,
# reproducible from run to run (0 matches the seed used for the split).
RFregressor = RandomForestRegressor(random_state=0)

# The forest expects a 1-D target; y_train is a single-column 2-D array, so
# it is flattened here rather than relying on an implicit conversion.
RFregressor.fit(X_train, y_train.ravel())

# Predict output for test dataset using this model
predictionRF = RFregressor.predict(X_test)

# Print root mean squared error (RMSE) from Random Forest Regression
# mean_squared_error returns the MSE, so the square root is taken to report
# the RMSE that the message announces. mseRF is kept for any code that
# reads it.
mseRF = mean_squared_error(y_test, predictionRF)
rmseRF = np.sqrt(mseRF)
print('Root mean squared error (RMSE) from Random Forest Regression = ')
print(rmseRF)

In [None]:
## Perform Linear Regression with one independent variable
from sklearn.linear_model import LinearRegression

# Column 7 of the feature matrix is the single predictor (the plots label it
# median_income). Indexing with a list keeps the result two-dimensional.
incomeColumn = [7]
X_train_median_income = X_train[:, incomeColumn]
X_test_median_income = X_test[:, incomeColumn]

# Fit housing value against that one feature
linearRegression2 = LinearRegression()
linearRegression2.fit(X_train_median_income, y_train)

# Predict output for test dataset using the fit model
predictionLinear2 = linearRegression2.predict(X_test_median_income)

# Draw one figure per split -- training first, then testing -- showing the
# observed points and the fitted line, to check whether the model fitted on
# the training data also suits the test data.
plotSpecs = (
    (X_train_median_income, y_train, 'green',
     'compare Training result - median_income / median_house_value'),
    (X_test_median_income, y_test, 'blue',
     'compare Testing result - median_income / median_house_value'),
)
for incomeValues, houseValues, pointColor, figureTitle in plotSpecs:
    plt.scatter(incomeValues, houseValues, color=pointColor)
    plt.plot(incomeValues, linearRegression2.predict(incomeValues), color='red')
    plt.title(figureTitle)
    plt.xlabel('median_income')
    plt.ylabel('median_house_value')
    plt.show()