In [None]:
# With the California Housing dataset, we perform a multiple linear
# regression to make more sophisticated housing price predictions.

# Loading the dataset
from sklearn.datasets import fetch_california_housing

# The returned object's .data, .target and .feature_names attributes are
# used throughout the rest of the notebook.
california = fetch_california_housing()

# Uncomment to display the dataset description
#print(california.DESCR)

# Confirming the number of samples and features. A notebook cell only echoes
# its last expression, so the intermediate shapes are printed explicitly.
print(california.data.shape)
print(california.target.shape)

california.feature_names

In [None]:
# Exploring the data with pandas
import pandas as pd

# Use fully qualified option names. In recent pandas releases the bare
# "precision" and "max_columns" patterns also match styler.* options, which
# makes set_option raise OptionError ("Pattern matched multiple keys").
pd.set_option("display.precision", 4)
pd.set_option("display.max_columns", 9)
pd.set_option("display.width", None)

california_df = pd.DataFrame(california.data, columns=california.feature_names)

# Append the regression target as an additional column
california_df["MedHouseValue"] = pd.Series(california.target)

# head() is not the cell's last expression, so it has to be displayed
# explicitly (display is provided by the IPython/Jupyter kernel).
display(california_df.head())

california_df.describe()

In [None]:
# Visualizing the features
import matplotlib.pyplot as plt
import seaborn as sns

# Plot a 10% random sample of the rows; the seed is fixed for repeatability
sample_df = california_df.sample(frac=0.1, random_state=17)

sns.set(font_scale=2)
sns.set_style("whitegrid")

# One 16x9 scatter plot per feature: the feature on x, MedHouseValue on y,
# with the points colored by MedHouseValue.
for feature in california.feature_names:
    feature_fig, feature_ax = plt.subplots(figsize=(16, 9))
    sns.scatterplot(
        data=sample_df,
        x=feature,
        y="MedHouseValue",
        hue="MedHouseValue",
        palette="cool",
        legend=False,
        ax=feature_ax,
    )


In [None]:
# Splitting the data for training and testing
from sklearn.model_selection import train_test_split

# random_state is fixed so the split is repeatable across runs
X_train, X_test, y_train, y_test = train_test_split(
    california.data, california.target, random_state=11
)

# These shapes are not the cell's last expression, so a notebook would
# silently drop them; print them explicitly.
print(X_train.shape)
print(X_test.shape)

# Training the model
from sklearn.linear_model import LinearRegression

linear_regression = LinearRegression()
linear_regression.fit(X=X_train, y=y_train)

# Report each feature name (right-aligned to 10 characters) with the
# coefficient the fitted model assigned to it.
for name, coefficient in zip(california.feature_names, linear_regression.coef_):
    print(f'{name:>10}:{coefficient}')



In [None]:
# Testing the model
predicted = linear_regression.predict(X_test)
expected = y_test

# Preview the first five predictions and the first five expected values.
# Neither is the cell's last expression, so they are printed explicitly.
print(predicted[:5])
print(expected[:5])

# Visualizing the expected vs. predicted prices
df = pd.DataFrame({
    'Expected': pd.Series(expected),
    'Predicted': pd.Series(predicted),
})

figure = plt.figure(figsize=(9, 9))

# Scatter the predictions against the expected values, colored by prediction
axes = sns.scatterplot(
    data=df,
    x='Expected',
    y='Predicted',
    hue='Predicted',
    palette='cool',
    legend=False,
)


# Use one shared range for the x- and y-axes so both are drawn to the same
# scale and the reference line below spans the plot from corner to corner.
start = min(expected.min(), predicted.min())
end = max(expected.max(), predicted.max())
axes.set(xlim=(start, end), ylim=(start, end))

# 'k--' is matplotlib's format string for a black dashed line; points on this
# y = x line are predictions that exactly match the expected value.
line = axes.plot([start, end], [start, end], 'k--')


In [None]:
# Regression model metrics
from sklearn import metrics

# Neither score is the cell's last expression, so a notebook would compute
# and discard them; print them explicitly instead.
print('R2 score:', metrics.r2_score(expected, predicted))
print('Mean squared error:', metrics.mean_squared_error(expected, predicted))

# Choosing the best model
from sklearn.linear_model import ElasticNet, Lasso, Ridge
from sklearn.model_selection import KFold, cross_val_score

# Candidate estimators keyed by the label used in the report below
estimators = {
    'LinearRegression': linear_regression,
    'ElasticNet': ElasticNet(),
    'Lasso': Lasso(),
    'Ridge': Ridge(),
}

# 10-fold shuffled splitter with a fixed seed; the configuration is the same
# for every estimator, so it is built once and reused.
kfold = KFold(n_splits=10, random_state=11, shuffle=True)

# Cross-validate each estimator on the full dataset and report its mean score
for label, model in estimators.items():
    scores = cross_val_score(
        estimator=model, X=california.data, y=california.target, cv=kfold
    )
    print(f'{label:>16}: Mean of r2 scores = {scores.mean():.3%}')