In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

: 

#### Load the dataset: California housing data

In [3]:
from sklearn.datasets import fetch_california_housing

In [4]:
# Fetch the California housing dataset; the returned object is inspected in the
# following cells via keys(), DESCR, data, target and feature_names.
california = fetch_california_housing()

In [None]:
california.keys()

In [None]:
print(california.DESCR)

In [None]:
california.data

In [None]:
california.target

In [None]:
california.feature_names

### Prepare the data

In [10]:
# Wrap the raw feature matrix in a DataFrame, labelling each column with its feature name.
dataset = pd.DataFrame(
    california.data,
    columns=california.feature_names,
)

In [None]:
dataset.head()

In [12]:
# Add the regression target to the feature frame as a 'price' column so that
# features and target live in one DataFrame for the analysis below.
dataset['price'] = california.target

In [None]:
dataset.head()

In [None]:
dataset.head()

### Analysis:

In [None]:
dataset.info()

### Summarizing the stats of the dataset

In [None]:
dataset.describe()

### Check the missing values

In [None]:
dataset.isnull()

In [None]:
dataset.isnull().sum()

### Exploratory Data Analysis:
#### Correlation

In [None]:
# Pairwise correlation matrix over every column of `dataset` (the features and the 'price' target).
# Use it to see how strongly each feature correlates with the target, and to spot
# strongly correlated feature pairs (multicollinearity).
dataset.corr()

In [None]:
# Median income vs. price; axes are labelled so the figure can be read on its own.
plt.scatter(dataset['MedInc'], dataset['price'])
plt.xlabel('MedInc')
plt.ylabel('Price')

In [None]:
# House age vs. price; axes are labelled so the figure can be read on its own.
plt.scatter(dataset['HouseAge'], dataset['price'])
plt.xlabel('HouseAge')
plt.ylabel('Price')

In [None]:
# Average rooms vs. price; axes are labelled so the figure can be read on its own.
plt.scatter(dataset['AveRooms'], dataset['price'])
plt.xlabel('AveRooms')
plt.ylabel('Price')

In [None]:
# Average bedrooms vs. price; axes are labelled so the figure can be read on its own.
plt.scatter(dataset['AveBedrms'], dataset['price'])
plt.xlabel('AveBedrms')
plt.ylabel('Price')

In [None]:
plt.scatter(dataset['Population'], dataset['price'])
plt.xlabel('Population')
plt.ylabel('Price')

In [None]:
# NOTE(review): this repeats the AveBedrms-vs-price scatter already plotted above;
# possibly a different feature was intended here — confirm or remove this cell.
plt.scatter(dataset['AveBedrms'], dataset['price'])

In [None]:
import seaborn as sns
sns.regplot(x='MedInc', y='price', data=dataset)

In [None]:
sns.regplot(x='MedInc', y='price', data=dataset)

In [None]:
# Dependent and Independent variables

dataset

In [36]:
# Separate the predictors (X) from the target (y).
# Select by column name rather than by position: a positional split would silently
# pick the wrong target if another column were ever added after 'price'.
X = dataset.drop(columns=['price'])
y = dataset['price']

In [None]:
X.head()

In [None]:
y.head()

In [39]:
## Hold out 30% of the rows for testing; the fixed random_state keeps the split reproducible
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
X_train.head()

In [None]:
y_train.head()

In [None]:
X_test.head()

In [None]:
y_test.head()

In [None]:
X_train.shape

In [None]:
X_test.shape

### Standardize the dataset

In [46]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [47]:
# Fit the scaler on the training split only and replace X_train with its scaled version.
# NOTE: X_train is rebound to its own transformed value, so re-running this cell would
# refit the scaler on already-scaled data; re-run from the train/test split cell instead.
X_train = scaler.fit_transform(X_train)

In [48]:
# Scale the test split with the scaler already fitted on the training split
# (transform only, no refit), so no test-set statistics leak into the scaling.
# NOTE: X_test is rebound here, so re-running this cell alone would scale it twice.
X_test = scaler.transform(X_test)

In [None]:
import pickle

# Persist the fitted scaler so the same scaling can be applied at prediction time.
# The `with` block guarantees the file is flushed and closed, even if dump() raises.
with open('scaling.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [None]:
X_train

### Model Building

In [50]:
from sklearn.linear_model import LinearRegression

In [51]:
regression = LinearRegression()

In [None]:
regression.fit(X_train, y_train)

In [None]:
### Print the coefficients
print(regression.coef_)
# the coefficients are the weights of the features

In [None]:
print(regression.intercept_)

In [None]:
## get_params() returns the estimator's constructor settings (hyperparameters),
## not the features the model was trained on
regression.get_params()

In [56]:
## predict the values
y_pred = regression.predict(X_test)

In [None]:
y_pred

In [None]:
# Scatter plot of actual vs. predicted values; points close to the diagonal
# indicate good predictions. Axes are labelled so the figure stands alone.
plt.scatter(y_test, y_pred)
plt.xlabel('Actual price')
plt.ylabel('Predicted price')

In [59]:
# prediction with respect to the residuals
residuals = y_test - y_pred

In [None]:
residuals

In [None]:
## plot the residuals

sns.displot(residuals, kind='kde')

In [None]:
# Ideally the residuals are approximately normally distributed; here the distribution shows some outliers.

In [None]:
# Scatter plot of the residuals against the predicted values.
# For a well-behaved model the residuals should be spread uniformly, with no visible pattern.
plt.scatter(y_pred, residuals)
plt.xlabel('Predicted price')
plt.ylabel('Residual')

In [None]:
# Quantify how well the model does on the test split with standard regression metrics.

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)  # computed once, reused for the RMSE below
print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))
print('R2 Score:', r2_score(y_test, y_pred))

In [65]:
# The closer the R2 score is to 1, the better the model explains the target.
# Store the score under its own name: rebinding `r2_score` would shadow the imported
# sklearn function, so re-running this cell (or the metrics cell) would fail with
# "'float' object is not callable".
r2 = r2_score(y_test, y_pred)

# Adjusted R2 penalises R2 for the number of predictors p given n samples:
#   1 - (1 - R2) * (n - 1) / (n - p - 1)
n_samples = len(y_test)
n_features = X_test.shape[1]
adj_r2 = 1 - (1 - r2) * (n_samples - 1) / (n_samples - n_features - 1)

In [None]:
adj_r2

### New Data Prediction:

In [None]:
## transformation of new data

# The scaler was fitted on a DataFrame, so pass the new sample as a one-row DataFrame
# with the same column names; a bare ndarray makes scikit-learn warn that
# "X does not have valid feature names".
new_data = pd.DataFrame(california.data[0].reshape(1, -1), columns=california.feature_names)
scaler.transform(new_data)

In [None]:


# Reshape the first sample from a 1D array to a single-row 2D array (estimators expect
# 2D input), and label the columns so the scaler, which was fitted on a DataFrame,
# receives matching feature names instead of warning about a bare ndarray.
new_data = pd.DataFrame(california.data[0].reshape(1, -1), columns=california.feature_names)
regression.predict(scaler.transform(new_data))

### Pickling the model

In [74]:
import pickle

In [75]:
# Persist the trained regression model.
# The `with` block guarantees the file is flushed and closed, even if dump() raises.
with open('regmodel.pkl', 'wb') as model_file:
    pickle.dump(regression, model_file)

In [76]:
# load the model
# NOTE: pickle.load can execute arbitrary code; only load pickle files you created or trust.
# The `with` block closes the file handle as soon as the model has been read.
with open('regmodel.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

In [None]:
# prediction with the loaded model
# Pass the sample as a one-row DataFrame with the training column names so the scaler,
# which was fitted on a DataFrame, does not warn about missing feature names.
new_data = pd.DataFrame(california.data[0].reshape(1, -1), columns=california.feature_names)
model.predict(scaler.transform(new_data))