In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns


### Load the California Housing dataset

In [None]:
from sklearn.datasets import fetch_california_housing
# Fetch the California Housing data; the result is kept in `housing` for the cells below.
housing = fetch_california_housing()

# Show what kind of object the loader returned.
type(housing)


In [None]:
# List the fields of the loaded bundle instead of echoing the whole object:
# the full repr repeats the description text and the arrays that are printed
# one by one in the following cells.
list(housing.keys())

In [None]:
# Print the description text that ships with the dataset bundle.
print(housing.DESCR)

In [None]:
# Names of the feature columns. The attribute is `feature_names` (plural) —
# the same one used to label the DataFrame columns further down; the singular
# spelling does not exist on the bundle and raises AttributeError.
print(housing.feature_names)

In [None]:
# Target vector (stored later as the 'Price' column of the DataFrame).
print(housing.target)

In [None]:
# Raw feature matrix (wrapped later in a DataFrame with named columns).
print(housing.data)


### prepare the data

In [None]:
# Wrap the raw feature matrix in a DataFrame, labelling the columns with the
# feature names provided by the dataset bundle.
dataset = pd.DataFrame(
    data=housing.data,
    columns=housing.feature_names,
)

In [None]:
# Confirm the type of the object built in the previous cell.
type(dataset)

In [None]:
# Peek at the first five rows of the feature table.
dataset.head(n=5)

In [None]:
# Peek at the last five rows of the feature table.
dataset.tail(n=5)

In [None]:
# Attach the regression target as a new 'Price' column; it is appended after
# the feature columns, i.e. it becomes the last column of the table.
dataset["Price"] = housing.target

In [None]:
# First five rows again, now including the 'Price' target column.
dataset.head(n=5)

In [None]:
# Per-column summary: names, non-null counts and dtypes.
dataset.info()

In [None]:
# Count the missing values in every column (all zeros means nothing to impute).
dataset.isna().sum()

### Statistical description of the data


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric column.
dataset.describe()

### EDA - Exploratory Data Analysis

--- 
- understand the data
- data preparation
- EDA (find the correlation coefficient)

In [None]:
# Pairwise Pearson correlation between all columns (the features and 'Price').
dataset.corr(method="pearson")

### observation
---
- AveRooms and AveBedrms = 0.847621 (strongly positively correlated)
- Latitude and Longitude = -0.924664 (strongly negatively correlated)

In [None]:
# Scatter-plot matrix of every pair of columns, with each column's own
# distribution on the diagonal.
sns.pairplot(data=dataset)

### boxplot - to detect outliers

In [None]:
# One box per column on a shared axis; points drawn beyond the whiskers are
# the outlier candidates. The figure is also written to disk.
fig, ax = plt.subplots(figsize=(7, 10))
sns.boxplot(ax=ax, data=dataset)
fig.savefig("boxplot.jpg")

### Split the data into dependent and independent features

In [None]:
# Separate the independent features from the dependent target by column name
# rather than by position, so the split stays correct even if another column
# is ever added after 'Price'.
X = dataset.drop(columns="Price")  # independent features
y = dataset["Price"]               # dependent target

In [None]:
# Feature matrix: every column of `dataset` except the target.
X

In [None]:
# Target vector: the 'Price' column.
y

In [None]:
# Hold out 30% of the rows as a test set; the fixed random_state makes the
# split reproducible from run to run.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Training features (the 70% of rows not held out by test_size=0.3).
X_train

In [None]:
# Test features (the 30% of rows held out by test_size=0.3).
X_test

In [None]:
# Training targets, row-aligned with X_train.
y_train

In [None]:
# Test targets, row-aligned with X_test.
y_test

### Normalization (z-score standardization) of the features

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Learn the scaling statistics from the training features only and apply them
# to those same features in one step.
X_train_norm = scaler.fit_transform(X_train)

In [None]:
# Scaled training features as returned by scaler.fit_transform.
X_train_norm

In [None]:
# Box plots of the standardized training features.
# The scaled data comes back from the scaler as a plain array without column
# labels, so it is re-labelled with the original feature names; otherwise the
# boxes on the x-axis are only numbered 0..7.
fig, ax = plt.subplots(figsize=(15, 15))
sns.boxplot(data=pd.DataFrame(X_train_norm, columns=X_train.columns), ax=ax)
ax.set(title="Standardized training features", xlabel="feature", ylabel="z-score")
plt.savefig("boxplotTrainData.jpg")

In [None]:
# Scale the test features with the statistics already learned from the
# training set (transform only — the scaler is not re-fitted here).
X_test_norm = scaler.transform(X_test)

In [None]:
# Scaled test features as returned by scaler.transform.
X_test_norm

In [None]:
# Box plots of the standardized test features.
# The scaled data comes back from the scaler as a plain array without column
# labels, so it is re-labelled with the original feature names; otherwise the
# boxes on the x-axis are only numbered 0..7.
fig, ax = plt.subplots(figsize=(15, 15))
sns.boxplot(data=pd.DataFrame(X_test_norm, columns=X_test.columns), ax=ax)
ax.set(title="Standardized test features", xlabel="feature", ylabel="z-score")
plt.savefig("boxplotTestData.jpg")

In [None]:
### train set -> fit_transform(X_train)
### test set  -> transform(X_test)
### why???

# On the training set, fit() learns the mean and standard deviation of every column,
# and transform() then uses that mean and standard deviation to standardize the
# data by computing the z value.

# Formula used is:
# z = (x - mu) / sigma, where mu = training-set mean and sigma = training-set std. deviation of the column

# For the test set we already have mu and sigma from the training set, so we perform only the transform.
# Re-fitting on the test set would leak test-set statistics into the preprocessing.


### Model Training

In [None]:
from sklearn.linear_model import LinearRegression
regression = LinearRegression()
# Fit on the scaled training features; the test set is not seen here.
regression.fit(X_train_norm,y_train)

In [None]:
# Fitted model: y = m1*x1 + m2*x2 + ... + m8*x8 + c,
# where m1..m8 are the coefficients and c is the intercept.
# Each coefficient is labelled with its feature so the values can be read
# without counting array positions.
pd.Series(regression.coef_, index=X_train.columns, name="coefficient")

In [None]:
# Intercept term c of the fitted linear model.
print(regression.intercept_)

### Model prediction

In [None]:
# Predict on the scaled test features (the same scaling the model was trained with).
reg_pred = regression.predict(X_test_norm)  #predicted value
reg_pred

In [None]:
# Residual = actual value minus predicted value, one entry per test row.
residuals = y_test - reg_pred

In [None]:
# Per-row prediction errors on the test set.
residuals

In [None]:
# Kernel-density plot of the residuals.
# For a well-behaved linear fit the curve should look like a normal distribution.
sns.displot(data=residuals, kind="kde")

### Model performance

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# MSE and MAE are error measures: lower values are better.
# r2_score (and adjusted r2_score) are goodness-of-fit measures: higher values are better.

# Printed in this order: MSE, MAE, R^2.
for metric in (mean_squared_error, mean_absolute_error, r2_score):
    print(metric(y_test, reg_pred))

#### adjusted r2 

In [None]:
# Adjusted R^2 penalises R^2 for the number of predictors p, given n samples:
#     adjusted_R2 = 1 - (1 - R2) * (n - 1) / (n - p - 1)
# Note the (n - 1) factor: it multiplies (1 - R2) as a whole.
# r2_score is imported in the metrics cell above.
score = r2_score(y_test, reg_pred)
n_samples, n_features = X_test_norm.shape
adjusted_r2 = 1 - (1 - score) * (n_samples - 1) / (n_samples - n_features - 1)
print(adjusted_r2)

## save the model

In [None]:
# pickle file
import pickle
pickle.dump(regression, open('Housemodel.pkl','wb'))

## use the saved model to predict the house price

In [None]:
# Load the regressor back from disk; `with` guarantees the file handle is closed.
# Only unpickle files you created yourself — pickle.load can execute arbitrary code.
with open("Housemodel.pkl", 'rb') as model_file:
    model = pickle.load(model_file)

In [None]:
# First record of the raw feature matrix, used below as a sample input.
housing.data[0]

In [None]:
# Standardize the first record with the scaler fitted on the training data.
# The scaler was fitted on a DataFrame, so the single row is passed as a one-row
# DataFrame with the same column names; recent scikit-learn versions warn about
# missing feature names when given a bare NumPy row instead.
sample = pd.DataFrame(housing.data[[0]], columns=housing.feature_names)
scaler.transform(sample)

In [None]:
# Predict the price of the first record with the reloaded model.
# The row is scaled first because the model was trained on standardized features.
# It is passed to the scaler as a one-row DataFrame with the original column
# names, matching how the scaler was fitted (avoids scikit-learn's
# feature-name warning for bare NumPy input).
sample = pd.DataFrame(housing.data[[0]], columns=housing.feature_names)
model.predict(scaler.transform(sample))