## California housing prediction

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns  # for nicer graphics

from sklearn.datasets import fetch_california_housing

In [2]:
housing = fetch_california_housing()

In [4]:
print(housing.__doc__)

Container object for datasets

    Dictionary-like object that exposes its keys as attributes.

    >>> b = Bunch(a=1, b=2)
    >>> b['b']
    2
    >>> b.b
    2
    >>> b.a = 3
    >>> b['a']
    3
    >>> b.c = 6
    >>> b['c']
    6

    


In [5]:
print(housing.DESCR)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block
        - HouseAge      median house age in block
        - AveRooms      average number of rooms
        - AveBedrms     average number of bedrooms
        - Population    block population
        - AveOccup      average house occupancy
        - Latitude      house block latitude
        - Longitude     house block longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
http://lib.stat.cmu.edu/datasets/

The target variable is the median house value for California districts.

This dataset was derived from the 1990 U.S. census, using one row per census
block group. A block group is the smallest geographical unit for which the U.S.
Census Bur

In [6]:
housing.keys()

dict_keys(['data', 'target', 'feature_names', 'DESCR'])

In [16]:
housing['feature_names']

data = housing['data']
target = housing['target']

**i cannot use standard scaling if the data has outliers since it will affect the sacaling because it will have effect on mean**

In [19]:
tabledata = np.c_[data,target]
data_names =   housing['feature_names']
target_names = np.array(['Target'])

tableColumns = np.append(data_names,target_names)

Table = pd.DataFrame(data =tabledata,columns =tableColumns)

In [20]:
Table.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Target
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [46]:
Table.columns

Index(['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup',
       'Latitude', 'Longitude', 'Target'],
      dtype='object')

In [47]:
#from description missing values are None

for column in Table.columns:
    print((Table.loc[Table[column]==None]).sum())
    print("{0} column has null count of {1}. ".format(str(column),)

In [59]:
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [60]:
X = Table.iloc[:,0:7]
y = Table.iloc[:,-1]

In [61]:
  X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=29)

In [62]:
from sklearn.linear_model import LinearRegression

In [84]:
linearmodel = LinearRegression()
poly_features = PolynomialFeatures(degree=2, include_bias=False)
scaler = StandardScaler()

#  ("poly_features", poly_features),
# ("scaler", scaler),
regression = Pipeline(
        [
            ("poly_features", poly_features),
            ("scaler", scaler),
            ("model", linearmodel),
        ]
    )

   

In [85]:
regression.fit(X_train,y_train)

Pipeline(memory=None,
     steps=[('poly_features', PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)), ('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('model', LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False))])

In [86]:
regression.score(X_val,y_val)

0.5273430429984214