In [None]:
# Training and running a linear model

In [11]:
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from zlib import crc32



# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# to make this notebook's output stable across runs
np.random.seed(42)


In [12]:
# Load data
# The CSV location can be overridden with the HOUSING_CSV environment variable
# so the notebook is not tied to a single machine; when the variable is unset
# the original absolute path is used unchanged.
housing_csv_path = os.environ.get(
    'HOUSING_CSV',
    '/Users/monkiky/Desktop/ML/Dataset/housing/housing.csv')
housing = pd.read_csv(housing_csv_path)

In [13]:
# Display the DataFrame; the notebook truncates the rendering to the first
# and last rows.
housing

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


In [21]:
# Print, for every column, whether it contains at least one missing value.
# (A single whole-frame answer would be housing.isnull().values.any().)
for column_name in housing.columns:
    print("Has ", column_name, "Na values    ",
          housing[column_name].isnull().values.any())

Has  longitude Na values     False
Has  latitude Na values     False
Has  housing_median_age Na values     False
Has  total_rooms Na values     False
Has  total_bedrooms Na values     True
Has  population Na values     False
Has  households Na values     False
Has  median_income Na values     False
Has  median_house_value Na values     False
Has  ocean_proximity Na values     False


In [16]:
# Row labels of the records whose total_bedrooms value is missing
housing[housing["total_bedrooms"].isna()].index

Int64Index([  290,   341,   538,   563,   696,   738,  1097,  1350,  1456,
             1493,
            ...
            19932, 19959, 20046, 20069, 20125, 20267, 20268, 20372, 20460,
            20484],
           dtype='int64', length=207)

In [None]:
%matplotlib inline
# Histograms of the DataFrame's columns, 50 bins each, on a 20x15 figure
housing.hist(bins=50, figsize=(20,15))
plt.show()

## Split data

In [None]:
# By using sklearn
# Passing random_state pins the shuffle, so this cell produces the same split
# every time it is run instead of depending on the current state of the
# global NumPy random generator.
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

In [None]:
# Report how many rows ended up in each set of the random split
print(len(train_set), "train +", len(test_set), "test")

## Split data with identification

In [None]:
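# A random split works, but a more stable approach is to give each row an
# identifier and decide from a hash of that identifier which set it belongs to:
# a row then stays in the same set for as long as its identifier is unchanged.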

def test_set_check(identifier, test_ratio):
    return crc32(np.int64(identifier)) & 0xffffffff < test_ratio * 2**32

def split_train_test_by_id(data, test_ratio, id_column):
    """Split ``data`` into train and test rows using a hash of an ID column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to split.
    test_ratio : float
        Ratio forwarded to ``test_set_check`` for every identifier.
    id_column : str
        Name of the column holding the row identifiers.

    Returns
    -------
    tuple
        ``(train_rows, test_rows)``: the rows whose identifier is not
        selected for the test set, followed by the rows whose identifier is.
    """
    def belongs_to_test(row_id):
        return test_set_check(row_id, test_ratio)

    test_mask = data[id_column].apply(belongs_to_test)
    train_rows = data.loc[~test_mask]
    test_rows = data.loc[test_mask]
    return train_rows, test_rows

In [None]:
# reset_index() adds an `index` column holding the former row labels; that
# column is used below as the row identifier, with a 0.2 test ratio.
housing_with_id = housing.reset_index()
train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, "index")

In [None]:
# Report how many rows ended up in each set of the ID-based split
print(len(train_set), "train +", len(test_set), "test")

In [None]:
# Inspect the training set produced by the ID-based split
train_set

## Split data when your data is not homogeneous
Stratified sampling

In [None]:
# Stratified sampling by median income.
# First we need to establish a categorical attribute: median_income is
# bucketed into five income categories labelled 1 to 5.
housing["income_cat"] = pd.cut(
    housing["median_income"],
    labels=[1, 2, 3, 4, 5],
    bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
)
housing["income_cat"].value_counts()

In [None]:
# Histogram of the income categories
housing["income_cat"].hist()

In [None]:
# Now you can do stratified sampling based on the income category.
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# split() yields arrays of row *positions*, so rows are selected with .iloc;
# label-based .loc only picks the same rows while the index happens to be the
# default 0..n-1 range.
# With n_splits=1 there is exactly one (train, test) pair to take.
train_index, test_index = next(split.split(housing, housing["income_cat"]))
strat_train_set = housing.iloc[train_index]
strat_test_set = housing.iloc[test_index]

In [None]:
# Inspect the stratified training set
strat_train_set

In [None]:
# Check the stratification: share of each income category in the test set
strat_test_set["income_cat"].value_counts() / len(strat_test_set)

In [None]:
# Share of each income category in the full dataset, for comparison
housing["income_cat"].value_counts() / len(housing)

In [None]:
# Share of each income category in the stratified training set
strat_train_set["income_cat"].value_counts() / len(strat_train_set)

## Looking for correlation


In [None]:
# Correlate the numeric columns only: text columns such as ocean_proximity
# cannot be correlated, and pandas >= 2.0 raises an error on them instead of
# silently skipping them. select_dtypes works on every pandas version.
corr_matrix = housing.select_dtypes(include=[np.number]).corr()
# Correlation of every numeric column with the house value, strongest first
corr_matrix["median_house_value"].sort_values(ascending=False)

In [None]:
from pandas.plotting import scatter_matrix

# Pairwise scatter plots of four selected columns on a 12x8 figure
attributes = [
    "median_house_value",
    "median_income",
    "total_rooms",
    "housing_median_age",
]
scatter_matrix(housing.loc[:, attributes], figsize=(12, 8))


In [None]:
# Median income against median house value, drawn with a low alpha, then
# the axes are fixed to x in [0, 16] and y in [0, 550000]
housing.plot.scatter(x="median_income", y="median_house_value", alpha=0.05)
plt.axis([0, 16, 0, 550000])

In [None]:
# fillna() returns a new Series and leaves the frame untouched, so the result
# is assigned back; otherwise the missing total_bedrooms values would never be
# replaced by the column mean.
# NOTE(review): the train/test sets created earlier are presumably separate
# copies and are not updated by this assignment. Confirm whether they need
# the same imputation.
housing["total_bedrooms"] = housing["total_bedrooms"].fillna(
    housing["total_bedrooms"].mean())
# Show the filled column, as the original cell did
housing["total_bedrooms"]