In [67]:
import os
import tarfile
from six.moves import urllib
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from sklearn import preprocessing
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import math
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [68]:
# Location of the housing archive and the local directory it is unpacked into.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"


def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and unpack it into ``housing_path``.

    Parameters
    ----------
    housing_url : str
        URL of the ``.tgz`` archive to retrieve.
    housing_path : str
        Directory that receives both the downloaded ``housing.tgz`` and the
        extracted members. It is created (including parents) when missing.
    """
    # exist_ok avoids the separate isdir() check and the race between the
    # check and the creation.
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even if extraction raises.
    # NOTE(review): extractall() trusts member paths inside the archive; only
    # point housing_url at sources you trust.
    with tarfile.open(tgz_path) as housing_tgz:
        housing_tgz.extractall(path=housing_path)

In [69]:
# Download only when the extracted CSV is missing, so re-running the notebook
# does not fetch and unpack the archive again.
if not os.path.isfile(os.path.join(HOUSING_PATH, "housing.csv")):
    fetch_housing_data()

In [70]:
# Read the CSV from the directory configured in HOUSING_PATH instead of
# repeating the path as a separate string literal.
df = pd.read_csv(os.path.join(HOUSING_PATH, "housing.csv"))
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


### Label Encoder 'ocean_proximity'

In [71]:
# Fit the encoder on the explicit list of category names, then replace the
# text column with the encoder's integer codes.
# NOTE(review): integer codes give a nominal category an implicit ordering in
# a linear model; consider one-hot encoding instead.
ocean_levels = ['NEAR BAY', '<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'ISLAND']
label = preprocessing.LabelEncoder().fit(ocean_levels)
encoded_proximity = label.transform(df['ocean_proximity'])
df['ocean_proximity'] = encoded_proximity

### Fill NA in 'total_bedrooms'

In [72]:
# Impute missing bedroom counts with the column median.
# Assign the filled column back to df rather than calling
# fillna(inplace=True) on the selected column: that chained in-place form
# acts on an intermediate object and may leave df unchanged on newer pandas.
median = df["total_bedrooms"].median()
df["total_bedrooms"] = df["total_bedrooms"].fillna(median)

### Feature combinations
<blockquote>total_rooms / households => rooms_per_household<br>
total_bedrooms / total_rooms => bedrooms_per_rooms<br>
population / households => population_per_household</blockquote>

In [73]:
# Append the three per-household / per-room ratio columns in one step.
df = df.assign(
    rooms_per_household=lambda frame: frame["total_rooms"] / frame["households"],
    bedrooms_per_rooms=lambda frame: frame["total_bedrooms"] / frame["total_rooms"],
    population_per_household=lambda frame: frame["population"] / frame["households"],
)


### Get each feature

In [74]:
# Convenience handles on individual columns of df.
house = df["households"]
housing_median_age = df["housing_median_age"]
# Misspelled name kept as an alias so any cell that still uses it keeps working.
houssing_median_age = housing_median_age
latitude = df["latitude"]
longitude = df["longitude"]
median_house_value = df["median_house_value"]
median_income = df["median_income"]
population = df["population"]
total_bedrooms = df["total_bedrooms"]
total_rooms = df["total_rooms"]
ocean_proximity = df["ocean_proximity"]

# Derived ratio columns created in the feature-combination step.
rooms_per_household = df['rooms_per_household']
population_per_household = df['population_per_household']
bedrooms_per_rooms = df['bedrooms_per_rooms']

### Log transform :
<blockquote>rooms_per_household<br>
    population_per_household<br>
    bedrooms_per_rooms<br>
    total_rooms<br>
    total_bedrooms</blockquote>

In [75]:
def _log_from_min(series):
    """Shift ``series`` so its minimum maps to 1, then take the natural log."""
    return np.log(series - series.min() + 1)


# Log-scaled ratio features; the unscaled ratio columns are dropped afterwards.
df["rooms_per_household_log"] = _log_from_min(rooms_per_household)
df["population_per_household_log"] = _log_from_min(population_per_household)
df["bedrooms_per_rooms_log"] = _log_from_min(bedrooms_per_rooms)

df = df.drop(
    columns=['rooms_per_household', 'population_per_household', 'bedrooms_per_rooms']
)

# Log-scaled raw counts; the raw count columns are dropped afterwards.
df["total_rooms_log"] = _log_from_min(total_rooms)
df["total_bedrooms_log"] = _log_from_min(total_bedrooms)

df = df.drop(columns=['total_rooms', 'total_bedrooms', 'households', 'population'])

###  Drop outlier in 'housing_median_age' 

In [76]:
# Keep only rows whose housing_median_age lies strictly inside
# mean +/- factor * std.
# NOTE(review): with factor = 1 the band is a single standard deviation wide
# on each side, and the filter runs before the train/test split, so the test
# rows are filtered as well. Confirm both are intended.
factor = 1
age = df['housing_median_age']
age_mean = age.mean()
age_std = age.std()
upper_lim = age_mean + age_std * factor
lower_lim = age_mean - age_std * factor

inside_band = age.lt(upper_lim) & age.gt(lower_lim)
df = df.loc[inside_band]


### Train, Valid, Test split

In [77]:
# Summarize the remaining rows, columns, and dtypes before splitting.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13000 entries, 0 to 20638
Data columns (total 11 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   longitude                     13000 non-null  float64
 1   latitude                      13000 non-null  float64
 2   housing_median_age            13000 non-null  float64
 3   median_income                 13000 non-null  float64
 4   median_house_value            13000 non-null  float64
 5   ocean_proximity               13000 non-null  int32  
 6   rooms_per_household_log       13000 non-null  float64
 7   population_per_household_log  13000 non-null  float64
 8   bedrooms_per_rooms_log        13000 non-null  float64
 9   total_rooms_log               13000 non-null  float64
 10  total_bedrooms_log            13000 non-null  float64
dtypes: float64(10), int32(1)
memory usage: 1.1 MB


In [78]:
# Hold out 20% of the rows for testing, then take 25% of the remainder
# (20% of the total) for validation. Both splits use the same fixed seed.
split_seed = 0
train_va, test = train_test_split(df, test_size=0.2, random_state=split_seed)
train, valid = train_test_split(train_va, test_size=0.25, random_state=split_seed)

In [79]:
def _features_and_target(frame, target="median_house_value"):
    """Return ``(X, y)`` arrays: all columns except ``target``, and ``target``."""
    features = np.asanyarray(frame.drop([target], axis=1))
    labels = np.asanyarray(frame[target])
    return features, labels


# NOTE(review): the training arrays are built from train_va (train + valid
# rows), not from `train`, so x_valid is a subset of x_train. Confirm this is
# intended before using x_valid for model selection.
x_train, y_train = _features_and_target(train_va)
x_test, y_test = _features_and_target(test)
x_valid, y_valid = _features_and_target(valid)

### Fit LR model

In [80]:
# Ordinary least-squares fit on the training arrays; the trailing fit() call
# is the cell's last expression, so the fitted estimator is displayed.
regr = LinearRegression()
regr.fit(X=x_train, y=y_train)

LinearRegression()

### Coefficients and intercept

In [81]:
# Show the fitted weights (one per feature column, in column order) and the bias term.
print('Coefficients: ', regr.coef_)
print('Intercept: ', regr.intercept_)

Coeficients:  [ -42569.25388531  -43600.99499269     678.37340855   43707.03159289
   -2093.40606215   35425.25293151 -119824.86737315  171235.94703467
  -62870.37631391   68531.02873544]
Intercept:  -3399667.813024145


### Predict 

In [82]:
# Predictions of the fitted model on the held-out test features.
yhat = regr.predict(x_test)

### Root mean squared error (RMSE)

In [83]:
# `evaluation` is the mean squared error on the test set.
evaluation = mean_squared_error(y_test,yhat)
# Despite the `_sq` suffix this is the square ROOT of the MSE, i.e. the RMSE,
# expressed in the same units as median_house_value.
evaluation_sq = np.sqrt(evaluation)
evaluation_sq

62874.735484446115

### Raw notes

### df_3
#### 67923.49829006246
<pre>
    Column                        Non-Null Count  Dtype
    ---  ------                        --------------  -----  
    0   longitude                     20640 non-null  float64
    1   latitude                      20640 non-null  float64
    2   housing_median_age            20640 non-null  float64
    3   median_income                 20640 non-null  float64
    4   median_house_value            20640 non-null  float64
    5   ocean_proximity               20640 non-null  int32  
    6   rooms_per_household_log       20640 non-null  float64
    7   population_per_household_log  20640 non-null  float64
    8   bedrooms_per_rooms_log        20640 non-null  float64
</pre>

### df_4
#### 67628.67870921799
<pre>
    Column                        Non-Null Count  Dtype  
    ---  ------                        --------------  -----  
     0   longitude                     20640 non-null  float64
     1   latitude                      20640 non-null  float64
     2   housing_median_age            20640 non-null  float64
     3   households                    20640 non-null  float64
     4   median_income                 20640 non-null  float64
     5   median_house_value            20640 non-null  float64
     6   ocean_proximity               20640 non-null  int32  
     7   rooms_per_household_log       20640 non-null  float64
     8   population_per_household_log  20640 non-null  float64
     9   bedrooms_per_rooms_log        20640 non-null  float64
</pre>
### df_5 
#### 67597.65201828879
<pre>
       Column                        Non-Null Count  Dtype  
    ---  ------                        --------------  -----  
     0   longitude                     20640 non-null  float64
     1   latitude                      20640 non-null  float64
     2   housing_median_age            20640 non-null  float64
     3   total_rooms                   20640 non-null  float64
     4   total_bedrooms                20640 non-null  float64
     5   households                    20640 non-null  float64
     6   median_income                 20640 non-null  float64
     7   median_house_value            20640 non-null  float64
     8   ocean_proximity               20640 non-null  int32  
     9   rooms_per_household_log       20640 non-null  float64
     10  population_per_household_log  20640 non-null  float64
     11  bedrooms_per_rooms_log        20640 non-null  float64
</pre>
### df_6
#### 67547.34811205833
<pre>
        Column                        Non-Null Count  Dtype  
    ---  ------                        --------------  -----  
     0   longitude                     20640 non-null  float64
     1   latitude                      20640 non-null  float64
     2   housing_median_age            20640 non-null  float64
     3   households                    20640 non-null  float64
     4   median_income                 20640 non-null  float64
     5   median_house_value            20640 non-null  float64
     6   ocean_proximity               20640 non-null  int32  
     7   rooms_per_household_log       20640 non-null  float64
     8   population_per_household_log  20640 non-null  float64
     9   bedrooms_per_rooms_log        20640 non-null  float64
     10  total_rooms_log               20640 non-null  float64
     11  total_bedrooms_log            20640 non-null  float64
 </pre>
#### 63153.33311985455
<pre>
     0   longitude                     11702 non-null  float64
     1   latitude                      11702 non-null  float64
     2   housing_median_age            11702 non-null  float64
     3   population                    11702 non-null  float64
     4   households                    11702 non-null  float64
     5   median_income                 11702 non-null  float64
     6   median_house_value            11702 non-null  float64
     7   ocean_proximity               11702 non-null  int32  
     8   rooms_per_household_log       11702 non-null  float64
     9   population_per_household_log  11702 non-null  float64
     10  bedrooms_per_rooms_log        11702 non-null  float64
     11  total_rooms_log               11702 non-null  float64
     12  total_bedrooms_log            11702 non-null  float64
     outlier handling for housing_median_age: rows dropped
</pre>