In [None]:
import os
import tarfile
from six.moves import urllib
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from sklearn import preprocessing
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import math
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [2]:
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    if not os.path.isdir(housing_path):
        os.makedirs(housing_path)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path=housing_path)
    housing_tgz.close()

In [3]:
fetch_housing_data()

In [4]:
df = pd.read_csv("datasets/housing/housing.csv")
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


### Label Encoder 'ocean_proximity'

In [5]:
label = preprocessing.LabelEncoder()
label.fit(['NEAR BAY', '<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'ISLAND'])
df['ocean_proximity'] = label.transform(df['ocean_proximity'])

### Fill NA 'total bedrooms'  

In [6]:
median = df["total_bedrooms"].median()
df["total_bedrooms"].fillna(median, inplace = True)

### Features combination 
<blockquote>total_rooms / households => rooms_per_household<br>
total_bedrooms / total_rooms => bedrooms_per_rooms<br>
population / households => population_per_household</blockquote>

In [7]:
df["rooms_per_household"] = df["total_rooms"] / df["households"]
df["bedrooms_per_rooms"] = df["total_bedrooms"] / df["total_rooms"]
df["population_per_household"] = df["population"] / df["households"]


### Get each feature

In [8]:
house = df["households"]
houssing_median_age = df["housing_median_age"]
latitude = df["latitude"]
longitude = df["longitude"]
median_house_value = df["median_house_value"]
median_income = df["median_income"]
# ?????????????????????? mấy cái này xài per rồi bỏ đi
# population = df["population"]
# total_bedrooms = df["total_bedrooms"]
# total_rooms = df["total_rooms"]
ocean_proximity = df["ocean_proximity"]

rooms_per_household = df['rooms_per_household']
population_per_household = df['population_per_household']
bedrooms_per_rooms = df['bedrooms_per_rooms']

### Log transform :
<blockquote>rooms_per_household<br>
    population_per_household<br>
    bedrooms_per_rooms<br>
    total_rooms<br>
    total_bedrooms<\blockquote>

In [9]:
df["rooms_per_household_log"] = (rooms_per_household - rooms_per_household.min() + 1).transform(np.log)
df["population_per_household_log"] = (population_per_household  - population_per_household.min() + 1).transform(np.log)
df["bedrooms_per_rooms_log"] = (bedrooms_per_rooms   - bedrooms_per_rooms.min() + 1).transform(np.log)

df = df.drop(['rooms_per_household','population_per_household','bedrooms_per_rooms'], axis =1)

# df["total_rooms_log"] = (total_rooms - total_rooms.min() + 1).transform(np.log)
# df["total_bedrooms_log"] = (total_bedrooms - total_bedrooms.min() + 1).transform(np.log)

df = df.drop(['total_rooms', 'total_bedrooms','households', 'population'] , axis = 1)

In [10]:
df

Unnamed: 0,longitude,latitude,housing_median_age,median_income,median_house_value,ocean_proximity,rooms_per_household_log,population_per_household_log,bedrooms_per_rooms_log
0,-122.23,37.88,41.0,8.3252,452600.0,3,1.965429,1.051957,0.103855
1,-122.22,37.86,21.0,8.3014,358500.0,3,1.855045,0.882748,0.112119
2,-122.24,37.85,52.0,7.2574,352100.0,3,2.133217,1.134607,0.088345
3,-122.25,37.85,52.0,5.6431,341300.0,3,1.786948,1.049295,0.137418
4,-122.25,37.85,52.0,3.8462,342200.0,3,1.861861,0.911945,0.126584
...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1.5603,78100.0,1,1.648524,1.053719,0.171828
20636,-121.21,39.49,18.0,2.5568,77100.0,1,1.835438,1.232706,0.163867
20637,-121.22,39.43,17.0,1.7000,92300.0,1,1.678850,0.968248,0.163837
20638,-121.32,39.43,18.0,1.8672,84700.0,1,1.701718,0.888262,0.167835


###  Drop outlier in 'housing_median_age' 

In [11]:
factor = 1
upper_lim = df['housing_median_age'].mean () + df['housing_median_age'].std () * factor
lower_lim = df['housing_median_age'].mean () - df['housing_median_age'].std () * factor

df = df[(df['housing_median_age'] < upper_lim) & (df['housing_median_age'] > lower_lim)]


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13000 entries, 0 to 20638
Data columns (total 9 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   longitude                     13000 non-null  float64
 1   latitude                      13000 non-null  float64
 2   housing_median_age            13000 non-null  float64
 3   median_income                 13000 non-null  float64
 4   median_house_value            13000 non-null  float64
 5   ocean_proximity               13000 non-null  int32  
 6   rooms_per_household_log       13000 non-null  float64
 7   population_per_household_log  13000 non-null  float64
 8   bedrooms_per_rooms_log        13000 non-null  float64
dtypes: float64(8), int32(1)
memory usage: 964.8 KB


# Cap outlier with percentile

In [None]:
upper_lim = df['housing_median_age'].quantile(.95)
lower_lim = df['housing_median_age'].quantile(.5)
df.loc[(df['housing_median_age'] > upper_lim),'housing_median_age'] = upper_lim
df.loc[(df['housing_median_age'] < lower_lim),'housing_median_age'] = lower_lim

In [None]:
df

# Train, Test, Valid
## Chia theo tỉ lệ 6 2 2

In [15]:
train_va, test = train_test_split(df, test_size = 0.2, random_state = 0)
train, valid = train_test_split(train_va,test_size = 0.25,random_state = 0)

In [16]:
x_train = np.asanyarray(train_va.drop(['median_house_value'], axis = 1))
y_train = np.asanyarray(train_va["median_house_value"])

x_test =  np.asanyarray(test.drop(['median_house_value'],axis=1))
y_test = np.asanyarray(test['median_house_value'])

x_valid =  np.asanyarray(valid.drop(['median_house_value'],axis=1))
y_valid = np.asanyarray(valid['median_house_value'])

In [None]:
print(x_train.shape)
print(x_test.shape)
print(df.shape)

# MinMax Scaler

In [None]:
x_train.columns

In [None]:
scaler = df[['longitude', 'latitude','housing_median_age','median_income','ocean_proximity']]

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
x_train = StandardScaler().fit(x_train).transform(x_train.astype(float))

In [None]:
x_test = StandardScaler().fit(x_test).transform(x_test.astype(float))

In [None]:
df[['longitude', 'latitude','housing_median_age','median_income','ocean_proximity']] = scaler

In [None]:
df.columns

In [None]:
data = df[['longitude', 'latitude', 'housing_median_age', 'median_income',
       'median_house_value', 'ocean_proximity', 'rooms_per_household_log',
       'population_per_household_log', 'bedrooms_per_rooms_log']]

In [None]:
data

In [None]:
x_train

# Modeling
### Fit LR model

In [18]:
regr = LinearRegression()
regr.fit(x_train, y_train)

LinearRegression()

### Coefficients && Intercept 

In [19]:
print ('Coeficients: ',regr.coef_)
print( 'Intercept: ', regr.intercept_)

Coeficients:  [ -43279.3122169   -44546.66370663     571.79239994   42407.60024488
   -2199.05548913   32566.08359755 -124186.68083613  452748.86607494]
Intercept:  -3543166.479056815


### Predict 

In [20]:
yhat_valid = regr.predict(x_valid)

In [21]:
yhat_test = regr.predict(x_test)

### Mean squared error 

In [22]:
evaluation_valid = mean_squared_error(y_valid, yhat_valid)
evaluation_test = mean_squared_error(y_test,yhat_test)
evaluation_sq_valid = np.sqrt(evaluation_valid)
evaluation_sq_test = np.sqrt(evaluation_test)
print("MSE trên tập valid:", evaluation_sq_valid)
print("MSE trên tập test:", evaluation_sq_test)

MSE trên tập valid: 65901.92726315008
MSE trên tập test: 62943.72188458306


### Raw notes

#### 63153.33311985455
<pre>
     0   longitude                     11702 non-null  float64
     1   latitude                      11702 non-null  float64
     2   housing_median_age            11702 non-null  float64
     3   population                    11702 non-null  float64
     4   households                    11702 non-null  float64
     5   median_income                 11702 non-null  float64
     6   median_house_value            11702 non-null  float64
     7   ocean_proximity               11702 non-null  int32  
     8   rooms_per_household_log       11702 non-null  float64
     9   population_per_household_log  11702 non-null  float64
     10  bedrooms_per_rooms_log        11702 non-null  float64
     11  total_rooms_log               11702 non-null  float64
     12  total_bedrooms_log            11702 non-null  float64
     xu ly outlier housing_median_age - drop
</pre>