In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
%matplotlib inline

In [2]:
import csv

In [3]:
df = pd.read_csv('HW2housing.csv', delimiter=',')
df

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


### Data preparation

In [4]:
df.isnull().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [5]:
df = df.fillna(0)

In [6]:
df['rooms_per_household'] = df['total_rooms']/df['households']

In [7]:
df['bedrooms_per_room'] = df['total_bedrooms']/df['total_rooms']

In [8]:
df['population_per_household'] = df['population']/df['households']

In [9]:
df

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,rooms_per_household,bedrooms_per_room,population_per_household
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,6.984127,0.146591,2.555556
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,6.238137,0.155797,2.109842
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,8.288136,0.129516,2.802260
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY,5.817352,0.184458,2.547945
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY,6.281853,0.172096,2.181467
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND,5.045455,0.224625,2.560606
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND,6.114035,0.215208,3.122807
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND,5.205543,0.215173,2.325635
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND,5.329513,0.219892,2.123209


### Q1 What is the most frequent observation (mode) for the column ocean_proximity?

In [10]:
df.ocean_proximity.value_counts().head(1)

<1H OCEAN    9136
Name: ocean_proximity, dtype: int64

### Split the data

In [41]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

In [42]:
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

In [43]:
len(df_train), len(df_val), len(df_test)

(12384, 4128, 4128)

In [44]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [45]:
y_train = df_train.median_house_value.values
y_val = df_val.median_house_value.values
y_test = df_test.median_house_value.values

In [46]:
del df_train['median_house_value']
del df_val['median_house_value']
del df_test['median_house_value']

### Q2 What are the two features that have the biggest correlation in this dataset?

In [47]:
numerical = ['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income', 'rooms_per_household', 'bedrooms_per_room', 'population_per_household']

In [48]:
df_corr = df_train.corr()
df_corr

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms_per_household,bedrooms_per_room,population_per_household
longitude,1.0,-0.925005,-0.099812,0.036449,0.06384,0.09167,0.049762,-0.016426,-0.034814,0.10232,0.011022
latitude,-0.925005,1.0,0.002477,-0.025914,-0.05973,-0.100272,-0.063529,-0.076805,0.119118,-0.124507,-0.002301
housing_median_age,-0.099812,0.002477,1.0,-0.363522,-0.324156,-0.292476,-0.306119,-0.119591,-0.181275,0.129456,0.012167
total_rooms,0.036449,-0.025914,-0.363522,1.0,0.931546,0.853219,0.921441,0.198951,0.168926,-0.194185,-0.029452
total_bedrooms,0.06384,-0.05973,-0.324156,0.931546,1.0,0.87734,0.979399,-0.009833,0.010381,0.078094,-0.034301
population,0.09167,-0.100272,-0.292476,0.853219,0.87734,1.0,0.906841,-0.000849,-0.07621,0.031592,0.064998
households,0.049762,-0.063529,-0.306119,0.921441,0.979399,0.906841,1.0,0.011925,-0.085832,0.058004,-0.032522
median_income,-0.016426,-0.076805,-0.119591,0.198951,-0.009833,-0.000849,0.011925,1.0,0.394154,-0.616617,-0.000454
rooms_per_household,-0.034814,0.119118,-0.181275,0.168926,0.010381,-0.07621,-0.085832,0.394154,1.0,-0.500589,0.001801
bedrooms_per_room,0.10232,-0.124507,0.129456,-0.194185,0.078094,0.031592,0.058004,-0.616617,-0.500589,1.0,-0.002851


In [49]:
'total_bedrooms and households 0.979399'


'total_bedrooms and households 0.979399'

### Make median_house_value binary

In [50]:
df.median_house_value.mean()

206855.81690891474

In [51]:
y_train_b = y_train.copy()
for n in range(len(y_train_b)):
    if y_train_b[n] >= df.median_house_value.mean():
        y_train_b[n] = 1
    else:
        y_train_b[n] = 0
y_train_b, y_train

(array([1., 1., 0., ..., 1., 0., 0.]),
 array([241400., 500001.,  64100., ..., 215300., 139000., 181300.]))

In [52]:
y_val_b = y_val.copy()
for n in range(len(y_val_b)):
    if y_val_b[n] >= df.median_house_value.mean():
        y_val_b[n] = 1
    else:
        y_val_b[n] = 0
y_val_b, y_val

(array([0., 0., 1., ..., 1., 1., 0.]),
 array([ 96700.,  75500., 430900., ..., 344200., 387800., 184200.]))

### Q3 What is the value of mutual information?

In [23]:
from sklearn.feature_extraction import DictVectorizer

In [24]:
from sklearn.metrics import mutual_info_score

In [26]:
round(mutual_info_score(y_train_b, df_train.ocean_proximity), 2)

0.1

### Q4 Train a logistic regression. Calculate the accuracy on the validation dataset

In [27]:
from sklearn.linear_model import LogisticRegression

In [80]:
dv_train = DictVectorizer(sparse=False)

In [81]:
dict_train = df_train.to_dict(orient='records')

In [82]:
dv_train.fit(dict_train)

In [83]:
X_train = dv_train.transform(dict_train)

In [84]:
dict_val = df_val.to_dict(orient='records')

In [86]:
X_val = dv_train.transform(dict_val)

In [87]:
model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train_b)

In [35]:
model.coef_[0].round(3)

array([ 1.018,  0.004,  0.037,  0.151,  0.098,  1.231,  0.43 , -1.846,
        0.106,  0.065,  0.915, -0.002,  0.01 , -0.007,  0.002, -0.   ])

In [36]:
model.intercept_[0]

-0.3313631943017421

In [37]:
model.predict(X_train)

array([0., 1., 0., ..., 0., 1., 1.])

In [38]:
y_pred_val = model.predict_proba(X_val)[:,1]
price_decigion = y_pred_val >= 0.5
price_decigion.astype(int)

array([0, 0, 1, ..., 1, 1, 0])

In [39]:
round((y_val_b == price_decigion.astype(int)).mean(),2)

0.84

### Q5 Which of following feature has the smallest difference?

In [77]:
small_train = df_train[['total_rooms', 'total_bedrooms', 'population', 'households']]
small_val = df_val[['total_rooms', 'total_bedrooms', 'population', 'households']]

In [78]:
dv_train_small = DictVectorizer(sparse=False)
dict_train_small = small_train.to_dict(orient='records')
dv_train_small.fit(dict_train_small)

In [88]:
X_small_train = dv_train_small.transform(dict_train_small)
dict_val_small = small_val.to_dict(orient='records')
dv_train_small.fit(dict_train_small)
X_small_val = dv_train_small.transform(dict_val)

In [89]:
model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
model.fit(X_small_train, y_train_b)

In [92]:
y_pred4 = model.predict_proba(X_small_val)[:,1]
price_decigion = y_pred4 >= 0.5
price_decigion.astype(int)
round((y_val == price_decigion.astype(int)).mean(),2)

0.0

### Q6 Which of these alphas leads to the best RMSE on the validation set?

In [54]:
y_train = np.log1p(y_train)
y_val = np.log1p(y_val)

In [63]:
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge
from math import sqrt

In [71]:
alpha = [0, 0.01, 0.1, 1, 10]
answer = {}
for a in alpha:
    model = linear_model.Ridge(alpha=a, solver="sag", random_state=42)
    model.fit(X_train, y_train)
    y_pred_val_a = model.predict(X_val)
    rmse = sqrt(mean_squared_error(y_val, y_pred_val_a))
    answer[a] = round(rmse,3)

answer

{0: 0.524, 0.01: 0.524, 0.1: 0.524, 1: 0.524, 10: 0.524}