In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

In [2]:
df = pd.read_csv('housing.csv')[['latitude',
                                'longitude',
                                'housing_median_age',
                                'total_rooms',
                                'total_bedrooms',
                                'population',
                                'households',
                                'median_income',
                                'median_house_value',
                                'ocean_proximity']]

In [3]:
df = df.fillna(0)

In [4]:
df['rooms_per_household'] = df['total_rooms']/df['households']
df['bedrooms_per_room'] = df['total_bedrooms']/df['total_rooms']
df['population_per_household'] = df['population']/df['households']

In [5]:
df['ocean_proximity'].value_counts().head(1)

<1H OCEAN    9136
Name: ocean_proximity, dtype: int64

In [6]:
df[['total_bedrooms', 'households', 'total_rooms', 'population', 'population_per_household']].corr()

Unnamed: 0,total_bedrooms,households,total_rooms,population,population_per_household
total_bedrooms,1.0,0.966507,0.920196,0.866266,-0.028019
households,0.966507,1.0,0.918484,0.907222,-0.027309
total_rooms,0.920196,0.918484,1.0,0.857126,-0.024581
population,0.866266,0.907222,0.857126,1.0,0.069863
population_per_household,-0.028019,-0.027309,-0.024581,0.069863,1.0


In [7]:
df['above_average'] = df['median_house_value'] > df['median_house_value'].mean()
df['above_average'] = df['above_average'].astype(int)

In [8]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

In [9]:
df_test.shape[0], df_train.shape[0], df_val.shape[0]

(4128, 12384, 4128)

In [10]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train = df_train.median_house_value.values
y_val = df_val.median_house_value.values
y_test = df_test.median_house_value.values

del df_train['median_house_value']
del df_val['median_house_value']
del df_test['median_house_value']

In [11]:
round(mutual_info_score(df_train.ocean_proximity, df_train.above_average), 2)

0.11

In [12]:
categorical = ['ocean_proximity']
numerical = ['latitude', 'longitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'rooms_per_household', 'bedrooms_per_room',
       'population_per_household', 'above_average']

In [13]:
dv = DictVectorizer(sparse=False)

train_dict = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)


In [14]:
model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

LogisticRegression(max_iter=1000, random_state=42, solver='liblinear')

In [25]:
df_pred = pd.DataFrame()
df_pred['probability'] = y_pred
df_pred['prediction'] = median_house_value_decision.astype(int)
df_pred['actual'] = y_val
df_pred['correct'] = df_pred.prediction == df_pred.actual
df_pred.correct.mean()

0.0