In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the housing table from its CSV export (path is relative to the
# notebook's working directory).
csv_path = '../../data/csv/housing.csv'
housing_data = pd.read_csv(csv_path)

# Preview five randomly chosen rows; the selection differs on every run
# because no seed is passed.
housing_data.sample(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
9882,-121.79,36.68,22.0,6912.0,1513.0,3794.0,1455.0,3.0608,168300.0,<1H OCEAN
15665,-122.43,37.81,52.0,4309.0,942.0,1297.0,798.0,4.6781,500001.0,NEAR BAY
7073,-118.02,33.95,35.0,2085.0,400.0,1112.0,391.0,3.4886,173900.0,<1H OCEAN
8682,-118.34,33.87,28.0,4605.0,1188.0,2558.0,1093.0,3.6988,266600.0,<1H OCEAN
11557,-117.98,33.73,22.0,4232.0,624.0,2408.0,660.0,6.6539,284900.0,<1H OCEAN


In [4]:
# Discard every row that has at least one missing value. The arguments are
# pandas' defaults, written out so the policy is explicit.
housing_data = housing_data.dropna(axis=0, how='any')

In [5]:
# (rows, columns) remaining after rows with missing values were removed.
housing_data.shape

(20433, 10)

In [14]:
# Count, per column, the rows whose house value is exactly 500001.
# NOTE(review): 500001 looks like a ceiling/cap marker in this dataset rather
# than a real price; confirm against the data source.
is_capped = housing_data['median_house_value'] == 500001
housing_data[is_capped].count()

longitude             958
latitude              958
housing_median_age    958
total_rooms           958
total_bedrooms        958
population            958
households            958
median_income         958
median_house_value    958
ocean_proximity       958
dtype: int64

In [15]:
# Remove the rows whose house value equals 500001 by keeping everything else.
# Equivalent to dropping those rows by index label, since the index coming
# from read_csv/dropna is unique.
keep_rows = housing_data['median_house_value'] != 500001
housing_data = housing_data[keep_rows]

In [16]:
# (rows, columns) after the 500001-valued rows were removed.
housing_data.shape

(19475, 10)

In [17]:
# Distinct labels of the categorical column, inspected before it is
# one-hot encoded.
housing_data['ocean_proximity'].unique()

array(['NEAR BAY', '<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'ISLAND'],
      dtype=object)

In [18]:
# One-hot encode the categorical column: 'ocean_proximity' is replaced by one
# indicator column per category, named 'ocean_proximity_<label>'.
categorical_columns = ['ocean_proximity']
housing_data = pd.get_dummies(housing_data, columns=categorical_columns)

In [19]:
# (rows, columns) after one-hot encoding; the column count grows because the
# single categorical column became one indicator column per category.
housing_data.shape

(19475, 14)

In [20]:
# Spot-check five random rows to confirm the indicator columns are present.
housing_data.sample(5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
20498,-118.7,34.29,25.0,1678.0,252.0,862.0,268.0,6.1834,229800.0,1,0,0,0,0
7448,-118.18,33.95,39.0,2121.0,579.0,1991.0,528.0,2.9094,152200.0,1,0,0,0,0
12671,-121.42,38.49,17.0,13180.0,2444.0,7235.0,2335.0,3.363,103000.0,0,1,0,0,0
4585,-118.27,34.06,26.0,513.0,338.0,1204.0,321.0,1.4904,275000.0,1,0,0,0,0
18766,-122.26,40.58,14.0,2539.0,466.0,1271.0,438.0,3.9762,138500.0,0,1,0,0,0


In [21]:
# Median of the house-value column; used as the threshold that turns the
# continuous target into a binary label.
house_values = housing_data['median_house_value']
median = house_values.median()
median

173800.0

In [22]:
# Binary target: True when the house value is strictly greater than the
# median. Rows exactly at the median are labelled False.
house_values = housing_data['median_house_value']
housing_data['above_median'] = house_values > median

In [23]:
# Spot-check five random rows to confirm the new 'above_median' column.
housing_data.sample(5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN,above_median
18237,-122.11,37.4,16.0,1994.0,489.0,1173.0,472.0,4.1875,266400.0,0,0,0,1,0,True
11370,-117.99,33.71,19.0,1967.0,487.0,1251.0,404.0,3.6696,218800.0,1,0,0,0,0,True
6111,-117.9,34.13,25.0,3076.0,856.0,2868.0,752.0,2.6619,117600.0,1,0,0,0,0,False
5772,-118.27,34.15,25.0,3018.0,806.0,2205.0,742.0,3.0199,220200.0,1,0,0,0,0,True
11099,-117.88,33.82,17.0,2247.0,705.0,1382.0,618.0,3.8631,225000.0,1,0,0,0,0,True


In [24]:
# Feature matrix: every column except the label and the raw house value.
# The raw value is excluded because the label is derived directly from it.
target_related = ['median_house_value', 'above_median']
X = housing_data.drop(columns=target_related)

# Target vector: the boolean above-median label.
y = housing_data['above_median']

In [25]:
# Confirm which columns are used as model features.
X.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'ocean_proximity_<1H OCEAN', 'ocean_proximity_INLAND',
       'ocean_proximity_ISLAND', 'ocean_proximity_NEAR BAY',
       'ocean_proximity_NEAR OCEAN'],
      dtype='object')

In [26]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. A fixed random_state makes the
# shuffle, and therefore every score computed below, reproducible under
# Restart Kernel -> Run All. Without it each run evaluates a different split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [27]:
# Shapes of the training and test feature matrices.
X_train.shape, X_test.shape

((15580, 13), (3895, 13))

In [28]:
# Shapes of the training and test target vectors; the lengths should match
# the row counts of the corresponding feature matrices.
y_train.shape, y_test.shape

((15580,), (3895,))

In [31]:
from sklearn.linear_model import LogisticRegression

# Build the classifier first, then fit it on the training split. fit()
# returns the estimator itself, so re-binding keeps the cell output-free.
# NOTE(review): the features are passed in unscaled; consider standardising
# them before fitting. Confirm no scaling happens elsewhere.
logistic_model = LogisticRegression(solver='liblinear')
logistic_model = logistic_model.fit(X_train, y_train)

In [32]:
# Mean accuracy of the fitted model on the data it was trained on.
training_score = logistic_model.score(X_train, y_train)
print('Training_score : ', training_score)

Training_score :  0.8206033376123235


In [33]:
# Predicted labels for the held-out test features.
y_pred = logistic_model.predict(X_test)

In [34]:
# Side-by-side table of predicted and true labels. The frame takes its index
# from y_test, so each row keeps its original row label.
comparison_columns = {'predicted': y_pred, 'actual': y_test}
df_pred_actual = pd.DataFrame(comparison_columns)
df_pred_actual.head(n=10)

Unnamed: 0,predicted,actual
15841,True,True
5065,False,False
9502,False,False
16234,False,False
4944,False,False
15409,True,False
19387,False,False
5520,True,True
14801,False,False
12926,False,False


In [35]:
from sklearn.metrics import accuracy_score

# Fraction of held-out rows whose predicted label matches the true label.
testing_score = accuracy_score(y_test, y_pred)
print('Testing_score : ', testing_score)

Testing_score :  0.8218228498074455
