In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Read the Kaggle California housing CSV into a DataFrame, then show five
# randomly chosen rows as a quick sanity check of the columns and values.
housing_data = pd.read_csv(filepath_or_buffer='datasets/kaggle-california-housing.csv')
housing_data.sample(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
18769,-122.31,40.45,10.0,1187.0,236.0,728.0,248.0,2.0469,66800.0,INLAND
12862,-121.39,38.69,30.0,2897.0,506.0,1508.0,478.0,3.865,88400.0,INLAND
12119,-117.21,33.97,3.0,18356.0,2537.0,8437.0,2342.0,5.6409,197700.0,INLAND
15028,-117.01,32.79,31.0,3776.0,815.0,1886.0,799.0,3.4421,155300.0,<1H OCEAN
15225,-117.26,33.06,11.0,2660.0,352.0,1226.0,366.0,7.6832,319800.0,NEAR OCEAN


In [3]:
# Discard every row that contains at least one missing value. The result is
# reassigned rather than produced with inplace=True, so the step stays
# chainable and the previously loaded object is not mutated behind the scenes.
housing_data = housing_data.dropna()
# Display (rows, columns) so the number of rows that survived is visible.
housing_data.shape

(20433, 10)

In [4]:
# Prices in this dataset are capped at $500001, which skews the distribution,
# so every row sitting exactly at the cap is removed.
is_capped = housing_data['median_house_value'] == 500001
capped_labels = housing_data[is_capped].index
housing_data = housing_data.drop(index=capped_labels)

In [5]:
# Re-check the frame's (rows, columns) after the capped-price rows were dropped.
housing_data.shape

(19475, 10)

In [6]:
# List the distinct labels present in the categorical 'ocean_proximity' column.
housing_data['ocean_proximity'].unique()

array(['NEAR BAY', '<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'ISLAND'],
      dtype=object)

In [7]:
# Machine learning algorithms don't accept strings as input, so the categorical
# column is one-hot encoded: it is replaced by one indicator column per category.
categorical_columns = ['ocean_proximity']
housing_data = pd.get_dummies(data=housing_data, columns=categorical_columns)
# Display the new (rows, columns) to confirm the extra indicator columns.
housing_data.shape


(19475, 14)

In [8]:
 housing_data.sample(5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
16384,-121.28,38.02,8.0,1868.0,392.0,1258.0,389.0,3.175,95900.0,0,1,0,0,0
5623,-118.26,33.77,36.0,886.0,253.0,809.0,219.0,2.4545,164200.0,0,0,0,0,1
19888,-119.16,36.28,18.0,2377.0,414.0,1359.0,424.0,4.4,79300.0,0,1,0,0,0
3012,-118.95,34.83,18.0,3278.0,762.0,1338.0,550.0,2.9891,116500.0,0,1,0,0,0
4975,-118.28,34.0,44.0,2636.0,725.0,2182.0,651.0,1.432,124000.0,1,0,0,0,0


In [9]:
# Median of the house-value column; used later as the threshold that splits
# districts into "above median" and "not above median".
house_values = housing_data['median_house_value']
median = house_values.median()
median

173800.0

In [10]:
# Add a boolean column that is True for rows whose house value is strictly
# greater than the median, and False otherwise.
housing_data['above_median'] = housing_data['median_house_value'] > median

In [12]:
# Spot-check five random rows; the frame now carries the 'above_median' column.
housing_data.sample(5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN,above_median
12938,-121.27,38.66,19.0,1891.0,266.0,678.0,255.0,6.1872,188700.0,0,1,0,0,0,True
17277,-119.75,34.43,23.0,2982.0,837.0,1317.0,787.0,3.3776,283200.0,0,0,0,0,1,True
9733,-121.66,36.82,17.0,3921.0,654.0,1895.0,641.0,5.0092,238700.0,1,0,0,0,0,True
4278,-118.31,34.09,28.0,720.0,267.0,891.0,265.0,1.8977,100000.0,1,0,0,0,0,False
4304,-118.3,34.09,25.0,2345.0,852.0,2860.0,862.0,1.4497,205600.0,1,0,0,0,0,True


In [15]:
# Binary classification: the target 'above_median' is either True or False.
# 'median_house_value' is excluded from the features as well, because the
# target was computed directly from it.
target_column = 'above_median'
X = housing_data.drop(columns=['median_house_value', target_column])
Y = housing_data[target_column]

In [16]:
# Names of the feature columns left in X after the target-related columns were dropped.
X.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'ocean_proximity_<1H OCEAN', 'ocean_proximity_INLAND',
       'ocean_proximity_ISLAND', 'ocean_proximity_NEAR BAY',
       'ocean_proximity_NEAR OCEAN'],
      dtype='object')

In [18]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing. A fixed random_state pins the shuffle,
# so the split (and every score computed from it) is reproducible after a
# kernel restart instead of changing on each run.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [19]:
# Dimensions of the training and test feature matrices.
x_train.shape, x_test.shape

((15580, 13), (3895, 13))

In [20]:
# Dimensions of the matching training and test label vectors.
y_train.shape, y_test.shape

((15580,), (3895,))

In [23]:
# The liblinear solver is a good fit for binary classification on small datasets.
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='liblinear')
# fit() returns the fitted estimator itself, so logistic_model is the trained model.
logistic_model = classifier.fit(x_train, y_train)

In [24]:
# Mean accuracy of the fitted model on the data it was trained on.
train_accuracy = logistic_model.score(x_train, y_train)
print('Training Score: ', train_accuracy)

Training Score:  0.8192554557124518


In [25]:
# Predict the above-median label for every row of the held-out test features.
y_pred = logistic_model.predict(x_test)

In [27]:
# Put predictions next to the true labels for a side-by-side comparison.
# y_test is a Series, so the resulting frame is indexed by the original row labels.
comparison_columns = {'predicted': y_pred, 'actual': y_test}
df_pred_actual = pd.DataFrame(comparison_columns)
df_pred_actual.head(n=10)

Unnamed: 0,predicted,actual
15220,True,True
5811,False,True
12627,False,False
8757,True,True
2730,False,False
20049,False,False
14079,True,False
1514,True,True
11825,False,False
8048,True,True


In [28]:
from sklearn.metrics import accuracy_score
# Fraction of held-out rows whose predicted label matches the true label.
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Testing Score: ", test_accuracy)

Testing Score:  0.8238767650834403
