In [1]:
!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv

--2022-09-26 23:02:49--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1423529 (1.4M) [text/plain]
Saving to: ‘housing.csv’


2022-09-26 23:02:50 (5.48 MB/s) - ‘housing.csv’ saved [1423529/1423529]



In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score, accuracy_score
from sklearn.linear_model import LogisticRegression

In [2]:
data = pd.read_csv("housing.csv")

In [3]:
features = ['latitude',
'longitude',
'housing_median_age',
'total_rooms',
'total_bedrooms',
'population',
'households',
'median_income',
'median_house_value',
'ocean_proximity']

In [4]:
data[features].isnull().sum()

latitude                0
longitude               0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [5]:
# Add 0's for missing data
data_subset = data[features].copy()
data_subset = data_subset.fillna(0)
data_subset['rooms_per_household'] = data_subset['total_rooms']/data_subset['households']
data_subset['bedrooms_per_room'] = data_subset['total_bedrooms']/data_subset['total_rooms']
data_subset['population_per_household'] = data_subset['population']/data_subset['households']

In [6]:
data_subset.head()

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,rooms_per_household,bedrooms_per_room,population_per_household
0,37.88,-122.23,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,6.984127,0.146591,2.555556
1,37.86,-122.22,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,6.238137,0.155797,2.109842
2,37.85,-122.24,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,8.288136,0.129516,2.80226
3,37.85,-122.25,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY,5.817352,0.184458,2.547945
4,37.85,-122.25,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY,6.281853,0.172096,2.181467


In [7]:
#Q1
data_subset['ocean_proximity'].mode()

0    <1H OCEAN
Name: ocean_proximity, dtype: object

In [8]:
#Q2
num_cols = [cols for cols in data_subset if data_subset[cols].dtype != 'object' and cols not in ['latitude', 'longitude']]
data_subset[num_cols].corr()
# total_bedrooms and households

Unnamed: 0,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,rooms_per_household,bedrooms_per_room,population_per_household
housing_median_age,1.0,-0.361262,-0.317063,-0.296244,-0.302916,-0.119034,0.105623,-0.153277,0.125396,0.013191
total_rooms,-0.361262,1.0,0.920196,0.857126,0.918484,0.19805,0.134153,0.133798,-0.174583,-0.024581
total_bedrooms,-0.317063,0.920196,1.0,0.866266,0.966507,-0.007295,0.049148,0.002717,0.122205,-0.028019
population,-0.296244,0.857126,0.866266,1.0,0.907222,0.004834,-0.02465,-0.072213,0.031397,0.069863
households,-0.302916,0.918484,0.966507,0.907222,1.0,0.013033,0.065843,-0.080598,0.059818,-0.027309
median_income,-0.119034,0.19805,-0.007295,0.004834,0.013033,1.0,0.688075,0.326895,-0.573836,0.018766
median_house_value,0.105623,0.134153,0.049148,-0.02465,0.065843,0.688075,1.0,0.151948,-0.238759,-0.023737
rooms_per_household,-0.153277,0.133798,0.002717,-0.072213,-0.080598,0.326895,0.151948,1.0,-0.387465,-0.004852
bedrooms_per_room,0.125396,-0.174583,0.122205,0.031397,0.059818,-0.573836,-0.238759,-0.387465,1.0,0.003047
population_per_household,0.013191,-0.024581,-0.028019,0.069863,-0.027309,0.018766,-0.023737,-0.004852,0.003047,1.0


In [9]:
#Q3 
average_price = data_subset['median_house_value'].mean()
data_subset['above_average'] = (data_subset['median_house_value'] > average_price).astype(int)

In [10]:
# Splitting the dataset
train_subset, val = train_test_split(data_subset, test_size=0.2, random_state=42)
train, test = train_test_split(train_subset, test_size=0.25, random_state=42)

In [11]:
# Dropping cols
drop_cols = ['median_house_value']
train = train.drop(drop_cols, axis=1)
val = val.drop(drop_cols, axis=1)
test = test.drop(drop_cols, axis=1)

In [12]:
# Printing shape
train.shape, val.shape, test.shape

((12384, 13), (4128, 13), (4128, 13))

In [13]:
# Mutual info score
score = mutual_info_score( train['above_average'], train['ocean_proximity'])
print(round(score,2))

0.1


In [14]:
# Q4
from sklearn.feature_extraction import DictVectorizer

train_dict = train.drop(['above_average'], axis=1).to_dict(orient='records')
val_dict = val.drop(['above_average'], axis=1).to_dict(orient='records')
test_dict = test.drop(['above_average'], axis=1).to_dict(orient='records')

y_train = train['above_average'].values
y_val = val['above_average'].values
y_test = test['above_average'].values

dv = DictVectorizer()
X_train = dv.fit_transform(train_dict)

In [15]:
model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

X_val = dv.transform(val_dict)
val_pred = model.predict(X_val)
val_accuracy = accuracy_score(y_val, val_pred)
print(round(val_accuracy,2))


0.83


In [19]:
# Q5 Elimination
for feature in ['total_rooms','total_bedrooms','population','households']:
    print(f"Dropping feature {feature}")
    train_dict = train.drop(['above_average', feature], axis=1).to_dict(orient='records')
    val_dict = val.drop(['above_average', feature], axis=1).to_dict(orient='records')
    test_dict = test.drop(['above_average', feature], axis=1).to_dict(orient='records')

    y_train = train['above_average'].values
    y_val = val['above_average'].values
    y_test = test['above_average'].values

    dv = DictVectorizer()
    X_train = dv.fit_transform(train_dict)

    model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)

    X_val = dv.transform(val_dict)
    new_val_pred = model.predict(X_val)
    new_val_accuracy = accuracy_score(y_val, new_val_pred)

    diff = round((val_accuracy - new_val_accuracy),4)

    print(f"diff is {diff} for dropping feature {feature}")

Dropping feature total_rooms
diff is -0.0017 for dropping feature total_rooms
Dropping feature total_bedrooms
diff is -0.0007 for dropping feature total_bedrooms
Dropping feature population
diff is 0.0126 for dropping feature population
Dropping feature households
diff is 0.0044 for dropping feature households


In [20]:
# Q6 
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge

In [21]:
train_subset, val = train_test_split(data_subset, test_size=0.2, random_state=42)
train, test = train_test_split(train_subset, test_size=0.25, random_state=42)
drop_cols = ['above_average']
train = train.drop(drop_cols, axis=1)
val = val.drop(drop_cols, axis=1)
test = test.drop(drop_cols, axis=1)

In [28]:
scores = []
for a in [0, 0.01, 0.1, 1, 10]:
    print(f"Using alpha {a}")
    train_dict = train.drop(['median_house_value'], axis=1).to_dict(orient='records')
    val_dict = val.drop(['median_house_value'], axis=1).to_dict(orient='records')
    test_dict = test.drop(['median_house_value'], axis=1).to_dict(orient='records')

    y_train = np.log1p(train['median_house_value'].values)
    y_val = np.log1p(val['median_house_value'].values)
    y_test = np.log1p(test['median_house_value'].values)

    dv = DictVectorizer()
    X_train = dv.fit_transform(train_dict)

    model = Ridge(alpha=a, solver="sag", random_state=42)
    model.fit(X_train, y_train)

    X_val = dv.transform(val_dict)
    val_pred = model.predict(X_val)
    val_rmse = mean_squared_error(y_val, val_pred, squared=True)

    scores.append(round(val_rmse,3))

    print(f"val rmse is {val_rmse} for alpha {a}")

Using alpha 0




val rmse is 0.3218589497076045 for alpha 0
Using alpha 0.01




val rmse is 0.3218589497016842 for alpha 0.01
Using alpha 0.1




val rmse is 0.3218589496528327 for alpha 0.1
Using alpha 1




val rmse is 0.32185894915839136 for alpha 1
Using alpha 10




val rmse is 0.32185894421102396 for alpha 10


In [29]:
np.array(scores).min()

0.322

In [30]:
min_index = np.argmin(np.array(scores))


In [31]:
min_index

0