In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

import warnings
warnings.filterwarnings("ignore")

# EDA
--Load the data.

--Look at the median_house_value variable. Does it have a long tail?

## Loading the data.
Features

In [2]:
features = ['latitude',
'longitude',
'housing_median_age',
'total_rooms',
'total_bedrooms',
'population',
'households',
'median_income',
'median_house_value',
'ocean_proximity',]

In [3]:
df =pd.read_csv("housing.csv", usecols= features)
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


# Data preparation
- Select only the features from above and fill in the missing values with 0.
- Create a new column rooms_per_household by dividing the column total_rooms by the column households from dataframe.
- Create a new column bedrooms_per_room by dividing the column total_bedrooms by the column total_rooms from dataframe.
- Create a new column population_per_household by dividing the column population by the column households from dataframe.


In [4]:
df.isnull().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [5]:
df.fillna(0, inplace=True)


In [6]:
df.isnull().sum()


longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
ocean_proximity       0
dtype: int64

In [7]:
df["rooms_per_household"] = df['total_rooms'] / df['households']
df['bedrooms_per_room'] = df['total_bedrooms'] / df['total_rooms']
df['population_per_household'] = df['population'] / df['households']
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,rooms_per_household,bedrooms_per_room,population_per_household
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,6.984127,0.146591,2.555556
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,6.238137,0.155797,2.109842
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,8.288136,0.129516,2.80226
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY,5.817352,0.184458,2.547945
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY,6.281853,0.172096,2.181467


# Question 1
What is the most frequent observation (mode) for the column ocean_proximity?

In [8]:
df.ocean_proximity.mode()

0    <1H OCEAN
Name: ocean_proximity, dtype: object

## Split the data
- Split your data in train/val/test sets, with 60%/20%/20% distribution.
- Use Scikit-Learn for that (the train_test_split function) and set the seed to 42.
- Make sure that the target value (median_house_value) is not in your dataframe.

In [9]:
full_train, X_test = train_test_split(df, test_size=0.2, random_state=42)
X_train, X_val = train_test_split(full_train, test_size=0.25, random_state=42)


X_train.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)
X_val.reset_index(drop=True, inplace=True)


y_train = X_train.median_house_value.values
y_test = X_test.median_house_value.values
y_val = X_val.median_house_value.values


X_train.drop(['median_house_value'],  inplace=True, axis=1)
X_val.drop(['median_house_value'],  inplace=True, axis=1)
X_test.drop(['median_house_value'],  inplace=True, axis=1)

# Question 2
- Create the correlation matrix for the numerical features of your train dataset.
-- In a correlation matrix, you compute the correlation coefficient between every pair of features in the dataset.
- What are the two features that have the biggest correlation in this dataset?

In [10]:
full_train.dtypes

longitude                   float64
latitude                    float64
housing_median_age          float64
total_rooms                 float64
total_bedrooms              float64
population                  float64
households                  float64
median_income               float64
median_house_value          float64
ocean_proximity              object
rooms_per_household         float64
bedrooms_per_room           float64
population_per_household    float64
dtype: object

In [11]:
numerical = [
    'latitude',
    'longitude',
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
    'median_house_value',
    'rooms_per_household',
    'bedrooms_per_room',
    'population_per_household'
]

categorical = ['ocean_proximity']

In [12]:
full_train.corr()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,rooms_per_household,bedrooms_per_room,population_per_household
longitude,1.0,-0.924485,-0.101818,0.038676,0.063064,0.094276,0.049306,-0.01704,-0.046349,-0.029339,0.09728,-0.000598
latitude,-0.924485,1.0,0.005296,-0.029224,-0.059998,-0.102499,-0.064061,-0.076571,-0.142983,0.110695,-0.118938,0.005837
housing_median_age,-0.101818,0.005296,1.0,-0.360922,-0.320624,-0.292283,-0.302796,-0.121711,0.103706,-0.160892,0.135495,0.016245
total_rooms,0.038676,-0.029224,-0.360922,1.0,0.930489,0.857936,0.920482,0.198268,0.133989,0.13609,-0.189316,-0.024991
total_bedrooms,0.063064,-0.059998,-0.320624,0.930489,1.0,0.878932,0.980255,-0.009141,0.04798,-0.001659,0.084149,-0.028536
population,0.094276,-0.102499,-0.292283,0.857936,0.878932,1.0,0.907452,0.004122,-0.026032,-0.073733,0.035134,0.07233
households,0.049306,-0.064061,-0.302796,0.920482,0.980255,0.907452,1.0,0.012776,0.063714,-0.083062,0.064185,-0.027656
median_income,-0.01704,-0.076571,-0.121711,0.198268,-0.009141,0.004122,0.012776,1.0,0.690647,0.336013,-0.616669,0.022061
median_house_value,-0.046349,-0.142983,0.103706,0.133989,0.04798,-0.026032,0.063714,0.690647,1.0,0.158485,-0.257419,-0.02203
rooms_per_household,-0.029339,0.110695,-0.160892,0.13609,-0.001659,-0.073733,-0.083062,0.336013,0.158485,1.0,-0.435169,-0.004922


In [13]:
corr_mat = full_train.corr()
upper_corr_mat = corr_mat.where(np.triu(np.ones(corr_mat.shape), k=1).astype(np.bool))

unique_corr_pairs = upper_corr_mat.unstack().dropna()

sorted_mat = unique_corr_pairs.sort_values()
sorted_mat

latitude           longitude             -0.924485
bedrooms_per_room  median_income         -0.616669
                   rooms_per_household   -0.435169
total_rooms        housing_median_age    -0.360922
total_bedrooms     housing_median_age    -0.320624
                                            ...   
population         total_bedrooms         0.878932
households         population             0.907452
                   total_rooms            0.920482
total_bedrooms     total_rooms            0.930489
households         total_bedrooms         0.980255
Length: 66, dtype: float64

## Make median_house_value binary
- We need to turn the median_house_value variable from numeric into binary.
- Let's create a variable above_average which is 1 if the median_house_value is above its mean value and 0 otherwise.

In [14]:
y_train_num = y_train.copy()
y_val_num = y_val.copy()
y_test_num = y_test.copy()

y_train = (y_train > y_train.mean()).astype(int)
y_val = (y_val > y_val.mean()).astype(int)
y_test = (y_test > y_test.mean()).astype(int)

# Question 3
- Calculate the mutual information score between above_average and ocean_proximity . Use the training set only.
- Round it to 2 decimals using round(score, 2)
- What is their mutual information score?

In [15]:
round(mutual_info_score(y_train, X_train.ocean_proximity), 2)

0.1

# Question 4
- Now let's train a logistic regression
- Remember that we have one categorical variable ocean_proximity in the data. Include it using one-hot encoding.
- Fit the model on the training dataset.
-- To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
-- model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
- Calculate the accuracy on the validation dataset and round it to 2 decimal digits.

In [16]:
numerical = [
    'latitude',
    'longitude',
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
    'rooms_per_household',
    'bedrooms_per_room',
    'population_per_household'
]

In [17]:
train_dicts = X_train[categorical + numerical].to_dict(orient='records')

dv = DictVectorizer(sparse=False)
x_train = dv.fit_transform(train_dicts)

val_dicts = X_val[categorical + numerical].to_dict(orient='records')
x_val = dv.transform(val_dicts)

test_dicts = X_test[categorical + numerical].to_dict(orient='records')
x_test = dv.transform(test_dicts)

In [18]:
model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
model.fit(x_train, y_train)

LogisticRegression(max_iter=1000, random_state=42, solver='liblinear')

In [19]:
y_prod_val = model.predict(x_val)
acc = accuracy_score(y_val, y_prod_val)
round(acc, 2)

0.84

# Question 5
- Let's find the least useful feature using the feature elimination technique.
- Train a model with all these features (using the same parameters as in Q4).
- Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
- For each feature, calculate the difference between the original accuracy and the accuracy without the feature.
- Which of following feature has the smallest difference?
-- total_rooms
-- total_bedrooms
-- population
-- households

In [20]:
results = {}

for feature in X_train.columns:
    model_x = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
    tmp_train = X_train.drop(feature, axis=1)
    tmp_val = X_val.drop(feature, axis=1)
    
    dv = DictVectorizer(sparse=False)
    tmp_train = dv.fit_transform(tmp_train.to_dict(orient='records'))
    tmp_val =  dv.transform(tmp_val.to_dict(orient='records'))
    
    model_x.fit(tmp_train, y_train)
    results[f'model_wo_{feature}'] = accuracy_score(y_val, model_x.predict(tmp_val))
sorted(results.items(), key=lambda kv: kv[1])

[('model_wo_median_income', 0.7865794573643411),
 ('model_wo_ocean_proximity', 0.8204941860465116),
 ('model_wo_population', 0.8275193798449613),
 ('model_wo_housing_median_age', 0.8318798449612403),
 ('model_wo_longitude', 0.8326065891472868),
 ('model_wo_latitude', 0.8330910852713178),
 ('model_wo_households', 0.8347868217054264),
 ('model_wo_rooms_per_household', 0.8359980620155039),
 ('model_wo_population_per_household', 0.8364825581395349),
 ('model_wo_total_rooms', 0.8369670542635659),
 ('model_wo_bedrooms_per_room', 0.8369670542635659),
 ('model_wo_total_bedrooms', 0.8379360465116279)]

In [21]:
for key, value in results.items():
    print(f"Different between original & {key} = {acc - value}")

Different between original & model_wo_longitude = 0.004118217054263629
Different between original & model_wo_latitude = 0.00363372093023262
Different between original & model_wo_housing_median_age = 0.004844961240310086
Different between original & model_wo_total_rooms = -0.0002422480620154488
Different between original & model_wo_total_bedrooms = -0.001211240310077466
Different between original & model_wo_population = 0.009205426356589164
Different between original & model_wo_households = 0.0019379844961240345
Different between original & model_wo_median_income = 0.050145348837209336
Different between original & model_wo_ocean_proximity = 0.016230620155038844
Different between original & model_wo_rooms_per_household = 0.0007267441860465684
Different between original & model_wo_bedrooms_per_room = -0.0002422480620154488
Different between original & model_wo_population_per_household = 0.00024224806201555982


# Question 6
- For this question, we'll see how to use a linear regression model from Scikit-Learn
- We'll need to use the original column 'median_house_value'. Apply the logarithmic transformation to this column.
- Fit the Ridge regression model (model = Ridge(alpha=a, solver="sag", random_state=42)) on the training data.
- This model has a parameter alpha. Let's try the following values: [0, 0.01, 0.1, 1, 10]
- Which of these alphas leads to the best RMSE on the validation set? Round your RMSE scores to 3 decimal digits.
- If there are multiple options, select the smallest alpha.

In [23]:
results = {}
for a in [0, 0.01, 0.1, 1, 10]:
    model = Ridge(alpha=a, solver="sag", random_state=42)
    model.fit(x_train, y_train_num)
    
    results[f'alpha_{a}'] = round(mean_squared_error(y_val_num, model.predict(x_val), squared=False), 3)
sorted(results.items(), key=lambda kv: kv[1])

[('alpha_0', 106337.775),
 ('alpha_0.01', 106337.775),
 ('alpha_0.1', 106337.775),
 ('alpha_1', 106337.775),
 ('alpha_10', 106337.778)]