In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import mutual_info_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

##  Data preparation
### Features

For the rest of the homework, you'll need to use only these columns:

* `'latitude'`,
* `'longitude'`,
* `'housing_median_age'`,
* `'total_rooms'`,
* `'total_bedrooms'`,
* `'population'`,
* `'households'`,
* `'median_income'`,
* `'median_house_value'`,
* `'ocean_proximity'`,

### Data preparation

* Select only the features from above and fill in the missing values with 0.
* Create a new column `rooms_per_household` by dividing the column `total_rooms` by the column `households` from dataframe. 
* Create a new column `bedrooms_per_room` by dividing the column `total_bedrooms` by the column `total_rooms` from dataframe. 
* Create a new column `population_per_household` by dividing the column `population` by the column `households` from dataframe

In [2]:
df = pd.read_csv('housing.csv')

In [3]:
df.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity'],
      dtype='object')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [5]:
df.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,537.870553,1425.476744,499.53968,3.870671,206855.816909
std,2.003532,2.135952,12.585558,2181.615252,421.38507,1132.462122,382.329753,1.899822,115395.615874
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1447.75,296.0,787.0,280.0,2.5634,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


In [6]:
df.isnull().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [7]:
df['total_bedrooms'] = df.total_bedrooms.fillna(0)

## Question 1
What is the most frequent observation (mode) for the column `ocean_proximity`?

Options:
* `NEAR BAY`
* `<1H OCEAN`
* `INLAND`
* `NEAR OCEAN`


In [8]:
df.ocean_proximity.mode()

0    <1H OCEAN
Name: ocean_proximity, dtype: object

### Answer: the most frequent observation (mode) for the column ocean_proximity is `<1H OCEAN`

### Question 2

* Create the [correlation matrix](https://www.google.com/search?q=correlation+matrix) for the numerical features of your train dataset.
    - In a correlation matrix, you compute the correlation coefficient between every pair of features in the dataset.
* What are the two features that have the biggest correlation in this dataset?

Options:
* `total_bedrooms` and `households`
* `total_bedrooms` and `total_rooms`
* `population` and `households`
* `population_per_household` and `total_rooms`

In [9]:
df['rooms_per_household'] = df.total_rooms/df.households
df['bedrooms_per_room'] = df.total_bedrooms/df.total_rooms
df['population_per_household'] = df.population/df.households
df.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity', 'rooms_per_household',
       'bedrooms_per_room', 'population_per_household'],
      dtype='object')

In [10]:
df.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity', 'rooms_per_household',
       'bedrooms_per_room', 'population_per_household'],
      dtype='object')

In [11]:
numeric = ['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income','rooms_per_household',
       'bedrooms_per_room', 'population_per_household']
categorical = ['ocean_proximity']

In [12]:
df[numeric]

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms_per_household,bedrooms_per_room,population_per_household
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,6.984127,0.146591,2.555556
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,6.238137,0.155797,2.109842
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,8.288136,0.129516,2.802260
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,5.817352,0.184458,2.547945
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,6.281853,0.172096,2.181467
...,...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,5.045455,0.224625,2.560606
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,6.114035,0.215208,3.122807
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,5.205543,0.215173,2.325635
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,5.329513,0.219892,2.123209


In [13]:
corr_matrix = df[numeric].corr()
corr_matrix

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms_per_household,bedrooms_per_room,population_per_household
longitude,1.0,-0.924664,-0.108197,0.044568,0.068082,0.099773,0.05531,-0.015176,-0.02754,0.084836,0.002476
latitude,-0.924664,1.0,0.011173,-0.0361,-0.065318,-0.108785,-0.071035,-0.079809,0.106389,-0.104112,0.002366
housing_median_age,-0.108197,0.011173,1.0,-0.361262,-0.317063,-0.296244,-0.302916,-0.119034,-0.153277,0.125396,0.013191
total_rooms,0.044568,-0.0361,-0.361262,1.0,0.920196,0.857126,0.918484,0.19805,0.133798,-0.174583,-0.024581
total_bedrooms,0.068082,-0.065318,-0.317063,0.920196,1.0,0.866266,0.966507,-0.007295,0.002717,0.122205,-0.028019
population,0.099773,-0.108785,-0.296244,0.857126,0.866266,1.0,0.907222,0.004834,-0.072213,0.031397,0.069863
households,0.05531,-0.071035,-0.302916,0.918484,0.966507,0.907222,1.0,0.013033,-0.080598,0.059818,-0.027309
median_income,-0.015176,-0.079809,-0.119034,0.19805,-0.007295,0.004834,0.013033,1.0,0.326895,-0.573836,0.018766
rooms_per_household,-0.02754,0.106389,-0.153277,0.133798,0.002717,-0.072213,-0.080598,0.326895,1.0,-0.387465,-0.004852
bedrooms_per_room,0.084836,-0.104112,0.125396,-0.174583,0.122205,0.031397,0.059818,-0.573836,-0.387465,1.0,0.003047


In [14]:
corr_matrix.style.background_gradient(cmap='coolwarm').set_precision(2)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms_per_household,bedrooms_per_room,population_per_household
longitude,1.0,-0.92,-0.11,0.04,0.07,0.1,0.06,-0.02,-0.03,0.08,0.0
latitude,-0.92,1.0,0.01,-0.04,-0.07,-0.11,-0.07,-0.08,0.11,-0.1,0.0
housing_median_age,-0.11,0.01,1.0,-0.36,-0.32,-0.3,-0.3,-0.12,-0.15,0.13,0.01
total_rooms,0.04,-0.04,-0.36,1.0,0.92,0.86,0.92,0.2,0.13,-0.17,-0.02
total_bedrooms,0.07,-0.07,-0.32,0.92,1.0,0.87,0.97,-0.01,0.0,0.12,-0.03
population,0.1,-0.11,-0.3,0.86,0.87,1.0,0.91,0.0,-0.07,0.03,0.07
households,0.06,-0.07,-0.3,0.92,0.97,0.91,1.0,0.01,-0.08,0.06,-0.03
median_income,-0.02,-0.08,-0.12,0.2,-0.01,0.0,0.01,1.0,0.33,-0.57,0.02
rooms_per_household,-0.03,0.11,-0.15,0.13,0.0,-0.07,-0.08,0.33,1.0,-0.39,-0.0
bedrooms_per_room,0.08,-0.1,0.13,-0.17,0.12,0.03,0.06,-0.57,-0.39,1.0,0.0


In [15]:
sorted_matrix =(corr_matrix.unstack().sort_values(ascending=False))
round(sorted_matrix[:12],2)

longitude                 longitude                   1.00
latitude                  latitude                    1.00
bedrooms_per_room         bedrooms_per_room           1.00
rooms_per_household       rooms_per_household         1.00
median_income             median_income               1.00
households                households                  1.00
total_bedrooms            total_bedrooms              1.00
total_rooms               total_rooms                 1.00
housing_median_age        housing_median_age          1.00
population                population                  1.00
population_per_household  population_per_household    1.00
total_bedrooms            households                  0.97
dtype: float64

### Answer 2: Features with big correlation is  total bedrooms and households

### Make `median_house_value` binary

* We need to turn the `median_house_value` variable from numeric into binary.
* Let's create a variable `above_average` which is `1` if the `median_house_value` is above its mean value and `0` otherwise.

### Split the data

* Split your data in train/val/test sets, with 60%/20%/20% distribution.
* Use Scikit-Learn for that (the `train_test_split` function) and set the seed to 42.
* Make sure that the target value (`median_house_value`) is not in your dataframe.

### Question 3

* Calculate the mutual information score with the (binarized) price for the categorical variable that we have. Use the training set only.
* What is the value of mutual information?
* Round it to 2 decimal digits using `round(score, 2)`

Options:
- 0.26
- 0
- 0.10
- 0.16

In [16]:
mean = df.median_house_value.mean()
mean

206855.81690891474

In [17]:
df['above_average'] = (df.median_house_value >= mean).astype(int)
df['above_average']

0        1
1        1
2        1
3        1
4        1
        ..
20635    0
20636    0
20637    0
20638    0
20639    0
Name: above_average, Length: 20640, dtype: int32

In [18]:
df

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,rooms_per_household,bedrooms_per_room,population_per_household,above_average
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,6.984127,0.146591,2.555556,1
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,6.238137,0.155797,2.109842,1
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,8.288136,0.129516,2.802260,1
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY,5.817352,0.184458,2.547945,1
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY,6.281853,0.172096,2.181467,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND,5.045455,0.224625,2.560606,0
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND,6.114035,0.215208,3.122807,0
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND,5.205543,0.215173,2.325635,0
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND,5.329513,0.219892,2.123209,0


In [19]:
#Split the data
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

In [20]:
len(df_train), len(df_val), len(df_test)

(12384, 4128, 4128)

In [21]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [22]:
y_train = df_train.above_average.values
y_val = df_val.above_average.values
y_test = df_test.above_average.values

del df_train['median_house_value']
del df_val['median_house_value']
del df_test['median_house_value']

# del df_train['above_average']
# del df_val['above_average']
# del df_test['above_average']

In [23]:
round(mutual_info_score(df_train.above_average,df_train.ocean_proximity),2)

0.1

In [24]:
def mutual_info_MHV_score(series):
    return mutual_info_score(series, df_train.above_average)

In [25]:
mutual_info = df_train[categorical]
round(mutual_info.apply(mutual_info_MHV_score),2)

ocean_proximity    0.1
dtype: float64

### Answer: Value of mutual information of categorial values is `0.1`

### Question 4

* Now let's train a logistic regression
* Remember that we have one categorical variable `ocean_proximity` in the data. Include it using one-hot encoding.
* Fit the model on the training dataset.
    - To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
    - `model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)`
* Calculate the accuracy on the validation dataset and round it to 2 decimal digits.

Options:
- 0.60
- 0.72
- 0.84
- 0.95

In [26]:
dv = DictVectorizer(sparse=False)

train_dict = df_train[categorical + numeric].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[categorical + numeric].to_dict(orient='records')
X_val = dv.transform(val_dict)

In [27]:
model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

LogisticRegression(max_iter=1000, random_state=42, solver='liblinear')

In [28]:
model.intercept_[0]

-0.24225915731145212

In [29]:
model.coef_[0].round(3)

array([ 0.835,  0.006,  0.037,  0.13 ,  0.093,  1.226,  0.446, -1.872,
        0.166,  0.168,  0.85 , -0.002,  0.004,  0.015,  0.   , -0.   ])

In [30]:
y_pred = model.predict_proba(X_val)[:, 1]

In [31]:
above_avg_price = (y_pred >= 0.5)

In [32]:
original_accuracy_c = (y_val == above_avg_price).mean()
original_accuracy = round((y_val == above_avg_price).mean(),2)
original_accuracy

0.82

### Answer: The accuracy on the validation dataset is 0.82 --> `0.84`

### Question 5 

* Let's find the least useful feature using the *feature elimination* technique.
* Train a model with all these features (using the same parameters as in Q4).
* Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
* For each feature, calculate the difference between the original accuracy and the accuracy without the feature. 
* Which of following feature has the smallest difference? 
   * `total_rooms`
   * `total_bedrooms` 
   * `population`
   * `households`

> **note**: the difference doesn't have to be positive

In [33]:
small_features = ['total_rooms','total_bedrooms','population','households']
for feature in small_features:
    df_train_new = df_train[categorical + numeric].drop(feature,1)
    train_dict_new = df_train_new.to_dict(orient='records')
    X_train_new = dv.fit_transform(train_dict_new)

    df_val_new = df_val[categorical + numeric].drop(feature, axis = 1)
    val_dict_new= df_val_new.to_dict(orient='records')
    X_val_new = dv.transform(val_dict_new)
    model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
    model.fit(X_train_new, y_train)
    y_pred = model.predict_proba(X_val_new)[:, 1]
    abv_avg_price = (y_pred >= 0.5)
    new_model_accuracy = (y_val == abv_avg_price).mean()
    print(f'Feature:{feature} has {round(abs((original_accuracy_c-new_model_accuracy)),5)} difference from original accuracy, with new accuracy of {round(new_model_accuracy,4)}')

Feature:total_rooms has 0.00097 difference from original accuracy, with new accuracy of 0.8241
Feature:total_bedrooms has 0.00048 difference from original accuracy, with new accuracy of 0.8227
Feature:population has 0.00921 difference from original accuracy, with new accuracy of 0.814
Feature:households has 0.00484 difference from original accuracy, with new accuracy of 0.8183


### Answer : `total_bedrooms ` feature has the smallest difference.

### Question 6

* For this question, we'll see how to use a linear regression model from Scikit-Learn
* We'll need to use the original column `'median_house_value'`. Apply the logarithmic transformation to this column.
* Fit the Ridge regression model (`model = Ridge(alpha=a, solver="sag", random_state=42)`) on the training data.
* This model has a parameter `alpha`. Let's try the following values: `[0, 0.01, 0.1, 1, 10]`
* Which of these alphas leads to the best RMSE on the validation set? Round your RMSE scores to 3 decimal digits.

If there are multiple options, select the smallest `alpha`.

Options:
- 0
- 0.01
- 0.1
- 1
- 10

In [34]:
y_train_log = np.log1p(y_train)
y_val_log = np.log1p(y_val)
y_test_log = np.log1p(y_test)

In [35]:
alpha = [0, 0.01, 0.1, 1, 10]
for a in alpha:
    r_model = Ridge(alpha=a, solver="sag", random_state=42)
    r_model.fit(X_train, y_train_log)
    r_model_predictions = r_model.predict(X_val)
    
    rmse = np.sqrt(mean_squared_error(y_val_log, r_model_predictions))
    print(f'RMSE value with alpha {a}: {round(rmse, 3)}')

RMSE value with alpha 0: 0.322
RMSE value with alpha 0.01: 0.322
RMSE value with alpha 0.1: 0.322
RMSE value with alpha 1: 0.322
RMSE value with alpha 10: 0.322


### Answer: RMSE value is same for all alphas, So smallest alpha is `0`