In [1]:
#imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

#### data link: https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv

In [2]:
df_housing = pd.read_csv("housing.csv")
print(len(df_housing))
df_housing.head()

20640


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


### Data preparation
Select only the features from above and fill in the missing values with 0.
Create a new column rooms_per_household by dividing the column total_rooms by the column households from dataframe.
Create a new column bedrooms_per_room by dividing the column total_bedrooms by the column total_rooms from dataframe.
Create a new column population_per_household by dividing the column population by the column households from dataframe.

In [3]:
df_housing.dtypes

longitude             float64
latitude              float64
housing_median_age    float64
total_rooms           float64
total_bedrooms        float64
population            float64
households            float64
median_income         float64
median_house_value    float64
ocean_proximity        object
dtype: object

In [4]:
features = ['latitude','longitude','housing_median_age','total_rooms','total_bedrooms','population','households',
            'median_income','median_house_value','ocean_proximity']
df_housing = df_housing[features]

In [5]:
df_housing.isnull().sum()

latitude                0
longitude               0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [6]:
df_housing['total_bedrooms'] = df_housing['total_bedrooms'].fillna(0)

In [7]:
df_housing.isnull().sum()

latitude              0
longitude             0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
ocean_proximity       0
dtype: int64

In [14]:
#Create a new column rooms_per_household by dividing the column total_rooms by the column households from dataframe.
df_housing['rooms_per_household'] = df_housing['total_rooms'] / df_housing['households']

#Create a new column bedrooms_per_room by dividing the column total_bedrooms by the column total_rooms from dataframe.
df_housing['bedrooms_per_room'] = df_housing['total_bedrooms'] / df_housing['total_rooms']

#Create a new column population_per_household by dividing the column population by the column households from dataframe.
df_housing['population_per_household'] = df_housing['population'] / df_housing['households']
df_housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,rooms_per_household,bedrooms_per_room,population_per_household
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,6.984127,0.146591,2.555556
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,6.238137,0.155797,2.109842
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,8.288136,0.129516,2.80226
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY,5.817352,0.184458,2.547945
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY,6.281853,0.172096,2.181467


### Question 1
What is the most frequent observation (mode) for the column ocean_proximity?

In [9]:
df_housing.ocean_proximity.value_counts()

<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: ocean_proximity, dtype: int64

### Split the data
Split your data in train/val/test sets, with 60%/20%/20% distribution.
Use Scikit-Learn for that (the train_test_split function) and set the seed to 42.
Make sure that the target value (median_house_value) is not in your dataframe.

In [5]:
from sklearn.model_selection import train_test_split

In [11]:
df_full_train, df_test = train_test_split(df_housing, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)
len(df_train), len(df_val), len(df_test)

(12384, 4128, 4128)

In [12]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [13]:
y_train = df_train.median_house_value.values
y_val = df_val.median_house_value.values
y_test = df_test.median_house_value.values

del df_train['median_house_value']
del df_val['median_house_value']
del df_test['median_house_value']

### Question 2: Create the correlation matrix for the numerical features of your train dataset.
In a correlation matrix, you compute the correlation coefficient between every pair of features in the dataset.
What are the two features that have the biggest correlation in this dataset?
Options:
total_bedrooms and households
total_bedrooms and total_rooms
population and households
population_per_household and total_rooms

In [14]:
df_full_train.dtypes

latitude                    float64
longitude                   float64
housing_median_age          float64
total_rooms                 float64
total_bedrooms              float64
population                  float64
households                  float64
median_income               float64
median_house_value          float64
ocean_proximity              object
rooms_per_household         float64
bedrooms_per_room           float64
population_per_household    float64
dtype: object

In [12]:
categorical = ['ocean_proximity']
numerical = ['latitude','longitude','housing_median_age','total_rooms','total_bedrooms','population','households',
             'median_income','rooms_per_household','bedrooms_per_room','population_per_household']

In [61]:
for i in numerical:
    print(f'{i} : ')
    print(df_full_train[numerical].corrwith(df_full_train[i]))
    print()

latitude : 
latitude                    1.000000
longitude                  -0.924485
housing_median_age          0.005296
total_rooms                -0.029224
total_bedrooms             -0.059998
population                 -0.102499
households                 -0.064061
median_income              -0.076571
rooms_per_household         0.110695
bedrooms_per_room          -0.118938
population_per_household    0.005837
dtype: float64

longitude : 
latitude                   -0.924485
longitude                   1.000000
housing_median_age         -0.101818
total_rooms                 0.038676
total_bedrooms              0.063064
population                  0.094276
households                  0.049306
median_income              -0.017040
rooms_per_household        -0.029339
bedrooms_per_room           0.097280
population_per_household   -0.000598
dtype: float64

housing_median_age : 
latitude                    0.005296
longitude                  -0.101818
housing_median_age          1.000

### Make median_house_value binary
We need to turn the median_house_value variable from numeric into binary.
Let's create a variable above_average which is 1 if the median_house_value is above its mean value and 0 otherwise.

In [51]:
above_average = [1 if i >= df_full_train.median_house_value.mean() else 0 for i in df_full_train.median_house_value.values]
above_average[:10]

[0, 1, 0, 0, 0, 1, 0, 1, 0, 1]

### Question 3
Calculate the mutual information score with the (binarized) price for the categorical variable that we have. Use the training set only.
What is the value of mutual information?
Round it to 2 decimal digits using round(score, 2)

In [52]:
from sklearn.metrics import mutual_info_score

In [56]:
score = mutual_info_score(df_full_train.ocean_proximity , above_average).round(2)
score

0.1

### Question 4
Now let's train a logistic regression
Remember that we have one categorical variable ocean_proximity in the data. Include it using one-hot encoding.
Fit the model on the training dataset.
To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
Calculate the accuracy on the validation dataset and round it to 2 decimal digits.

In [10]:
#Use Scikit-Learn to encode categorical features
from sklearn.feature_extraction import DictVectorizer

In [62]:
dv = DictVectorizer(sparse=False)

train_dict = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)

In [63]:
#Logistic regression
from sklearn.linear_model import LogisticRegression


In [64]:
model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
# solver='lbfgs' is the default solver in newer version of sklearn
# for older versions, you need to specify it explicitly
model.fit(X_train, y_train)


In [65]:
y_pred = model.predict_proba(X_val)[:, 1]

In [66]:
decision = (y_pred >= 0.5)

In [74]:
y_vals = [1 if i >= y_val.mean() else 0 for i in y_val]
y_vals[:10], decision[:10]

([0, 0, 1, 1, 1, 0, 0, 0, 0, 0],
 array([False, False, False, False, False, False, False, False, False,
        False]))

In [75]:
(y_vals == decision).mean().round(2)

0.6

### Question 5 : Let's find the least useful feature using the feature elimination technique.
Train a model with all these features (using the same parameters as in Q4).
Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
For each feature, calculate the difference between the original accuracy and the accuracy without the feature.
Which of following feature has the smallest difference?
total_rooms
total_bedrooms
population
households 

In [76]:
dicts_full_train = df_full_train[categorical + numerical].to_dict(orient='records')

In [77]:
dv = DictVectorizer(sparse=False)
X_full_train = dv.fit_transform(dicts_full_train)

In [79]:
y_full_train = df_full_train.median_house_value.values

In [104]:
model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
# solver='lbfgs' is the default solver in newer version of sklearn
# for older versions, you need to specify it explicitly
model.fit(X_full_train, y_full_train)

In [105]:
test_dict = df_test[categorical + numerical].to_dict(orient='records')
X_test = dv.transform(test_dict)
    
y_pred = model.predict_proba(X_test)[:, 1]
decision = (y_test >= 0.5)
y_vals = [1 if i >= y_test.mean() else 0 for i in y_test]
(y_vals == decision).mean()

0.40261627906976744

In [106]:
def reg_without_feature(data,feature_list):
    results = {}
    for i in features:
        features_list = feature_list.copy()
        features_list.remove(i)
        
        df = data.copy()
        df['total_bedrooms'] = df['total_bedrooms'].fillna(0)
        
        df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
        df_full_train = df_full_train.reset_index(drop=True)
        df_test = df_test.reset_index(drop=True)
        
        y_full_train = df_full_train.median_house_value.values
        y_test = df_test.median_house_value.values

        del df_full_train['median_house_value']
        del df_test['median_house_value']
        
        dicts_full_train = df_full_train[features_list].to_dict(orient='records')
        dv = DictVectorizer(sparse=False)

        test_dict = df_test[features_list].to_dict(orient='records')
        X_full_train = dv.fit_transform(dicts_full_train)
        X_test = dv.transform(test_dict)
        
        model.fit(X_full_train, y_full_train)
        y_pred = model.predict_proba(X_test)[:, 1]
        decision = (y_test >= 0.5)
        y_vals = [1 if i >= y_test.mean() else 0 for i in y_test]
        results[i] = (y_vals == decision).mean()
        
    return results    
        
        

In [None]:
features = ['latitude','longitude','housing_median_age','total_rooms','total_bedrooms','population','households',
            'median_income','ocean_proximity']
df_housing = pd.read_csv('housing.csv')
scores = reg_without_feature(df_housing,features)

In [103]:
scores

{'latitude': 0.4,
 'longitude': 0.4,
 'housing_median_age': 0.4,
 'total_rooms': 0.4,
 'total_bedrooms': 0.4,
 'population': 0.4,
 'households': 0.4,
 'median_income': 0.4,
 'ocean_proximity': 0.4}

### Question 6
For this question, we'll see how to use a linear regression model from Scikit-Learn
We'll need to use the original column 'median_house_value'. Apply the logarithmic transformation to this column.
Fit the Ridge regression model (model = Ridge(alpha=a, solver="sag", random_state=42)) on the training data.
This model has a parameter alpha. Let's try the following values: [0, 0.01, 0.1, 1, 10]
Which of these alphas leads to the best RMSE on the validation set? Round your RMSE scores to 3 decimal digits.
If there are multiple options, select the smallest alpha.

In [3]:
from sklearn.linear_model import Ridge

In [16]:
df_full_train, df_test = train_test_split(df_housing, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)
len(df_train), len(df_val), len(df_test)

(12384, 4128, 4128)

In [17]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [18]:
y_train = np.log1p(df_train.median_house_value.values)
y_val = np.log1p(df_val.median_house_value.values)
y_test = np.log1p(df_test.median_house_value.values)

del df_train['median_house_value']
del df_val['median_house_value']
del df_test['median_house_value']

In [19]:
dv = DictVectorizer(sparse=False)

train_dict = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)

In [23]:
def rmse(y, y_pred):
    se = (y - y_pred) ** 2
    mse = se.mean()
    score = np.sqrt(mse)
    return score

In [24]:
rmse_vals = []
alpha = [0, 0.01, 0.1, 1, 10]
for a in alpha:
    model = Ridge(alpha=a, solver="sag", random_state=42)
    model.fit(X_train,y_train)

    y_pred = model.predict(X_val)
    rmse_vals.append(rmse(y_val, y_pred))


In [25]:
rmse_vals

[0.524063570701514,
 0.524063570718629,
 0.5240635708812071,
 0.5240635725155536,
 0.5240635888333284]

In [26]:
idx = [i for i,val in enumerate(rmse_vals) if val==min(rmse_vals)]
min_r = [alpha[i] for i in idx ]
min(min_r)

0