Importing Libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
import seaborn as sns
from sklearn.linear_model import LogisticRegression, Ridge, LinearRegression
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

Loading Data

In [2]:
data = pd.read_csv('C:/Users/Godwin/Documents/Workflow/ML Zoomcamp/Classification/Califonia Housing Data/housing.csv')

#Formatting the column names to lower case and replacing empty spaces with '_'
data.columns = data.columns.str.replace(' ', '_').str.lower()

#Formatting the strings in the data to lower case and replacing empty spaces with '_'
categorical_col = data.dtypes[data.dtypes == 'object'].index.tolist()
for col in categorical_col:
    data[col] = data[col].str.replace(' ', '_').str.lower()
    
data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,near_bay
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,near_bay
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,near_bay
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,near_bay
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,near_bay


In [3]:
#filling missing values with 0
data.fillna(0, inplace = True)

#Feature Engineering
data['rooms_per_household'] = data['total_rooms']/ data['households']
data['bedrooms_per_room'] = data['total_bedrooms']/data['total_rooms']
data['population_per_household'] = data['population']/ data['households']

Question 1

In [4]:
#Mode for the 'ocean_proximity' column in the data
data['ocean_proximity'].mode()[0]

'<1h_ocean'

Question 2

In [5]:
#Column Correlation
corr = data.corr()

#Remocing self correlation and duplicate correlation
upper_triangle = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
unique = upper_triangle.unstack().dropna()

#Returning columns with the maximum correlation
#unique.sort_values(ascending = False)

max_corr_value = unique.max()
unique[unique == max_corr_value]

households  total_bedrooms    0.966507
dtype: float64

In [6]:
mean_value = data['median_house_value'].mean()
new_df = data.copy()
new_df['above_average'] = (data['median_house_value'] > mean_value).astype('int')
del new_df['median_house_value']

In [7]:
columns = new_df.columns.tolist()
new_df = new_df[columns]

train_df, test_df = train_test_split(new_df, test_size = 0.2, random_state = 42)
train_df, val_df = train_test_split(train_df, test_size = 0.25, random_state = 42)

Question 3

In [8]:
#Chacking the mutual information between the target variable and the categorical variables
round(mutual_info_score(new_df['above_average'], new_df['ocean_proximity']), 2)

0.1

In [9]:
#Splitting the data
train_df, test_df = train_test_split(new_df, test_size = 0.2, random_state = 42)
train_df, val_df = train_test_split(train_df, test_size = 0.25, random_state = 42)

y_train = train_df.pop('above_average')
y_test = test_df.pop('above_average')
y_val = val_df.pop('above_average')

In [10]:
categorical_col = train_df.dtypes[train_df.dtypes == 'object'].index.to_list()
numerical_col = train_df.dtypes[train_df.dtypes != 'object'].index.to_list()

#Vetorizing data
dv = DictVectorizer(sparse = False)
dv.fit(train_df[numerical_col + categorical_col].to_dict(orient = 'records'))
names = dv.get_feature_names()

X_train = dv.transform(train_df[numerical_col + categorical_col].to_dict(orient = 'records'))
X_test = dv.transform(test_df[numerical_col + categorical_col].to_dict(orient = 'records'))
X_val = dv.transform(val_df[numerical_col + categorical_col].to_dict(orient = 'records'))

Question 4: Model Accuracy

In [11]:
#Training the model
model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

#Model Accuracy
prediction = model.predict_proba(X_val)[:, 1]
decision = (prediction >=0.5)
accuracy = round((y_val == decision).mean(), 2)
print(accuracy)

0.84


Question 5: Least Feature with Feature elimination

In [12]:
full_col = categorical_col + numerical_col

feature = []
global_acc = []
diff = []
for col in full_col:
    new_col = [i for i in full_col if i != col]
    X_train = dv.transform(train_df[new_col].to_dict(orient = 'records'))
    X_val = dv.transform(val_df[new_col].to_dict(orient = 'records'))

    model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)

    prediction = model.predict_proba(X_val)[:, 1]
    decision = (prediction >=0.5)
    new_acc = round((y_val == decision).mean(), 5)
    global_acc.append(new_acc)
    feature.append(col)
    diff.append(round((accuracy - new_acc), 5))

out = pd.DataFrame({'Feature':feature, 'Accuracy_score': global_acc, 'Difference': diff})
out.sort_values(by = ['Difference']).reset_index(drop = True)

Unnamed: 0,Feature,Accuracy_score,Difference
0,bedrooms_per_room,0.83697,0.00303
1,total_rooms,0.83672,0.00328
2,total_bedrooms,0.836,0.004
3,population_per_household,0.83576,0.00424
4,rooms_per_household,0.83479,0.00521
5,latitude,0.83309,0.00691
6,households,0.83309,0.00691
7,housing_median_age,0.83115,0.00885
8,longitude,0.82897,0.01103
9,population,0.82631,0.01369


Linear Regression Model

In [13]:
#DAta Preparation
new_df = data.copy()

train_df, test_df = train_test_split(new_df, test_size = 0.2, random_state = 42)
train_df, val_df = train_test_split(train_df, test_size = 0.25, random_state = 42)

#Transforming target variable
y_train = np.log1p(train_df.pop('median_house_value'))
y_test = np.log1p(test_df.pop('median_house_value'))
y_val = np.log1p(val_df.pop('median_house_value'))

In [14]:
#Vectorising data
dv.fit(train_df[numerical_col + categorical_col].to_dict(orient = 'records'))
names = dv.get_feature_names()

X_train = dv.transform(train_df[numerical_col + categorical_col].to_dict(orient = 'records'))
X_test = dv.transform(test_df[numerical_col + categorical_col].to_dict(orient = 'records'))
X_val = dv.transform(val_df[numerical_col + categorical_col].to_dict(orient = 'records'))

In [15]:
def rmse(actual, predicted):
    '''Returns root mean squared error'''
    error = actual - predicted
    rms = np.square(error)
    rms = np.mean(rms)
    return np.sqrt(rms)

#Training Model with different alpha
alpha_value = [0, 0.01, 0.1, 1, 10]
errors = []
for a in alpha_value:
    model = Ridge(alpha=a, solver="sag", random_state=42)
    model.fit(X_train, y_train)
    prediction = model.predict(X_val)
    error = round(rmse(y_val, prediction), 3)
    errors.append(error)

output = pd.DataFrame({'Alpha Value': alpha_value, 'Error': errors})

Question 6: Selecting model with the best alpha

In [16]:
#Selecting Value with the lowest error
output[output['Error'] == output['Error'].min()]['Alpha Value']

0     0.00
1     0.01
2     0.10
3     1.00
4    10.00
Name: Alpha Value, dtype: float64

In [17]:
#Selecting Based on the smallest value
output[output['Alpha Value'] == output['Alpha Value'].min()]['Alpha Value']

0    0.0
Name: Alpha Value, dtype: float64

The best alpha is 0