In [258]:
import pandas as pd
import numpy as np

In [259]:
# Load the raw housing table from the CSV next to the notebook.
df = pd.read_csv(filepath_or_buffer="housing.csv")

In [260]:
# Columns kept for the analysis: location, block-level counts, income,
# the price target and the single categorical feature.
columns = [
    'latitude', 'longitude',
    'housing_median_age',
    'total_rooms', 'total_bedrooms',
    'population', 'households',
    'median_income',
    'median_house_value',
    'ocean_proximity',
]

In [261]:
# Restrict to the selected columns, replace missing values with 0 and
# append three ratio features (in this order) built from the count columns.
df = (
    df[columns]
    .fillna(0)
    .assign(
        rooms_per_household=lambda d: d['total_rooms'] / d['households'],
        bedrooms_per_room=lambda d: d['total_bedrooms'] / d['total_rooms'],
        population_per_household=lambda d: d['population'] / d['households'],
    )
)

In [262]:
# Snapshot of the prepared frame (still containing median_house_value) so a
# later section can start again from it after df has been modified.
df_origianl = df.copy(deep=True)

# Question 1

In [263]:
# Frequency of each ocean_proximity category, most common first.
df['ocean_proximity'].value_counts()

<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: ocean_proximity, dtype: int64

# Question 2

In [264]:
# Binary target: 1 when a row's median_house_value exceeds the mean of the
# whole column, else 0. The raw price column is then removed from df.
mean_house_value = df['median_house_value'].mean()
df['above_average'] = (df['median_house_value'] > mean_house_value).astype(int)
df = df.drop(columns='median_house_value')

In [265]:
# Print the column names, non-null counts and dtypes of the current frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   latitude                  20640 non-null  float64
 1   longitude                 20640 non-null  float64
 2   housing_median_age        20640 non-null  float64
 3   total_rooms               20640 non-null  float64
 4   total_bedrooms            20640 non-null  float64
 5   population                20640 non-null  float64
 6   households                20640 non-null  float64
 7   median_income             20640 non-null  float64
 8   ocean_proximity           20640 non-null  object 
 9   rooms_per_household       20640 non-null  float64
 10  bedrooms_per_room         20640 non-null  float64
 11  population_per_household  20640 non-null  float64
 12  above_average             20640 non-null  int64  
dtypes: float64(11), int64(1), object(1)
memory usage: 2.0+ MB


In [266]:
from sklearn.model_selection import train_test_split

In [267]:
# Hold out 20% of the rows as the test set; the seed keeps the split reproducible.
df_full_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

In [268]:
# Split the remaining rows again: 25% of df_full_train becomes validation.
df_train, df_val = train_test_split(
    df_full_train,
    test_size=0.25,
    random_state=42,
)

In [269]:
# Sanity-check the three disjoint partitions. The original reported
# len(df_full_train), which still contains the validation rows, so the
# displayed sizes did not add up to the 20640 rows of the dataset.
len(df_train), len(df_val), len(df_test)

(12384, 4128, 4128)

In [270]:
# Give each partition a fresh 0..n-1 index, discarding the shuffled one.
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [271]:
# Extract the binary target of every partition as a NumPy array.
y_train, y_val, y_test = (
    part['above_average'].values
    for part in (df_train, df_val, df_test)
)

In [272]:
# Correlation on the training partition for each candidate pair, printed
# one value per line in the order the pairs are listed.
candidate_pairs = [
    ('total_bedrooms', 'households'),
    ('total_bedrooms', 'total_rooms'),
    ('population', 'households'),
    ('population_per_household', 'total_rooms'),
]
for left, right in candidate_pairs:
    print(df_train[left].corr(df_train[right]))

0.979399352769416
0.9315462999468427
0.9068406743022558
-0.029451679411510768


total_bedrooms and households = 0.979399

total_bedrooms and total_rooms = 0.931546

population and households = 0.906841

population_per_household and total_rooms = -0.029452

The strongest correlation is between total_bedrooms and households.

# Question 3

In [273]:
from sklearn.metrics import mutual_info_score

In [274]:
# Mutual information between the categorical feature and the binary target,
# computed on the training partition only.
mutual_info_score(df_train['ocean_proximity'], df_train['above_average'])

0.10138385763624205

# Question 4

In [275]:
from sklearn.feature_extraction import DictVectorizer

In [276]:
# Imported here because this is the first cell that uses LogisticRegression.
# Its only other import sits in a later cell, so running the notebook top to
# bottom on a fresh kernel raised NameError at this point.
from sklearn.linear_model import LogisticRegression

# Turn each training row (minus the target) into a dict; DictVectorizer
# one-hot encodes ocean_proximity and passes numeric columns through.
train_dicts = df_train.drop(columns='above_average').to_dict(orient='records')
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dicts)

model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

# Drop the target from the validation records as well, so the absence of
# label leakage is explicit rather than relying on the vectorizer ignoring
# keys it did not see during fit.
val_dicts = df_val.drop(columns='above_average').to_dict(orient='records')
X_val = dv.transform(val_dicts)
y_val_pred = model.predict(X_val)

# Validation accuracy, rounded to two decimals.
(y_val_pred == y_val).mean().round(2)

0.84

In [277]:
# Column order of the matrix produced by the fitted vectorizer.
feature_names = dv.get_feature_names_out()
feature_names

array(['bedrooms_per_room', 'households', 'housing_median_age',
       'latitude', 'longitude', 'median_income',
       'ocean_proximity=<1H OCEAN', 'ocean_proximity=INLAND',
       'ocean_proximity=ISLAND', 'ocean_proximity=NEAR BAY',
       'ocean_proximity=NEAR OCEAN', 'population',
       'population_per_household', 'rooms_per_household',
       'total_bedrooms', 'total_rooms'], dtype=object)

In [278]:
from sklearn.linear_model import LogisticRegression

In [279]:
# Pair every vectorized feature with its fitted weight (3 decimals).
{
    name: weight
    for name, weight in zip(dv.get_feature_names_out(), model.coef_[0].round(3))
}

{'bedrooms_per_room': 0.851,
 'households': 0.004,
 'housing_median_age': 0.036,
 'latitude': 0.134,
 'longitude': 0.092,
 'median_income': 1.218,
 'ocean_proximity=<1H OCEAN': 0.429,
 'ocean_proximity=INLAND': -1.862,
 'ocean_proximity=ISLAND': 0.089,
 'ocean_proximity=NEAR BAY': 0.204,
 'ocean_proximity=NEAR OCEAN': 0.853,
 'population': -0.002,
 'population_per_household': 0.011,
 'rooms_per_household': -0.0,
 'total_bedrooms': 0.002,
 'total_rooms': -0.0}

# Question 5

In [280]:
# Features examined in the elimination study below.
q5 = [
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
]

In [281]:
# Reference model for the elimination study: fit on the four q5 features.
dv = DictVectorizer(sparse=False)
train_dicts = df_train[q5].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

model = LogisticRegression(
    solver="liblinear", C=1.0, max_iter=1000, random_state=42
).fit(X_train, y_train)

# Validation rows are vectorized with the vectorizer fitted above.
val_dicts = df_val.to_dict(orient='records')
X_val = dv.transform(val_dicts)
y_val_pred = model.predict(X_val)

# Validation accuracy, rounded to two decimals.
np.mean(y_val_pred == y_val).round(2)

0.71

In [282]:
# Same model as the q5 reference, but trained without total_rooms.
kept = [col for col in q5 if col != 'total_rooms']
train_dicts = df_train[kept].to_dict(orient='records')

dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dicts)

model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

val_dicts = df_val.to_dict(orient='records')
X_val = dv.transform(val_dicts)
y_val_pred = model.predict(X_val)

# Share of validation rows predicted correctly, to two decimals.
accuracy = (y_val_pred == y_val).mean()
accuracy.round(2)

0.63

In [283]:
# Same model as the q5 reference, but trained without total_bedrooms.
kept = [col for col in q5 if col != 'total_bedrooms']
train_dicts = df_train[kept].to_dict(orient='records')

dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dicts)

model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

val_dicts = df_val.to_dict(orient='records')
X_val = dv.transform(val_dicts)
y_val_pred = model.predict(X_val)

# Share of validation rows predicted correctly, to two decimals.
accuracy = (y_val_pred == y_val).mean()
accuracy.round(2)

0.66

In [284]:
# Same model as the q5 reference, but trained without population.
kept = [col for col in q5 if col != 'population']
train_dicts = df_train[kept].to_dict(orient='records')

dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dicts)

model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

val_dicts = df_val.to_dict(orient='records')
X_val = dv.transform(val_dicts)
y_val_pred = model.predict(X_val)

# Share of validation rows predicted correctly, to two decimals.
accuracy = (y_val_pred == y_val).mean()
accuracy.round(2)

0.66

In [285]:
# Same model as the q5 reference, but trained without households.
kept = [col for col in q5 if col != 'households']
train_dicts = df_train[kept].to_dict(orient='records')

dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dicts)

model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

val_dicts = df_val.to_dict(orient='records')
X_val = dv.transform(val_dicts)
y_val_pred = model.predict(X_val)

# Share of validation rows predicted correctly, to two decimals.
accuracy = (y_val_pred == y_val).mean()
accuracy.round(2)

0.67

# Question 6

In [286]:
# Start the regression section from the saved snapshot, which still holds
# median_house_value.
df = df_origianl.copy(deep=True)

In [287]:
# Replace the price column with its natural logarithm.
df['median_house_value'] = np.log(df['median_house_value'])

In [288]:
# Same 60/20/20 split recipe and seed as before, applied to the frame that
# carries the log-transformed price.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

# Fresh 0..n-1 index on every partition.
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

# Regression targets as NumPy arrays.
y_train, y_val, y_test = (
    part['median_house_value'].values
    for part in (df_train, df_val, df_test)
)

In [298]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

In [305]:
# Vectorize once; the feature matrices are shared by every alpha below.
dv = DictVectorizer(sparse=False)
train_dicts = df_train.drop(columns='median_house_value').to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)
val_dicts = df_val.to_dict(orient='records')
X_val = dv.transform(val_dicts)

# Fit one Ridge model per regularization strength and print its validation
# RMSE rounded to three decimals.
# NOTE(review): every alpha prints the same RMSE; the features are not
# scaled here, which may be limiting the sag solver - confirm.
for alpha in (0, 0.01, 0.1, 1, 10):
    model = Ridge(alpha=alpha, solver="sag", random_state=42)
    model.fit(X_train, y_train)

    y_val_pred = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
    print(alpha, round(rmse, 3))

0 0.524
0.01 0.524
0.1 0.524
1 0.524
10 0.524
