In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv('housing.csv')

In [3]:
data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [4]:
data = data.fillna(0)

In [5]:
data['rooms_per_household'] = data['total_rooms'] / data['households']

In [6]:
data['bedrooms_per_room'] = data['total_bedrooms'] / data['total_rooms']

In [7]:
data['population_per_household'] = data['population'] / data['households']

# Question 1

In [8]:
data['ocean_proximity'].mode()

0    <1H OCEAN
dtype: object

In [9]:
data['ocean_proximity'].value_counts()

<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: ocean_proximity, dtype: int64

Answer: the mode is <1H OCEAN

# Question 2

In [10]:
cor = data.corr()

In [11]:
cor['total_bedrooms']['households']

0.966507240042043

In [12]:
cor['total_bedrooms']['total_rooms']

0.9201961721166215

In [13]:
cor['population']['households']

0.9072222660959659

In [14]:
cor['population_per_household']['total_rooms']

-0.02458065899398796

Answer: the biggest correlation is between total_bedrooms and households

In [15]:
data['above_average'] = data['median_house_value'] > data['median_house_value'].mean()

In [16]:
data['above_average'] = data['above_average'].astype(int) 

In [17]:
data['above_average'].value_counts()

0    12255
1     8385
Name: above_average, dtype: int64

In [18]:
from sklearn.model_selection import train_test_split

features = [col for col in data.columns if col!='median_house_value' and col!='above_average' ]
X = data[features]
y = data['above_average']
X_temp, X_test, y_temp, y_test = train_test_split(X,y, test_size=0.2, random_state=42)

In [19]:
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=42)

In [20]:
print(X_train.shape, y_train.shape)
print(X_val.shape, y_val.shape)
print(X_test.shape, y_test.shape)

(12384, 12) (12384,)
(4128, 12) (4128,)
(4128, 12) (4128,)


# Question 3

In [21]:
from sklearn.metrics import mutual_info_score

round(mutual_info_score(y_train, X_train['ocean_proximity']), 2)

0.1

Answer: 0.1

# Question 4

In [22]:
from sklearn.feature_extraction import DictVectorizer

train_dict = X_train.to_dict(orient='records')
dv = DictVectorizer(sparse=False)
dv.fit(train_dict)
X_train = dv.transform(train_dict)
X_train.shape

(12384, 16)

In [23]:
list(dv.get_feature_names_out())

['bedrooms_per_room',
 'households',
 'housing_median_age',
 'latitude',
 'longitude',
 'median_income',
 'ocean_proximity=<1H OCEAN',
 'ocean_proximity=INLAND',
 'ocean_proximity=ISLAND',
 'ocean_proximity=NEAR BAY',
 'ocean_proximity=NEAR OCEAN',
 'population',
 'population_per_household',
 'rooms_per_household',
 'total_bedrooms',
 'total_rooms']

### Fitting the model

In [24]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver="liblinear", max_iter=1000,  C=1.0, random_state=42)

In [25]:
model.fit(X_train, y_train)

LogisticRegression(max_iter=1000, random_state=42, solver='liblinear')

In [26]:
val_dict = X_val.to_dict(orient='records')
X_val = dv.transform(val_dict)

In [27]:
round(model.score(X_val, y_val), 2)

0.84

Answer: 0.84

# Question 5

In [28]:
model2 = LogisticRegression(solver="liblinear", max_iter=1000,  C=1.0, random_state=42)

all_features = list(dv.get_feature_names_out())

features_to_exclude = ['total_rooms','total_bedrooms','population','households']

for feature in features_to_exclude:
    
    X_temp, X_test, y_temp, y_test = train_test_split(X,y, test_size=0.2, random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=42)
    
    X_train_2 = X_train.drop([feature], axis=1)
    X_val_2 = X_val.drop([feature], axis=1)
    
    train_dict = X_train_2.to_dict(orient='records')
    dv = DictVectorizer(sparse=False)
    dv.fit(train_dict)
    X_train_2 = dv.transform(train_dict) 
    
    val_dict = X_val_2.to_dict(orient='records')
    X_val_2 = dv.transform(val_dict)
    
    
    model.fit(X_train_2, y_train)
    acc = round(model.score(X_val_2, y_val), 2)
    
    print(f"Missing feature: {feature} --- score: {acc}")

Missing feature: total_rooms --- score: 0.84
Missing feature: total_bedrooms --- score: 0.84
Missing feature: population --- score: 0.83
Missing feature: households --- score: 0.83


# Question 6

In [29]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

features = [col for col in data.columns if col!='median_house_value']
X = data[features]
y = np.log1p(data['median_house_value'])

X_temp, X_test, y_temp, y_test = train_test_split(X,y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=42)

train_dict = X_train.to_dict(orient='records')
dv = DictVectorizer(sparse=False)
dv.fit(train_dict)
X_train = dv.transform(train_dict)

val_dict = X_val.to_dict(orient='records')
X_val = dv.transform(val_dict)

for a in [0, 0.01, 0.1, 1, 10]:
    model = Ridge(alpha=a, solver="sag", random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    rmse = round(mean_squared_error(y_val, y_pred)**0.5, 3)
    
    print(f"Alpha: {a} --- RMSE score: {rmse}")


Alpha: 0 --- RMSE score: 0.524
Alpha: 0.01 --- RMSE score: 0.524
Alpha: 0.1 --- RMSE score: 0.524
Alpha: 1 --- RMSE score: 0.524
Alpha: 10 --- RMSE score: 0.524
