In [45]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
from sklearn.linear_model import LinearRegression
from statsmodels.formula.api import ols
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
import plotly.express as px
from scipy import stats
from scipy.stats import chi2_contingency
from sklearn.compose import ColumnTransformer
from statsmodels.formula.api import ols
import statsmodels.api as sm

# First iteration : Baseline

In [46]:
first_it = pd.read_pickle("data/first_iteration.pkl")
first_it.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16512 entries, 0 to 16511
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Unnamed: 0          16512 non-null  int64  
 1   longitude           16512 non-null  float64
 2   latitude            16512 non-null  float64
 3   housing_median_age  16512 non-null  float64
 4   total_rooms         16512 non-null  float64
 5   total_bedrooms      16512 non-null  float64
 6   population          16512 non-null  float64
 7   households          16512 non-null  float64
 8   median_income       16512 non-null  float64
 9   median_house_value  16512 non-null  float64
 10  ocean_proximity     16512 non-null  object 
dtypes: float64(9), int64(1), object(1)
memory usage: 1.4+ MB


In [47]:
# define X, y
y = first_it["median_house_value"]
X = first_it.drop(["median_house_value","ocean_proximity"], axis=1)

In [48]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=.3, 
                                                    random_state=1)

In [49]:
log_model = LinearRegression()

log_model.fit(X_train, y_train)

# Evaluate the model with the test set
log_model.score(X_test, y_test)

0.6311871393336708

In [50]:
cv_results = cross_validate(log_model, X, y, cv=5)
cv_results["test_score"].mean()

0.6348249131619211

# Second iteration : Outliers

In [51]:
second_it = pd.read_pickle("data/second_iteration.pkl")
second_it.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15262 entries, 0 to 16511
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Unnamed: 0          15262 non-null  int64  
 1   longitude           15262 non-null  float64
 2   latitude            15262 non-null  float64
 3   housing_median_age  15262 non-null  float64
 4   total_rooms         15262 non-null  float64
 5   total_bedrooms      15262 non-null  float64
 6   population          15262 non-null  float64
 7   households          15262 non-null  float64
 8   median_income       15262 non-null  float64
 9   median_house_value  15262 non-null  float64
 10  ocean_proximity     15262 non-null  object 
dtypes: float64(9), int64(1), object(1)
memory usage: 1.4+ MB


In [52]:
# define X, y
y = second_it["median_house_value"]
X = second_it.drop(["ocean_proximity","median_house_value"], axis=1)

In [53]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=.3, 
                                                    random_state=1)

In [54]:
log_model.fit(X_train, y_train)

# Evaluate the model with the test set
log_model.score(X_test, y_test)

0.6076476528019652

In [55]:
cv_results = cross_validate(log_model, X, y, cv=5)
cv_results["test_score"].mean()

0.6334931888672133

# Third iteration : Median imputation

In [56]:
third_it = pd.read_pickle("data/third_iteration.pkl")
third_it.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16512 entries, 0 to 16511
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Unnamed: 0          16512 non-null  int64  
 1   longitude           16512 non-null  float64
 2   latitude            16512 non-null  float64
 3   housing_median_age  16512 non-null  float64
 4   total_rooms         16512 non-null  float64
 5   total_bedrooms      16512 non-null  float64
 6   population          16512 non-null  float64
 7   households          16512 non-null  float64
 8   median_income       16512 non-null  float64
 9   median_house_value  16512 non-null  float64
 10  ocean_proximity     16512 non-null  object 
dtypes: float64(9), int64(1), object(1)
memory usage: 1.4+ MB


In [57]:
# define X, y
y = third_it["median_house_value"]
X = third_it.drop(["median_house_value","ocean_proximity"], axis=1)

In [58]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=.3, 
                                                    random_state=1)

In [59]:
log_model.fit(X_train, y_train)

# Evaluate the model with the test set
log_model.score(X_test, y_test)

0.6322340999259999

In [60]:
cv_results = cross_validate(log_model, X, y, cv=5)
cv_results["test_score"].mean()

0.6358408199481806

# Fourth iteration : Mean imputation

In [61]:
fourth_it = pd.read_pickle("data/fourth_iteration.pkl")

In [62]:
# define X, y
y = fourth_it["median_house_value"]
X = fourth_it.drop(["median_house_value","ocean_proximity"], axis=1)

In [63]:
cv_results = cross_validate(log_model, X, y, cv=5)
cv_results["test_score"].mean()

0.6358408199481806

# Fifth iteration : iterative imputation

In [64]:
fifth_it = pd.read_pickle("data/fifth_iteration.pkl")

In [65]:
# define X, y
y = fifth_it["median_house_value"]
X = fifth_it.drop(["median_house_value","ocean_proximity"], axis=1)

In [66]:
cv_results = cross_validate(log_model, X, y, cv=5)
cv_results["test_score"].mean()

0.6348249131619182

# Sixth iteration : ocean_proximity encoding

In [67]:
sixth_it = pd.read_pickle("data/sixth_iteration.pkl")

In [68]:
# define X, y
y = sixth_it["median_house_value"]
X = sixth_it.drop(["median_house_value"], axis=1)

In [69]:
cv_results = cross_validate(log_model, X, y, cv=5)
cv_results["test_score"].mean()

0.6464622037671589

# Seventh iteration : normalization

In [70]:
seventh_it = pd.read_pickle("data/seventh_iteration.pkl")

In [71]:
# define X, y
y = seventh_it["median_house_value"]
X = seventh_it.drop(["median_house_value"], axis=1)

In [72]:
cv_results = cross_validate(log_model, X, y, cv=5)
cv_results["test_score"].mean()

0.6464622037671547