### Importing libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.linear_model import LinearRegression, PoissonRegressor
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error

from sklearn.neighbors import KNeighborsRegressor

from sklearn.decomposition import PCA

### Reading dataset

In [2]:
raw_df = pd.read_csv('data/4054a881-9509-4cc0-9501-1174d5bbf6fc.txt')
raw_df

Unnamed: 0.1,Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,2072,-119.84,36.77,6.0,1853.0,473.0,1397.0,417.0,1.4817,72000.0,INLAND
1,10600,-117.80,33.68,8.0,2032.0,349.0,862.0,340.0,6.9133,274100.0,<1H OCEAN
2,2494,-120.19,36.60,25.0,875.0,214.0,931.0,214.0,1.5536,58300.0,INLAND
3,4284,-118.32,34.10,31.0,622.0,229.0,597.0,227.0,1.5284,200000.0,<1H OCEAN
4,16541,-121.23,37.79,21.0,1922.0,373.0,1130.0,372.0,4.0815,117900.0,INLAND
...,...,...,...,...,...,...,...,...,...,...,...
16507,1099,-121.90,39.59,20.0,1465.0,278.0,745.0,250.0,3.0625,93800.0,INLAND
16508,18898,-122.25,38.11,49.0,2365.0,504.0,1131.0,458.0,2.6133,103100.0,NEAR BAY
16509,11798,-121.22,38.92,19.0,2531.0,461.0,1206.0,429.0,4.4958,192600.0,INLAND
16510,6637,-118.14,34.16,39.0,2776.0,840.0,2546.0,773.0,2.5750,153500.0,<1H OCEAN


In [3]:
# We need to drop the column "unnamed O"
clean_df = raw_df.drop('Unnamed: 0', axis=1)
clean_df

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-119.84,36.77,6.0,1853.0,473.0,1397.0,417.0,1.4817,72000.0,INLAND
1,-117.80,33.68,8.0,2032.0,349.0,862.0,340.0,6.9133,274100.0,<1H OCEAN
2,-120.19,36.60,25.0,875.0,214.0,931.0,214.0,1.5536,58300.0,INLAND
3,-118.32,34.10,31.0,622.0,229.0,597.0,227.0,1.5284,200000.0,<1H OCEAN
4,-121.23,37.79,21.0,1922.0,373.0,1130.0,372.0,4.0815,117900.0,INLAND
...,...,...,...,...,...,...,...,...,...,...
16507,-121.90,39.59,20.0,1465.0,278.0,745.0,250.0,3.0625,93800.0,INLAND
16508,-122.25,38.11,49.0,2365.0,504.0,1131.0,458.0,2.6133,103100.0,NEAR BAY
16509,-121.22,38.92,19.0,2531.0,461.0,1206.0,429.0,4.4958,192600.0,INLAND
16510,-118.14,34.16,39.0,2776.0,840.0,2546.0,773.0,2.5750,153500.0,<1H OCEAN


### Duplicates and Outliers

In [4]:
# Duplicates
clean_df.duplicated().sum()

0

In [11]:
# Outliers
# Not yet

### Split into train/test

In [5]:
from sklearn.model_selection import train_test_split

X = clean_df.drop("median_house_value", axis=1).copy()
y = clean_df["median_house_value"].copy()

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=1)

In [6]:
X_train.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'ocean_proximity'],
      dtype='object')

### Create pipeline

In [6]:
# Select num et cat columns
cat_columns = X_train.select_dtypes(include=["object"]).columns.tolist()
num_columns = X_train.select_dtypes(exclude=["object"]).columns.tolist()

In [7]:
# # Select num et cat columns
# cat_columns = X_train.select_dtypes(include=["object"]).columns.tolist()
# num_columns = X_train.select_dtypes(exclude=["object"]).columns.tolist()

# cat data pipeline
cat_preprocessing = Pipeline([
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse=False))
])

# num data pipeline
num_preprocessing = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Combine both pipeline

preprocessing = ColumnTransformer([
    ("num", num_preprocessing, num_columns),
    ("cat", cat_preprocessing, cat_columns)
])

# preprocessing

In [8]:
from sklearn.model_selection import cross_val_score

# Full pipe with preprocessing pipes and regression pipe
full_pipe = Pipeline(
    [
        ('preprocess', preprocessing),
        ('regressor', LinearRegression())
    ]
)

# measure model performance
scores = cross_val_score(
    full_pipe, X_train, y_train, cv=5
)

# Fitting our model
full_pipe.fit(X_train, y_train)

# Storing the predictions
y_pred = full_pipe.predict(X_test)

# Evaluating the model
# full_pipe.score(X_test, y_test)

print(f'cross_val_score : {scores.mean()}')
print(f'y_pred min : {y_pred.min()}')
print(f'r² score : {full_pipe.score(X_test, y_test)}')
print(f'mae score : {mean_absolute_error(y_test, y_pred)}')
print(f'mae % score : {mean_absolute_percentage_error(y_test, y_pred)}')
print(f'mse score : {mean_squared_error(y_test, y_pred)}')
print(f'rmse score : {np.sqrt(mean_squared_error(y_test, y_pred))}')

cross_val_score : 0.6443418950023683
y_pred min : -165574.49370790733
r² score : 0.6442059715211986
mae score : 49430.23563332091
mae % score : 0.28572992503474937
mse score : 4640220231.537339
rmse score : 68119.1619996704


#### First iteration conclusion:
After using onehotencoding for categorical value, simpleimputing (median) and standardscaling on numerical columns, our first score is 0.64 with a Linear regression model. We have an issue, because our model can predict negative values, whereas the price of a house can go below 0. Because of our outliers, we can't use mse and rmse for evaluating the model. Mae is less sensitive to outliers, so we will focus on r² and mae metrics

### Second iteration, with MinMaxScaler instead of StandardScaler

In [9]:
# cat data pipeline
cat_preprocessing_2 = Pipeline([
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse=False))
])

# num data pipeline
num_preprocessing_2 = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", MinMaxScaler())
])

# Combine both pipeline

preprocessing_2 = ColumnTransformer([
    ("num", num_preprocessing_2, num_columns),
    ("cat", cat_preprocessing_2, cat_columns)
])

# preprocessing


# Full pipe with preprocessing pipes and regression pipe
full_pipe_2 = Pipeline(
    [
        ('preprocessing', preprocessing_2),
        ('regressor', LinearRegression())
    ]
)

full_pipe_2.fit(X_train, y_train)

y_pred_2 = full_pipe_2.predict(X_test)

full_pipe_2.score(X_test, y_test)
# full_pipe_minmax

print(f'y_pred min : {y_pred_2.min()}')
print(f'r² score : {full_pipe_2.score(X_test, y_test)}')
print(f'mae score : {mean_absolute_error(y_test, y_pred_2)}')
print(f'mae % score : {mean_absolute_percentage_error(y_test, y_pred_2)}')
print(f'mse score : {mean_squared_error(y_test, y_pred_2)}')
print(f'rmse score : {np.sqrt(mean_squared_error(y_test, y_pred_2))}')


y_pred min : -165120.0
r² score : 0.6440563800571593
mae score : 49428.40896245458
mae % score : 0.28584977064887623
mse score : 4642171184.286233
rmse score : 68133.4806412107


#### Second iteration conclusion:
After using minmaxscaling instead of standardscaling on numerical columns, our r² score is also 0.64 with a Linear regression model. The mae score is slightly better, but it's a really small improvement, so I think it's not relevant enough. We still have an issue with negative predictions.

### Third iteration, with PoissonRegressor instead of LinearRegression.

In [10]:
# cat data pipeline
cat_preprocessing_3 = Pipeline([
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse=False))
])

# num data pipeline
num_preprocessing_3 = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", MinMaxScaler())
])

# Combine both pipeline

preprocessing_3 = ColumnTransformer([
    ("num", num_preprocessing_3, num_columns),
    ("cat", cat_preprocessing_3, cat_columns)
])

# preprocessing


# Full pipe with preprocessing pipes and regression pipe
full_pipe_3 = Pipeline(
    [
        ('preprocessing', preprocessing_3),
        ('regressor', PoissonRegressor(max_iter=1000))
    ]
)

full_pipe_3.fit(X_train, y_train)

y_pred_3 = full_pipe_3.predict(X_test)

full_pipe_3.score(X_test, y_test)
# full_pipe_minmax

print(f'y_pred min : {y_pred_3.min()}')
print(f'r² score : {full_pipe_3.score(X_test, y_test)}')
print(f'mae score : {mean_absolute_error(y_test, y_pred_3)}')
print(f'mae % score : {mean_absolute_percentage_error(y_test, y_pred_3)}')
print(f'mse score : {mean_squared_error(y_test, y_pred_3)}')
print(f'rmse score : {np.sqrt(mean_squared_error(y_test, y_pred_3))}')

y_pred min : 17947.284286868944
r² score : 0.6534173538320218
mae score : 49342.101057320804
mae % score : 0.28969798973830807
mse score : 4917193992.911787
rmse score : 70122.7066855793


#### Third iteration conclusion:
After using PoissonRegressor instead of LinearRegression, our r² score has improved (0.65). The mae score is reduced, but it's still a really small improvement. We don't have negative prediction, so it's great improvement with this specific model. 

### Fourth iteration, with Feature Selection added.

In [11]:
# cat data pipeline
cat_preprocessing_4 = Pipeline([
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse=False))
])

# num data pipeline
num_preprocessing_4 = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", MinMaxScaler())
])

# Combine both pipeline

preprocessing_4 = ColumnTransformer([
    ("num", num_preprocessing_4, num_columns),
    ("cat", cat_preprocessing_4, cat_columns)
])

# preprocessing


# Full pipe with preprocessing pipes and regression pipe
full_pipe_4 = Pipeline(
    [
        ('preprocessing', preprocessing_4),
        ('regressor', KNeighborsRegressor())
    ]
)

full_pipe_4.fit(X_train, y_train)

y_pred_4 = full_pipe_4.predict(X_test)

full_pipe_4.score(X_test, y_test)
# full_pipe_minmax

print(f'y_pred min : {y_pred_4.min()}')
print(f'r² score : {full_pipe_4.score(X_test, y_test)}')
print(f'mae score : {mean_absolute_error(y_test, y_pred_4)}')
print(f'mae % score : {mean_absolute_percentage_error(y_test, y_pred_4)}')
print(f'mse score : {mean_squared_error(y_test, y_pred_4)}')
print(f'rmse score : {np.sqrt(mean_squared_error(y_test, y_pred_4))}')

y_pred min : 48020.0
r² score : 0.6966984644250284
mae score : 42824.85660072668
mae % score : 0.23110970115439392
mse score : 3955619850.194251
rmse score : 62893.71868632233


#### Fourth iteration conclusion:
After using KNeighborsRegressor, our r² score has improved (0.69). The mae score is reduced, so it's a new improvement of our model. We don't have negative prediction, so it's the same thing than with the PoissonRegressor().

### Fifth iteration, adding PCA in our pipeline.

In [12]:
# cat data pipeline
cat_preprocessing_5 = Pipeline(
    [
        ("encoder", OneHotEncoder(handle_unknown="ignore", sparse=False))
    ]
)

# num data pipeline
num_preprocessing_5 = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", MinMaxScaler()),
    ('pca', PCA(n_components=3))
])

# Combine both pipeline

preprocessing_5 = ColumnTransformer([
    ("num", num_preprocessing_5, num_columns),
    ("cat", cat_preprocessing_5, cat_columns)
])

# preprocessing


# Full pipe with preprocessing pipes and regression pipe
full_pipe_5 = Pipeline(
    [
        ('preprocessing', preprocessing_5),
        ('regressor', KNeighborsRegressor())
    ]
)

full_pipe_5.fit(X_train, y_train)

y_pred_5 = full_pipe_5.predict(X_test)

full_pipe_5.score(X_test, y_test)
# full_pipe_minmax

print(f'y_pred min : {y_pred_5.min()}')
print(f'r² score : {full_pipe_5.score(X_test, y_test)}')
print(f'mae score : {mean_absolute_error(y_test, y_pred_5)}')
print(f'mae % score : {mean_absolute_percentage_error(y_test, y_pred_5)}')
print(f'mse score : {mean_squared_error(y_test, y_pred_5)}')
print(f'rmse score : {np.sqrt(mean_squared_error(y_test, y_pred_5))}')

y_pred min : 48440.0
r² score : 0.6735907572662747
mae score : 44833.209043197414
mae % score : 0.24479582999975807
mse score : 4256987612.6632543
rmse score : 65245.59458433385


#### Fifth iteration conclusion:
After adding PCA, our r² score has deteriorated (0.67). The mae score is also worst. We don't have negative prediction.

### Sixth iteration, testing grid search ?