<a href="https://colab.research.google.com/github/Karczel/01219114-2022f-w8-bag/blob/main/model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pycaret feature-engine imbalanced-learn folium mapclassify



In [2]:
import os
import numpy as np
import pandas as pd
os.environ['USE_PYGEOS'] = '0'
import geopandas as gpd
import pickle

import pycaret
from pycaret.regression import *

import sklearn
from sklearn import set_config
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler, RobustScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.feature_selection import SelectKBest, SelectPercentile, mutual_info_regression, f_regression

from imblearn.pipeline import Pipeline as imbPipeline

import feature_engine
from feature_engine.selection import DropFeatures, DropConstantFeatures, DropDuplicateFeatures, DropCorrelatedFeatures
from feature_engine.imputation import MeanMedianImputer, RandomSampleImputer, CategoricalImputer
from feature_engine.outliers import OutlierTrimmer
from feature_engine.transformation import YeoJohnsonTransformer, LogCpTransformer
from feature_engine.encoding import OneHotEncoder as Ohe

from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV

import warnings
warnings.filterwarnings('ignore')

set_config(display='diagram')

pd.options.display.max_rows = 999999
pd.options.display.max_columns = 999999
pd.set_option('display.float_format', lambda x: '%.03f' % x)

In [3]:
print(pd.__version__)
print(gpd.__version__)
print(pycaret.__version__)

2.1.4
1.0.1
3.3.2


# Mount Drive

In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Read Data

In [None]:
# Load the raw workbook and normalise every column label to lower case.
df = pd.read_excel('raw_dataset_eng.xlsx')
df.columns = list(map(str.lower, df.columns))
print(df.shape)
df.head(3)

In [None]:
df.info()

In [None]:
df.isna().sum()

In [None]:
df = df[df['date'] != 0].reset_index(drop=True)

In [None]:
df['date'] = pd.to_datetime(df['date'], format='%Y%m%d')

In [None]:
df = df.sort_values(by='date').reset_index(drop=True)

In [None]:
df[:3]

In [None]:
# After the sort + reset_index above, label 0 is the earliest-dated row; drop it.
# NOTE(review): the reason this record is excluded is not shown here — confirm.
# Not re-runnable: the index is not reset afterwards, so executing this cell a
# second time raises KeyError because label 0 no longer exists.
df = df.drop(index=[0])

In [None]:
df.head()

In [None]:
df.tail()

In [None]:
df.info()

In [None]:
del df['id'], df['date']

In [None]:
df = df.rename(columns={'house_price': 'target'})

In [None]:
df.shape

# Visualization

In [None]:
gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.longitude, df.latitude), crs='EPSG:4326')
gdf = gdf.to_crs(epsg=24047)
print(gdf.shape)

In [None]:
gdf.explore()

# Data Cleaning

In [None]:
# Data cleansing - out-of-bound deletion

# Boundary polygons, reprojected to the CRS of the points; only the geometry
# column is kept so the join below adds nothing but `index_right`.
boundary = gpd.read_file('Nonthaburi.shp').to_crs(gdf.crs.to_string())[['geometry']]

# The left join leaves `index_right` as NaN for points that fall inside no
# polygon; those rows are dropped and the helper column is removed again.
gdf = (
    gpd.sjoin(gdf, boundary, how='left', predicate='within')
    .dropna(subset=['index_right'])
    .drop(columns=['index_right'])
)
print(gdf.shape)
gdf[:3]

In [None]:
# Remove some outliers

def remove_outliers(df, field_name, whisker=1.5):
    """Drop the rows whose ``field_name`` value lies outside the IQR fences.

    The fences are ``Q1 - whisker * IQR`` and ``Q3 + whisker * IQR``, where the
    quartiles are computed with NaN values ignored. Both fences are inclusive.

    Parameters
    ----------
    df : pandas.DataFrame (or subclass)
        Table to filter. It is not modified.
    field_name : str
        Name of the numeric column used to detect outliers.
    whisker : float, default 1.5
        Multiple of the inter-quartile range added beyond each quartile.
        The default reproduces the previous hard-coded behaviour.

    Returns
    -------
    Same type as ``df``
        The retained rows with a fresh 0..n-1 index. Rows whose ``field_name``
        is NaN are removed as well, because NaN fails both comparisons.
    """
    values = df[field_name]
    q25 = np.nanpercentile(values, 25)
    q75 = np.nanpercentile(values, 75)
    reach = (q75 - q25) * whisker

    # Series.between is inclusive on both ends by default (<= and >=).
    keep = values.between(q25 - reach, q75 + reach)

    return df[keep].reset_index(drop=True)

gdf = remove_outliers(gdf, 'target')
gdf.shape

In [None]:
gdf.drop(columns=['geometry']).to_csv('initial_data.csv', index=False)

In [None]:
from google.colab import files

# Download the CSV written by the previous cell. The same relative path is used
# for writing and downloading so the two cells agree whatever the working
# directory is (the absolute '/content/...' path only matched Colab's default).
files.download('initial_data.csv')

In [None]:
m = gdf.explore(column='target', cmap='Reds')
boundary.explore(m=m, style_kwds={'stroke': True, 'color': 'black', 'fill': False, 'fillOpacity': 0})
m

# Find Promising Algorithms

In [None]:
s = setup(gdf.drop(columns=['geometry']), target='target', fold=2, session_id=123)

In [None]:
best = compare_models()

In [None]:
rf = create_model('rf')
rf

In [None]:
plot_model(rf, plot='residuals')

In [None]:
plot_model(rf, plot='error')

# Pipeline Model

## Custom Transformer Class

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class QualityTransformer(TransformerMixin, BaseEstimator):
  """Replace the text labels in the ``quality`` column with ordinal integers.

  Inheriting from ``BaseEstimator`` (in addition to ``TransformerMixin``) gives
  the step ``get_params`` / ``set_params``, so it behaves like a regular
  scikit-learn estimator when the surrounding pipeline is cloned, inspected or
  tuned. The mixin is listed first, as scikit-learn recommends.
  """

  def __init__(self):
    # Label -> rank; a larger number means a better rating.
    self.mapper = {'very bad': 1, 'bad': 2, 'fair': 3, 'good': 4, 'very good': 5}

  def fit(self, X, y=None):
    """Nothing is learned from the data; return ``self`` unchanged."""
    return self

  def transform(self, X, y=None):
    """Return a copy of ``X`` whose ``quality`` column is mapped through ``self.mapper``.

    Values that are not keys of the mapper (including missing values) become
    NaN. ``X`` itself is not modified. Raises ``KeyError`` if ``X`` has no
    ``quality`` column.
    """
    _X = X.copy()
    _X['quality'] = _X['quality'].map(self.mapper)

    return _X

## Train-Test-Split

In [None]:
# Build the modelling table from the cleaned `gdf` (points inside the boundary,
# target outliers removed). The raw `df` was never filtered — every cleaning
# step reassigned `gdf` only — so splitting `df` would discard that work.
# The geometry column is dropped because it is not a model feature.
model_df = gdf.drop(columns=['geometry'])
X = model_df.drop(columns=['target'])
y = model_df['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=0)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

In [None]:
X_train[:1]

## Pipeline - Basic

In [None]:
# Split the feature names by dtype for the ColumnTransformer defined below.
# include='number' selects all numeric dtypes instead of only those matched by
# the 'int'/'float' aliases; a column that ends up in neither list would be
# silently discarded by the transformer's remainder='drop'.
# NOTE(review): dtypes are read before QualityTransformer runs, so if 'quality'
# holds strings here it is listed in cat_cols and gets one-hot encoded after
# being mapped to integers — confirm that this is intended.
features = df.drop(columns=['target'])
num_cols = features.select_dtypes(include='number').columns.tolist()
cat_cols = features.select_dtypes(include=['object']).columns.tolist()

print(num_cols)
print(cat_cols)

In [None]:
num_pipeline = Pipeline(steps=[
                                ('impute', SimpleImputer(strategy='mean')),
                                ('robust_scale', RobustScaler())
                            ])
cat_pipeline = Pipeline(steps=[
                                ('impute', SimpleImputer(strategy='most_frequent')),
                                ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
                            ])

In [None]:
col_trans = ColumnTransformer(transformers=[
                                            ('num_pipeline', num_pipeline, num_cols),
                                            ('cat_pipeline', cat_pipeline, cat_cols)
                                            ],
                                            remainder='drop',
                                            n_jobs=-1)

In [None]:
rf = RandomForestRegressor(n_estimators=100, max_depth=5, random_state=0)

In [None]:
pipeline = Pipeline(steps=[
                            ('quality_trans', QualityTransformer()),
                            ('col_trans', col_trans),
                            ('features_selector', SelectPercentile(mutual_info_regression, percentile=25)),
                            ('model', rf)
                        ])

In [None]:
display(pipeline)

In [None]:
pipeline.fit(X_train, y_train)
score = pipeline.score(X_test, y_test)
print(f'Model R2: {score:.4f}')

In [None]:
# grid_params = {
#     'model__n_estimators': [300, 750],
#     'model__max_depth': [7, 9]
# }

# pipeline = GridSearchCV(pipeline, grid_params, cv=2, scoring='r2')
# pipeline.fit(X_train, y_train)

# print('Best Score of train set: ' + str(pipeline.best_score_))
# print('Best parameter set: ' + str(pipeline.best_params_))
# print('Test Score: ' + str(pipeline.score(X_test, y_test)))

## Pipeline - Feature-Engine Library

In [None]:
gbt = GradientBoostingRegressor(n_estimators=100, random_state=0)
knn = KNeighborsRegressor(n_neighbors=7, n_jobs=-1)
rf = RandomForestRegressor(n_estimators=100, max_depth=5, n_jobs=-1, random_state=0)

gbt_knn_rf = VotingRegressor(estimators=[('gbt', gbt), ('knn', knn), ('rf', rf)], weights=[5, 2, 3], n_jobs=-1, verbose=True)

In [None]:
X_train.info()

In [None]:
# Feature-engine preprocessing steps followed by the random forest `rf`
# defined a few cells above. Steps run in the order listed.
pipeline = Pipeline([
    # Step 1: Drop features containing only 1 value
    ('drop_constant_values', DropConstantFeatures(tol=1, missing_values='ignore')),

    # Step 2: Drop duplicated features
    ('drop_duplicates', DropDuplicateFeatures()),

    # Step 3: Drop correlated features
    ('drop_correlated', DropCorrelatedFeatures(method='pearson', threshold=0.7)),

    # Step 4: Some transformations (map the 'quality' labels to integers 1-5)
    ('quality_trans', QualityTransformer()),

    # Step 5: Imputations
    ('impute_num', MeanMedianImputer(imputation_method='mean')),
    ('impute_cat', CategoricalImputer(imputation_method='frequent')),

    # Step 6: Log transformations
    # NOTE(review): default arguments are used; presumably this applies to
    # every numeric variable — confirm against the feature-engine docs.
    ('log', LogCpTransformer()),

    # Step 7: One hot encoding
    ('ohe', Ohe()),

    # Step 8: Regressor
    ('ensemble', rf)
])

display(pipeline)

In [None]:
pipeline.fit(X_train, y_train)
score = pipeline.score(X_test, y_test)
print(f'Model R2: {score:.4f}')

In [None]:
pipeline[:-1].fit_transform(X_train)[:3]

In [None]:
pipeline[:-1].fit_transform(X_train).columns.tolist()

In [None]:
# The preprocessing steps were already fitted by pipeline.fit above, so reuse
# them with transform() instead of re-fitting them just to read column names.
feature_names = pipeline[:-1].transform(X_train).columns.tolist()
importances = pipeline[-1].feature_importances_

# One row per model input feature, most important first.
feature_imp = (
    pd.DataFrame({'features': feature_names, 'importance_score': importances})
    .sort_values(by='importance_score', ascending=False)
    .reset_index(drop=True)
)
feature_imp

## Pipeline - Feature-Engine + Sklearn

In [None]:
from feature_engine.wrappers import SklearnTransformerWrapper

In [None]:
pipeline = Pipeline([
    # Step 1: Drop features containing only 1 value
    ('drop_constant_values', DropConstantFeatures(tol=1, missing_values='ignore')),

    # Step 2: Drop duplicated features
    ('drop_duplicates', DropDuplicateFeatures()),

    # Step 3: Drop correlated features
    ('drop_correlated', DropCorrelatedFeatures(method='pearson', threshold=0.7)),

    # Step 4: Some transformations
    ('quality_trans', QualityTransformer()),

    # Step 5: Imputations
    ('impute_num', MeanMedianImputer(imputation_method='mean')),
    ('impute_cat', CategoricalImputer(imputation_method='frequent')),

    # Step 6: Robust scaling
    ('robust_scale', SklearnTransformerWrapper(RobustScaler())),

    # Step 7: One hot encoding
    ('ohe', Ohe()),

    # Step 8: Voting regressor
    ('ensemble', gbt_knn_rf)
])

display(pipeline)

In [None]:
pipeline[:-1].fit_transform(X_train)[:3]

In [None]:
pipeline.fit(X_train, y_train)
score = pipeline.score(X_test, y_test)
print(f'Model R2: {score:.4f}')

## Pipeline - Feature Selection

In [None]:
pipeline = Pipeline([
    # Step 1: Drop features containing only 1 value
    ('drop_constant_values', DropConstantFeatures(tol=1, missing_values='ignore')),

    # Step 2: Drop duplicated features
    ('drop_duplicates', DropDuplicateFeatures()),

    # Step 3: Drop correlated features
    ('drop_correlated', DropCorrelatedFeatures(method='pearson', threshold=0.7)),

    # Step 4: Some transformations
    ('quality_trans', QualityTransformer()),

    # Step 5: Imputations
    ('impute_num', MeanMedianImputer(imputation_method='mean')),
    ('impute_cat', RandomSampleImputer(random_state=0)),

    # Step 6: Robust scaling
    ('robust_scale', SklearnTransformerWrapper(RobustScaler())),

    # Step 7: One hot encoding
    ('ohe', Ohe()),

    # Step 8: Feature selection
    ('feature_selector', SelectPercentile(mutual_info_regression, percentile=50)),

    # Step 9: Voting regressor
    ('ensemble', gbt_knn_rf)
])

display(pipeline)

In [None]:
pipeline[:-1].fit_transform(X_train, y_train)[:3]

In [None]:
pipeline.fit(X_train, y_train)
score = pipeline.score(X_test, y_test)
print(f'Model R2: {score:.4f}')

## Pipeline - imbPipeline

In [None]:
pipeline = imbPipeline([
    # Step 1: Drop features containing only 1 value
    ('drop_constant_values', DropConstantFeatures(tol=1, missing_values='ignore')),

    # Step 2: Drop duplicated features
    ('drop_duplicates', DropDuplicateFeatures()),

    # Step 3: Drop correlated features
    ('drop_correlated', DropCorrelatedFeatures(method='pearson', threshold=0.7)),

    # Step 4: Some transformations
    ('quality_trans', QualityTransformer()),

    # Step 5: Imputations
    ('impute_num', MeanMedianImputer(imputation_method='mean')),
    ('impute_cat', RandomSampleImputer(random_state=0)),

    # Step 6: Robust scaling
    ('robust_scale', SklearnTransformerWrapper(RobustScaler())),

    # Step 7: One hot encoding
    ('ohe', Ohe()),

    # Step 8: Feature selection
    ('feature_selector', SelectPercentile(mutual_info_regression, percentile=50)),

    # Step 9: Voting regressor
    ('ensemble', gbt_knn_rf)
])

display(pipeline)

In [None]:
pipeline.fit(X_train, y_train)
score = pipeline.score(X_test, y_test)
print(f'Model R2: {score:.4f}')

## Hyperparameter Tuning

In [None]:
pipeline.get_params()

In [None]:
params = {
    'ensemble__gbt__n_estimators': [300, 500],
    'ensemble__knn__n_neighbors': [7, 9],
    'ensemble__rf__max_depth': [5, 7],
    'ensemble__rf__n_estimators': [300, 500],
}

In [None]:
%%time
grid = RandomizedSearchCV(pipeline, params, scoring='r2', verbose=2, cv=2)
grid.fit(X_train, y_train)

print('Best Score of train set: ' + str(grid.best_score_))
print('Best parameter set: ' + str(grid.best_params_))
print('Test Score: ' + str(grid.score(X_test, y_test)))

## Prediction

In [None]:
X_test[X_test['direction'].notna()][:1].to_dict('records')

In [None]:
# `pipeline` was already fitted in the imbPipeline section above, so predict
# with it directly instead of re-fitting the whole ensemble for a single row.
sample = X_test[X_test['direction'].notna()][:1]
pipeline.predict(sample)[0]

## Save Pipeline

In [None]:
# import pickle

In [None]:
# NOTE(review): the original snippet referenced `pipeline1`, which is not
# defined in this notebook — confirm which fitted object should be saved.
# with open('final_model.pkl', 'wb') as f:
#     pickle.dump(pipeline, f)