# PRE-EDA: DATA LOADING
----

In [None]:
# Data reading and visualization
import os
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
%matplotlib inline

# Statistical analysis
from scipy.stats import norm

# Scikit-learn
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

# XGBoost & LightGBM
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

import warnings
warnings.filterwarnings('ignore')

## CONFIGS
---

In [None]:
# Base directory constant for the notebook.
BASE_PATH = "./"

# Keyword arguments unpacked into RandomForestRegressor.
rf_params = {
    "n_estimators": 100,
    "max_depth": 4,
    "min_samples_split": 2,
    "min_samples_leaf": 1,
}

# Keyword arguments unpacked into XGBRegressor.
xgb_params = {
    "n_estimators": 1000,
    "max_depth": 4,
    "min_child_weight": 2,
    "learning_rate": 0.01,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "booster": "gbtree",
}

# Keyword arguments unpacked into LGBMRegressor.
lgb_params = {
    "n_estimators": 1000,
    "max_depth": 4,
    "learning_rate": 0.01,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "objective": "regression",
}

## HELPER FUNCTIONS
---

In [None]:
# Print dataset usage stats
def print_info(df):
    """Print a short summary of ``df``.

    The summary lists, in order: the shape, the column index, the number of
    columns per dtype, and the total reported by ``df.memory_usage()``
    converted to megabytes and rounded to two decimals.

    Args:
        df: DataFrame to summarise.
    """
    print(f"\nDataframe Shape: {df.shape}")
    print(f"\nDataframe Columns: {df.columns}")

    dtype_counts = df.dtypes.value_counts()
    print(f"\nDataframe dtypes: \n{dtype_counts}")

    total_bytes = df.memory_usage().sum()
    megabytes = round(total_bytes / 1024**2, 2)
    print(f"\nDataframe memory usage: {megabytes} MB")

# Visualize missing data
def visualize_missing_data(df):
    """Show a bar chart of the percentage of missing values per column.

    Columns whose missing percentage is exactly zero are left out; the
    remaining columns are ordered by ascending missing percentage.

    Args:
        df: DataFrame to inspect.
    """
    missing_pct = (df.isnull().sum() / len(df)) * 100
    missing_pct = missing_pct[missing_pct != 0].sort_values()

    # The Series index already holds the column names, so it is plotted
    # as-is. Renaming it through a label mapping would relabel any column
    # literally called 'index' or 0 and do nothing for every other column.
    fig = px.bar(x=missing_pct.index, y=missing_pct,
                 title='Missing Data by Feature', template='plotly_dark')
    fig.update_xaxes(title_text="Feature")
    fig.update_yaxes(title_text="Missing (%)")

    fig.show()

# Plot Histogram of dataset
def plot_histogram(df, distline=True):
    """Plot one histogram per column of ``df`` and show the figure.

    Args:
        df: DataFrame whose columns are plotted.
        distline: When True, each column is drawn with ``sns.distplot`` and
            a fitted normal curve on a grid of subplots. When False, the
            columns are drawn with ``DataFrame.hist`` using 50 bins.
    """
    if distline:
        n_cols = 4
        # Keep the 4x4 layout for up to 16 columns and add rows beyond that,
        # so wider frames no longer overflow the subplot grid.
        n_rows = max(4, -(-len(df.columns) // n_cols))
        plt.figure(figsize=(15, 15 * n_rows / 4))
        for i, column in enumerate(df.columns):
            plt.subplot(n_rows, n_cols, i + 1)
            plt.title(column)
            plt.xlabel(column)
            sns.distplot(df[column], fit=norm)
    else:
        # DataFrame.hist creates its own figure, so none is opened here.
        df.hist(figsize=(15, 15), bins=50)

    # Lay out and render exactly once, whichever branch drew the axes.
    plt.tight_layout()
    plt.show()

# Fit data to model(s)
def fit_robust_pipeline(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` behind a ``RobustScaler`` and score it on the test split.

    Args:
        model: Estimator used as the final pipeline step; it is fitted in
            place as part of the pipeline.
        X_train: Training features.
        y_train: Training targets.
        X_test: Test features.
        y_test: Test targets.

    Returns:
        The value of ``Pipeline.score(X_test, y_test)`` rounded to two
        decimals.
    """
    steps = [('scaler', RobustScaler()), ('model', model)]
    fitted = Pipeline(steps).fit(X_train, y_train)

    return round(fitted.score(X_test, y_test), 2)

# Scaling Data
def scale_data(scaler, df, feats_to_transform):
    """Return a copy of ``df`` with the selected columns rescaled.

    Args:
        scaler: Object exposing ``fit_transform``; it is fitted on the raw
            values of the selected columns.
        df: Source DataFrame; it is not modified.
        feats_to_transform: Column labels to rescale.

    Returns:
        A copy of ``df`` in which ``feats_to_transform`` hold the output of
        ``scaler.fit_transform``.
    """
    result = df.copy()
    raw_values = result[feats_to_transform].values
    result[feats_to_transform] = scaler.fit_transform(raw_values)

    return result

# NOTE(review): these two loops run at module level, and `cat_feats`,
# `label_enc_df` and `onehot_enc_df` are not defined anywhere in the visible
# part of this file. Unless an earlier cell defines them, running this cell
# raises NameError. They look like scratch code that `encode_data` below was
# meant to replace; confirm and remove.

# LabelEncoding: replace each categorical column with its integer codes,
# using a fresh LabelEncoder per column.
for feat in cat_feats:
    label_enc = LabelEncoder()
    label_enc_df[feat] = label_enc.fit_transform(label_enc_df[feat])

# OneHotEncoding: replace each categorical column with one column per
# encoder output, named "<feat>_<position>".
for feat in cat_feats:
    onehot_enc = OneHotEncoder(handle_unknown='ignore')
    # The encoder output is densified and wrapped in a DataFrame that gets a
    # default integer index.
    transformed = pd.DataFrame(onehot_enc.fit_transform(onehot_enc_df[feat].values.reshape(-1, 1)).toarray())
    transformed.columns = [f"{feat}_{i}" for i in transformed.columns]
    # NOTE(review): the join presumably lines rows up correctly only when
    # `onehot_enc_df` also has a default integer index; verify.
    onehot_enc_df = onehot_enc_df.join(transformed)
    onehot_enc_df.drop([feat], axis=1, inplace=True)


# Encode Data
def encode_data(df, encoder_type, feats_to_encode):
    """Return a copy of ``df`` with the given columns encoded.

    Args:
        df: Source DataFrame; it is not modified.
        encoder_type: ``'Label'`` to replace each column with the output of
            a ``LabelEncoder``, or ``'OneHot'`` to replace each column with
            one column per ``OneHotEncoder`` output, named
            ``"<feat>_<position>"``.
        feats_to_encode: Column labels to encode.

    Returns:
        The encoded copy of ``df``. For any other ``encoder_type`` the
        function prints "Invalid Encoder Type" and returns ``None``.
    """
    if encoder_type == 'Label':
        # Work on a copy of the caller's frame instead of a module global.
        encoded = df.copy()
        for feat in feats_to_encode:
            encoded[feat] = LabelEncoder().fit_transform(encoded[feat])

        return encoded

    if encoder_type == 'OneHot':
        encoded = df.copy()
        for feat in feats_to_encode:
            onehot_enc = OneHotEncoder(handle_unknown='ignore')
            matrix = onehot_enc.fit_transform(
                encoded[feat].values.reshape(-1, 1)).toarray()
            # Reuse the frame's own index so the join lines rows up even when
            # the index is not the default 0..n-1 range.
            transformed = pd.DataFrame(
                matrix,
                index=encoded.index,
                columns=[f"{feat}_{i}" for i in range(matrix.shape[1])],
            )
            encoded = encoded.join(transformed).drop(columns=[feat])

        return encoded

    # Unknown encoder name: keep the original report-and-return-None contract.
    print("Invalid Encoder Type")
    return None

# Train every model and report its test score
def evaluate_performance(X, Y, test_size=0.2, scale_data=False, scaler=None,
                         feats_to_transform=None, random_state=None):
    """Fit the three configured regressors and print their test scores.

    Each model is built from its module-level parameter dict (``rf_params``,
    ``xgb_params``, ``lgb_params``), fitted through ``fit_robust_pipeline``
    on a train/test split of ``X`` and ``Y``, and scored on the test part.

    Args:
        X: Feature DataFrame.
        Y: Target values.
        test_size: Passed to ``train_test_split``.
        scale_data: When truthy, ``X`` is first passed through the
            module-level ``scale_data`` helper.
        scaler: Scaler handed to the helper; required when ``scale_data``
            is truthy.
        feats_to_transform: Columns handed to the helper; required when
            ``scale_data`` is truthy.
        random_state: Passed to ``train_test_split``. The default ``None``
            keeps the original unseeded split.

    Returns:
        Tuple ``(rf, xgb, lgb)`` of the model objects that were passed to
        the pipelines.

    Raises:
        ValueError: If ``scale_data`` is truthy but ``scaler`` or
            ``feats_to_transform`` is missing.
    """
    # Define models
    rf = RandomForestRegressor(**rf_params)
    xgb = XGBRegressor(**xgb_params)
    lgb = LGBMRegressor(**lgb_params)

    # Transform data
    if scale_data:
        if scaler is None or feats_to_transform is None:
            raise ValueError(
                "scaler and feats_to_transform are required when scale_data is set")
        # The boolean parameter shadows the module-level `scale_data` helper
        # inside this function, so calling the bare name would call a bool.
        # Fetch the helper from the module namespace instead.
        scale_helper = globals()["scale_data"]
        # NOTE(review): scaling happens before the split, so the scaler is
        # fitted on rows that later land in the test set; confirm this is
        # intended.
        X = scale_helper(scaler, X, feats_to_transform)

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state)

    # Pass models to pipeline
    rf_score = fit_robust_pipeline(rf, X_train, y_train, X_test, y_test)
    xgb_score = fit_robust_pipeline(xgb, X_train, y_train, X_test, y_test)
    lgb_score = fit_robust_pipeline(lgb, X_train, y_train, X_test, y_test)

    # Print scores
    print(f"Model Scores: \n{'-'*25}\n")
    print(f"RandomForestRegressor Score: {rf_score}")
    print(f"XGBRegressor Score: {xgb_score}")
    print(f"LGBMRegressor Score: {lgb_score}")

    return rf, xgb, lgb

# Plot feature importance
def plot_feature_importance(features, title, model):
    """Show a horizontal bar chart of ``model.feature_importances_``.

    Args:
        features: Labels placed on the y axis, one per importance value.
        title: Chart title.
        model: Object exposing a ``feature_importances_`` attribute.
    """
    importances = model.feature_importances_

    fig = px.bar(x=importances, y=features, template='plotly_dark')
    fig.update_layout(title=f"{title}")
    fig.update_yaxes(title_text="Feature")
    fig.update_xaxes(title_text="Feature Importance")

    fig.show()

# EXPLORATORY DATA ANALYSIS (EDA)
----

# DATA PREPROCESSING
----

## NORMALIZATION 
----

# FEATURE GENERATION
----

# FEATURE SELECTION
----