# Table of Contents
- Imports
- Add Functions
- Load Data
- Feature Extraction & Data Cleaning
- Remove Multicollinearity
- Train Test Split
- Encoding
- Save Train and Test Data
- Pipeline & Hyper-parameter Tuning
- Feature Importance
- Research

# Imports

In [1]:
import datetime
from itertools import combinations
import os
import pickle

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.inspection import permutation_importance

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PowerTransformer
from sklearn import neighbors
from sklearn.base import BaseEstimator

import scipy.stats as st

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant


import warnings
warnings.filterwarnings('ignore')

from xgboost import XGBRegressor

%matplotlib inline


The sklearn.metrics.classification module is  deprecated in version 0.22 and will be removed in version 0.24. The corresponding classes / functions should instead be imported from sklearn.metrics. Anything that cannot be imported from sklearn.metrics is now part of the private API.



# Add Functions

In [2]:
# Execute the helper script inside this notebook's namespace (-i), so the names
# it defines become available to later cells.
# NOTE(review): mean_encoding, used in the Encoding section, is not defined in
# this notebook — presumably it comes from this script; confirm.
%run -i "scripts/functions.py"

# Load Data

In [3]:
# Build the dataset location relative to the notebook and read it into `df`.
data_dir, csv_name = 'data', 'kickstarter_2016_clean.csv'
file_path = os.path.join(data_dir, csv_name)
df = pd.read_csv(file_path)

# Feature Extraction & Data Cleaning

In [4]:
# Change the features to the correct data type
df['deadline'] = df['deadline'].astype('datetime64[ns]')
df['launched'] = df['launched'].astype('datetime64[ns]')

#[Fix] Salt of the Earth: A Dead Sea Movie (Canceled)
# The correction must happen BEFORE the launch_* features are derived, otherwise
# they keep the wrong date and the row is removed by the year filter below.
# A single .iloc[row, col] assignment is used because chained indexing
# (df.iloc[2443]["launched"] = ...) writes to a temporary copy and leaves df untouched.
launched_pos = df.columns.get_loc('launched')
df.iloc[2443, launched_pos] = df.iloc[2443, launched_pos].replace(year=2010, month=7, day=23)

#Feature Extract
# Date parts of the deadline and launch timestamps (vectorised .dt accessor).
df['deadline_year'] = df['deadline'].dt.year
df['deadline_month'] = df['deadline'].dt.month
df['deadline_day'] = df['deadline'].dt.day
df['launch_year'] = df['launched'].dt.year
df['launch_month'] = df['launched'].dt.month
df['launch_day'] = df['launched'].dt.day
# Size of the project name in whitespace-separated words and in characters.
df['name_word_count'] = df['name'].str.split().str.len()
df['name_char_count'] = df['name'].str.len()

#[Remove] Could not confirm the rest of the observation where launch year is 1970.
df.drop(df[df["launch_year"] < 2009].index, inplace=True)

# Remove Multicollinearity

In [5]:
# The VIF values pointed to a correlation between country and currency, with
# country scoring slightly higher, so currency is kept and country is dropped.
# The raw 'launched', 'name' and 'deadline' columns are removed in the same step.
columns_to_remove = ['country', 'launched', 'name', 'deadline']
df.drop(columns=columns_to_remove, inplace=True)

# Train Test Split

In [6]:
# Columns left out of the feature matrix.
# 'pledged' (the target) intentionally stays in X for now: the Encoding cell
# passes y.name to mean_encoding and drops the column afterwards.
excluded_columns = ['state', 'backers', 'usd_pledged']
X = df.drop(columns=excluded_columns)
y = df['pledged']

# Hold out 25% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# Encoding

In [7]:
# Columns handed to mean_encoding together with the name of the target column.
tmp = ['category', 'main_category', 'currency']
# mean_encoding is not defined in this notebook; presumably it is provided by
# scripts/functions.py — TODO confirm its signature and whether it returns a copy.
encoded_X_train = mean_encoding(X_train, tmp, y.name)
# NOTE(review): the test split is encoded from its own rows (which still contain
# the target column) rather than from values learned on the training split.
# This looks like target leakage — confirm against mean_encoding's implementation.
encoded_X_test = mean_encoding(X_test, tmp, y.name)
# X still carried the target column up to this point; remove it from both
# encoded frames so it is not used as a feature.
encoded_X_train.drop(columns=['pledged'], inplace=True)
encoded_X_test.drop(columns=['pledged'], inplace=True)

# Save Train and Test Data

In [20]:
# Re-attach the outcome columns (joined on the row index) to each encoded split
# and write the result to disk without the index column.
outcome_columns = ['state', 'backers', 'usd_pledged', 'pledged']
prepared_splits = [
    (encoded_X_train, 'data/kickstarter_train_2016_prep.csv'),
    (encoded_X_test, 'data/kickstarter_test_2016_prep.csv'),
]

for encoded_split, csv_path in prepared_splits:
    with_outcomes = encoded_split.merge(
        df[outcome_columns], left_index=True, right_index=True
    )
    with_outcomes.to_csv(csv_path, index=False)

# Pipeline & Hyper-parameter Tuning

In [20]:
class DummyEstimator(BaseEstimator):
    """Placeholder estimator that only reserves the 'reg' slot of the pipeline.

    The grid search replaces it with a real regressor through the 'reg' entry
    of each parameter grid, so it never has to learn anything. The methods
    follow the estimator call convention (X, y) so the placeholder does not
    raise a TypeError if the pipeline is ever fitted or scored without being
    replaced. The arguments are optional, so the old no-argument calls still work.
    """

    def fit(self, X=None, y=None, **fit_params):
        """Do nothing and return self, as estimator ``fit`` methods do."""
        return self

    def score(self, X=None, y=None):
        """Do nothing; returns None like the original placeholder."""
        return None

# Create a pipeline with a single, swappable regression step.
pipe = Pipeline([('reg', DummyEstimator())]) # Placeholder Estimator

# Candidate learning algorithms and their hyperparameters.
# Each dict is one grid: 'reg' swaps the estimator in, 'reg__<param>' tunes it.
# random_state is fixed on every estimator so repeated runs give the same result.
search_space = [
    {'reg': [XGBRegressor(random_state=42)], # Actual Estimator
     # 'learning_rate' is the scikit-learn wrapper's name for xgboost's 'eta'.
     'reg__learning_rate': [0.2, 0.3, 0.7, 0.9],
     'reg__max_depth': [6, 10, 17],
     # XGBRegressor has no 'max_iter'; the number of boosting rounds is
     # 'n_estimators'. With 'max_iter' the grid refit identical models 3 times.
     'reg__n_estimators': [100, 200, 300]},

    {'reg': [GradientBoostingRegressor(random_state=42)], # Actual Estimator
     'reg__learning_rate': [0.1, 0.2, 0.4],  #Parameters
     'reg__n_estimators': [100, 120, 150]},

    {'reg': [RandomForestRegressor(random_state=42)], # Actual Estimator
     'reg__n_estimators': [25, 50, 100],
     'reg__max_depth': [None, 40, 100],
     'reg__oob_score': [True]},
]


# Create grid search
gs = GridSearchCV(pipe, search_space)

In [None]:
# Tune and fit on the TRAINING split only. Fitting on the test split (as before)
# leaves no unseen data to evaluate the selected model on.
gs.fit(encoded_X_train, y_train)

In [None]:
# Display the pipeline selected by the grid search.
gs.best_estimator_

In [None]:
# Persist the best pipeline found by the grid search.
# Requires `pickle`, which is imported in the notebook's import cell.
model_file_name = 'semi_final_model.pkl'

#Save model
with open(model_file_name, 'wb') as out:
    pickle.dump(gs.best_estimator_, out)

#Read Model
# WARNING: unpickling executes arbitrary code — only load files you created yourself.
#with open(model_file_name , 'rb') as inp:
#    clf = pickle.load(inp)

# Feature Importance

In [None]:
# Permutation importance of the tuned pipeline.
# Uses gs.best_estimator_ (no variable named `model` exists in this notebook)
# and the encoded hold-out split: the raw X still contains unencoded categorical
# columns and the target column, so it cannot be passed to the fitted pipeline.
results = permutation_importance(
    gs.best_estimator_,
    encoded_X_test,
    y_test,
    scoring='neg_mean_squared_error',
    random_state=42,  # permutations are random; fix the seed for repeatable scores
)
importance = results.importances_mean
feature_names = list(encoded_X_test.columns)

# summarize feature importance
for i, v in enumerate(importance):
    print('Feature: %0d, Score: %.5f' % (i, v))

# plot feature importance, labelled by column name so the chart stands alone
fig, ax = plt.subplots()
ax.bar(range(len(importance)), importance)
ax.set_xticks(range(len(importance)))
ax.set_xticklabels(feature_names, rotation=90)
ax.set_xlabel('Feature')
ax.set_ylabel('Mean importance (increase in MSE when permuted)')
ax.set_title('Permutation feature importance');

# Research

- [Box-Cox transform and the Yeo-Johnson transform](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PowerTransformer.html)
- [Box vs Yeo](https://stats.stackexchange.com/questions/430419/box-cox-vs-yeo-johnson)
- [Normalizing Guidelines](https://stackoverflow.com/questions/49444262/normalize-data-before-or-after-split-of-training-and-testing-data)
- [Rate per capita](https://www.robertniles.com/stats/percap.shtml)
- [Penalized regression with categorical variables](https://stats.stackexchange.com/questions/359015/ridge-lasso-standardization-of-dummy-indicators)
- [Feature Importance](https://medium.com/bigdatarepublic/feature-importance-whats-in-a-name-79532e59eea3)
- [Better feature importance](https://machinelearningmastery.com/calculate-feature-importance-with-python/)