In [2]:
import datetime as dt

import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import seaborn as sns
import sklearn.cluster
import xgboost as xgb
from scipy import stats
from sklearn import metrics
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

file = '..\data\external\Legally_Operating_Businesses.csv'
df = pd.read_csv(file)
df = df[df['Latitude'].isna() == False]
df = df[df['Business Name'].isna()==False]
df = df[df['Address State']=='NY']
df['License Creation Date'] = pd.to_datetime(df['License Creation Date'], format = '%m/%d/%Y')
df['License Expiration Date'] = pd.to_datetime(df['License Expiration Date'], format = '%m/%d/%Y')
df['License Status'] = df['License Status']=='Active'
df['License Status'] = df['License Status'].astype('int')
drop_cols = ['DCA License Number', 'License Type', 'Business Name', 'Business Name 2', 'Address Building', 'Address Street Name', 'Secondary Address Street Name', 'Address City', 'Address State', 'Address ZIP', 'Contact Phone Number', 'Address Borough','Borough Code','Community Board','Council District','BIN','BBL','NTA','Census Tract','Detail','Location']
df.drop(labels = drop_cols, axis=1, inplace = True)
df = df[df['Longitude'] > -76]
df.reset_index(drop = True, inplace = True)
df['date_diffs']=(df['License Expiration Date']-df['License Creation Date']).dt.days
df=df[df['date_diffs']>0]
df['Start_date']=(df['License Creation Date']-np.min(df['License Creation Date'])).dt.days
df_small = df.sample(frac=0.10)
df.head()

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,License Expiration Date,License Status,License Creation Date,Industry,Longitude,Latitude,date_diffs,Start_date
0,2022-06-30,1,2007-01-02,Electronic & Appliance Service,-73.835446,40.838469,5658.0,10935
1,2022-05-01,1,2018-10-31,Employment Agency,-73.795002,40.710524,1278.0,15255
2,2022-06-30,1,2015-10-27,Electronic & Appliance Service,-74.010425,40.645018,2438.0,14155
3,2022-05-01,1,2012-07-13,Employment Agency,-73.969382,40.792751,3579.0,12954
4,2022-06-30,1,2012-01-10,Electronic & Appliance Service,-73.825319,40.733833,3824.0,12769


In [494]:
class Cluster_Adder():
    def __init__(self, K):
        self.K = K
    
    def fit(self, X , y):
        
        kmeans = sklearn.cluster.KMeans(n_clusters=self.K)
        self.assigned_cluster=kmeans.fit_predict(X[:,0:3])
        means=np.zeros(self.K)
        for i in range(self.K):
            means[i]=y[self.assigned_cluster==i].mean()
        self.means=means
        self.kmeans=kmeans
        return self
    
    def transform(self, X):
        
        # Use the already predicted clusters to save time if this is what we trained our clusters on
        if (X.shape[0]==len(self.assigned_cluster)):
            cluster_col=np.zeros(X.shape[0])
            for i in range(self.K):
                cluster_col[self.assigned_cluster==i]=self.means[i]
            return np.column_stack((X,cluster_col))
    
        # Otherwise we predict the clusters of the test points
        assigned_cluster=self.kmeans.predict(X[:,0:3])

        cluster_col=np.zeros(X.shape[0])
        
        for i in range(self.K):
            cluster_col[assigned_cluster==i]=self.means[i]
        return np.column_stack((X,cluster_col))

In [5]:
# This also adds a column with the mean for the cluster, but deletes the location and time data
class Cluster_Adder_With_Deletion():
    def __init__(self, K):
        self.K = K
    
    def fit(self, X , y):
        
        kmeans = sklearn.cluster.KMeans(n_clusters=self.K)
        self.assigned_cluster=kmeans.fit_predict(X[:,0:3])
        means=np.zeros(self.K)
        for i in range(self.K):
            means[i]=y[self.assigned_cluster==i].mean()
        self.means=means
        self.kmeans=kmeans
        return self
    
    def transform(self, X):
        
        # Use the already predicted clusters to save time if this is what we trained our clusters on
        if (X.shape[0]==len(self.assigned_cluster)):
            cluster_col=np.zeros(X.shape[0])
            for i in range(self.K):
                cluster_col[self.assigned_cluster==i]=self.means[i]
            # The subset here deletes the time and location data
            return np.column_stack((X[:,3:],cluster_col))
    
        # Otherwise we predict the clusters of the test points
        assigned_cluster=self.kmeans.predict(X[:,0:3])

        cluster_col=np.zeros(X.shape[0])
        
        for i in range(self.K):
            cluster_col[assigned_cluster==i]=self.means[i]
        # The subset here deletes the time and location data
        return np.column_stack((X[:,3:],cluster_col))

In [27]:
def objective_clusters(trial):
    
    data = df_small[['Start_date','Longitude','Latitude','License Status','Industry']]
    target = df_small[['date_diffs']]
    
    K = trial.suggest_int("K", 2, 200)
    
    alpha=trial.suggest_uniform('alpha',0.1,1)
    
    beta=trial.suggest_uniform('beta',0.1,1)
    
    parameter = {
      'max_depth':trial.suggest_int('depth', 3, 5), # show integer parameters between 3 and 5 for depth
      'min_child_weight':trial.suggest_int('childweight',0,5), # show integer parameters between 0 and 5 for childweight
      'learning_rate':trial.suggest_loguniform('ourlearning_rate',0.05,0.6), # set a log distribution between 0.05 and 0.5 for learning rate
      'colsample_bytree':trial.suggest_uniform('colsample_bytree',0.4,0.9), # set a uniformly distributed numbers between 0.4 and 0.9 for colsample_bytree
      'subsample':trial.suggest_uniform('sample',0.4,0.9)
    }
    
    # preprocessor to avoid data leakage
    preprocessor = ColumnTransformer(transformers = [('scaler1', MinMaxScaler((0,alpha)),['Longitude', 'Latitude']),('scaler2', MinMaxScaler((0,beta)),['Start_date']), ('onehot', OneHotEncoder(sparse=False,handle_unknown = 'ignore'), ['Industry'])])
    
    # Add a callback for pruning.
    pruning_callback = optuna.integration.XGBoostPruningCallback(trial, "validation-erorr")
    
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),('clusterer',Cluster_Adder_With_Deletion(K)),('model', xgb.XGBRegressor(**parameter))])

    return np.mean(cross_val_score(pipeline, data, target, cv=3,))


In [28]:
study_clusters = optuna.create_study(direction='maximize',study_name='Clusters')
study_clusters.optimize(objective_clusters,n_trials=25)

[32m[I 2022-06-04 01:18:37,676][0m A new study created in memory with name: Clusters[0m
[32m[I 2022-06-04 01:18:58,342][0m Trial 0 finished with value: 0.23596872877445244 and parameters: {'K': 49, 'alpha': 0.586115378490039, 'beta': 0.5112062934358915, 'depth': 3, 'childweight': 1, 'ourlearning_rate': 0.16652158965954714, 'colsample_bytree': 0.886433746569772, 'sample': 0.7372462553722179}. Best is trial 0 with value: 0.23596872877445244.[0m
[32m[I 2022-06-04 01:19:44,369][0m Trial 1 finished with value: 0.24901249098411884 and parameters: {'K': 197, 'alpha': 0.3012492037578842, 'beta': 0.5070936263454083, 'depth': 5, 'childweight': 2, 'ourlearning_rate': 0.09717343593273509, 'colsample_bytree': 0.6413545823242757, 'sample': 0.7596715169247216}. Best is trial 1 with value: 0.24901249098411884.[0m
[32m[I 2022-06-04 01:20:24,174][0m Trial 2 finished with value: 0.23387937638964876 and parameters: {'K': 166, 'alpha': 0.29889763999361907, 'beta': 0.24303913116734613, 'depth': 3

[32m[I 2022-06-04 01:28:10,314][0m Trial 23 finished with value: 0.2543154274048207 and parameters: {'K': 43, 'alpha': 0.19909830786079694, 'beta': 0.7532081802947215, 'depth': 3, 'childweight': 3, 'ourlearning_rate': 0.2804164736191705, 'colsample_bytree': 0.5124398067810731, 'sample': 0.5501993224608961}. Best is trial 15 with value: 0.2591853752400963.[0m
[32m[I 2022-06-04 01:28:39,891][0m Trial 24 finished with value: 0.25611212631892916 and parameters: {'K': 75, 'alpha': 0.3506891543438483, 'beta': 0.9286938840418915, 'depth': 3, 'childweight': 2, 'ourlearning_rate': 0.1921337561709016, 'colsample_bytree': 0.4424821517873081, 'sample': 0.6245049358564088}. Best is trial 15 with value: 0.2591853752400963.[0m


In [531]:
def objective_no_clusters(trial):
    
    data = df_small[['Start_date','Longitude','Latitude','License Status','Industry']]
    target = df_small[['date_diffs']]
    
    alpha=trial.suggest_uniform('alpha',0.1,1)
    
    beta=trial.suggest_uniform('beta',0.1,1)
    
    parameter = {
      'max_depth':trial.suggest_int('depth', 3, 5), # show integer parameters between 3 and 5 for depth
      'min_child_weight':trial.suggest_int('childweight',0,5), # show integer parameters between 0 and 5 for childweight
      'learning_rate':trial.suggest_loguniform('ourlearning_rate',0.05,0.6), # set a log distribution between 0.05 and 0.5 for learning rate
      'colsample_bytree':trial.suggest_uniform('colsample_bytree',0.4,0.9), # set a uniformly distributed numbers between 0.4 and 0.9 for colsample_bytree
      'subsample':trial.suggest_uniform('sample',0.4,0.9)
    }
    
    # preprocessor to avoid data leakage
    preprocessor = ColumnTransformer(transformers = [('scaler1', MinMaxScaler((0,alpha)),['Longitude', 'Latitude']),('scaler2', MinMaxScaler((0,beta)),['Start_date']), ('onehot', OneHotEncoder(sparse=False,handle_unknown = 'ignore'), ['Industry'])])
    
    # Add a callback for pruning.
    pruning_callback = optuna.integration.XGBoostPruningCallback(trial, "validation-erorr")
    
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),('model', xgb.XGBRegressor(**parameter))])

    return np.mean(cross_val_score(pipeline, data, target, cv=3))


In [532]:
study_no_clusters = optuna.create_study(direction='maximize',study_name='No Clusters')
study_no_clusters.optimize(objective_no_clusters,n_trials=25)

[32m[I 2022-06-03 21:53:36,365][0m A new study created in memory with name: No Clusters[0m
[32m[I 2022-06-03 21:53:38,503][0m Trial 0 finished with value: 0.22924773865689194 and parameters: {'alpha': 0.854084906587587, 'beta': 0.4855154232172746, 'depth': 4, 'childweight': 0, 'ourlearning_rate': 0.3247063181156315, 'colsample_bytree': 0.7680860773013716, 'sample': 0.7350400907839559}. Best is trial 0 with value: 0.22924773865689194.[0m
[32m[I 2022-06-03 21:53:40,504][0m Trial 1 finished with value: 0.2573266376473112 and parameters: {'alpha': 0.6272353504758643, 'beta': 0.8702833613969365, 'depth': 4, 'childweight': 4, 'ourlearning_rate': 0.12296453138158003, 'colsample_bytree': 0.6784496264661272, 'sample': 0.42129718625222634}. Best is trial 1 with value: 0.2573266376473112.[0m
[32m[I 2022-06-03 21:53:42,497][0m Trial 2 finished with value: 0.2590366579059769 and parameters: {'alpha': 0.40668507100781437, 'beta': 0.1823982045539866, 'depth': 4, 'childweight': 5, 'ourlearn

[32m[I 2022-06-03 21:55:25,883][0m Trial 23 finished with value: 0.263275193542402 and parameters: {'alpha': 0.2144506329797492, 'beta': 0.5705564338017767, 'depth': 5, 'childweight': 2, 'ourlearning_rate': 0.06742075184804985, 'colsample_bytree': 0.715227838592398, 'sample': 0.573719451907023}. Best is trial 12 with value: 0.2649414620758183.[0m
[32m[I 2022-06-03 21:55:30,794][0m Trial 24 finished with value: 0.2649666455657007 and parameters: {'alpha': 0.2050545815704506, 'beta': 0.5820437900729117, 'depth': 5, 'childweight': 1, 'ourlearning_rate': 0.07877940303841176, 'colsample_bytree': 0.6259669172923449, 'sample': 0.5933566047260158}. Best is trial 24 with value: 0.2649666455657007.[0m


In [529]:
def objective_empty(trial):
    
    data = df_small[['License Status','Industry']]
    target = df_small[['date_diffs']]
    parameter = {
      'max_depth':trial.suggest_int('depth', 3, 5), # show integer parameters between 3 and 5 for depth
      'min_child_weight':trial.suggest_int('childweight',0,5), # show integer parameters between 0 and 5 for childweight
      'learning_rate':trial.suggest_loguniform('ourlearning_rate',0.05,0.6), # set a log distribution between 0.05 and 0.5 for learning rate
      'colsample_bytree':trial.suggest_uniform('colsample_bytree',0.4,0.9), # set a uniformly distributed numbers between 0.4 and 0.9 for colsample_bytree
      'subsample':trial.suggest_uniform('sample',0.4,0.9)
    }
    
    # preprocessor to avoid data leakage
    preprocessor = ColumnTransformer(transformers = [('onehot', OneHotEncoder(sparse=False,handle_unknown = 'ignore'), ['Industry'])])
    
    # Add a callback for pruning.
    pruning_callback = optuna.integration.XGBoostPruningCallback(trial, "validation-erorr")
    
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),('model', xgb.XGBRegressor(**parameter))])

    return np.mean(cross_val_score(pipeline, data, target, cv=3))


In [530]:
study_empty = optuna.create_study(direction='maximize',study_name='Empty')
study_empty.optimize(objective_empty,n_trials=25)

[32m[I 2022-06-03 21:52:42,944][0m A new study created in memory with name: Empty[0m
[32m[I 2022-06-03 21:52:44,474][0m Trial 0 finished with value: 0.048790136675298044 and parameters: {'depth': 4, 'childweight': 5, 'ourlearning_rate': 0.18167838611750486, 'colsample_bytree': 0.6418584114077657, 'sample': 0.502936623145783}. Best is trial 0 with value: 0.048790136675298044.[0m
[32m[I 2022-06-03 21:52:45,870][0m Trial 1 finished with value: 0.049814245318445104 and parameters: {'depth': 3, 'childweight': 2, 'ourlearning_rate': 0.130863215118722, 'colsample_bytree': 0.7622182137407698, 'sample': 0.7645340904563582}. Best is trial 1 with value: 0.049814245318445104.[0m
[32m[I 2022-06-03 21:52:47,272][0m Trial 2 finished with value: 0.048272769581729956 and parameters: {'depth': 3, 'childweight': 1, 'ourlearning_rate': 0.0811369853638437, 'colsample_bytree': 0.8153794564593015, 'sample': 0.8455972872043664}. Best is trial 1 with value: 0.049814245318445104.[0m
[32m[I 2022-06-

In [32]:
def objective_clusters_big_data(trial):
    
    data = df[['Start_date','Longitude','Latitude','License Status','Industry']]
    target = df[['date_diffs']]
    
    K = trial.suggest_int("K", 2, 500)
    
    alpha=trial.suggest_uniform('alpha',0.1,1)
    
    beta=trial.suggest_uniform('beta',0.1,1)
    
    parameter = {
      'max_depth':trial.suggest_int('depth', 3, 5), # show integer parameters between 3 and 5 for depth
      'min_child_weight':trial.suggest_int('childweight',0,5), # show integer parameters between 0 and 5 for childweight
      'learning_rate':trial.suggest_loguniform('ourlearning_rate',0.05,0.6), # set a log distribution between 0.05 and 0.5 for learning rate
      'colsample_bytree':trial.suggest_uniform('colsample_bytree',0.4,0.9), # set a uniformly distributed numbers between 0.4 and 0.9 for colsample_bytree
      'subsample':trial.suggest_uniform('sample',0.4,0.9)
    }
    
    # preprocessor to avoid data leakage
    preprocessor = ColumnTransformer(transformers = [('scaler1', MinMaxScaler((0,alpha)),['Longitude', 'Latitude']),('scaler2', MinMaxScaler((0,beta)),['Start_date']), ('onehot', OneHotEncoder(sparse=False,handle_unknown = 'ignore'), ['Industry'])])
    
    # Add a callback for pruning.
    pruning_callback = optuna.integration.XGBoostPruningCallback(trial, "validation-erorr")
    
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),('clusterer',Cluster_Adder_With_Deletion(K)),('model', xgb.XGBRegressor(**parameter))])

    return np.mean(cross_val_score(pipeline, data, target, cv=5))


In [34]:
study_clusters_big_data = optuna.create_study(direction='maximize',study_name='Clusters')
study_clusters_big_data.optimize(objective_clusters_big_data,n_trials=25)

[32m[I 2022-06-04 01:35:44,077][0m A new study created in memory with name: Clusters[0m
[32m[I 2022-06-04 01:52:05,392][0m Trial 0 finished with value: -0.32329730645861726 and parameters: {'K': 135, 'alpha': 0.17593708960877896, 'beta': 0.22680229002894353, 'depth': 3, 'childweight': 3, 'ourlearning_rate': 0.22390658337636565, 'colsample_bytree': 0.5707871507409993, 'sample': 0.7577708118598057}. Best is trial 0 with value: -0.32329730645861726.[0m
[32m[I 2022-06-04 01:54:15,018][0m Trial 1 finished with value: -0.2707871919754828 and parameters: {'K': 11, 'alpha': 0.6806864908783613, 'beta': 0.4685650097405637, 'depth': 5, 'childweight': 2, 'ourlearning_rate': 0.23944578253426937, 'colsample_bytree': 0.6980885827705957, 'sample': 0.8011641978924473}. Best is trial 1 with value: -0.2707871919754828.[0m
[32m[I 2022-06-04 02:09:23,020][0m Trial 2 finished with value: -0.3331237895810167 and parameters: {'K': 159, 'alpha': 0.9269366775815344, 'beta': 0.4530218509002504, 'depth

[32m[I 2022-06-04 07:04:43,357][0m Trial 23 finished with value: -0.25467491638254086 and parameters: {'K': 34, 'alpha': 0.6924350722636963, 'beta': 0.34292188125076006, 'depth': 3, 'childweight': 0, 'ourlearning_rate': 0.07261973231944759, 'colsample_bytree': 0.7393257145536194, 'sample': 0.44154172185533186}. Best is trial 22 with value: -0.05105834753429206.[0m
[32m[I 2022-06-04 07:14:38,301][0m Trial 24 finished with value: -0.3342525175970238 and parameters: {'K': 85, 'alpha': 0.9993643931256448, 'beta': 0.5565724963130545, 'depth': 3, 'childweight': 2, 'ourlearning_rate': 0.05841932040796807, 'colsample_bytree': 0.8582125778359583, 'sample': 0.4730992851467611}. Best is trial 22 with value: -0.05105834753429206.[0m


In [2]:
def objective_no_clusters_big_data(trial):
    
    data = df[['Start_date','Longitude','Latitude','License Status','Industry']]
    target = df[['date_diffs']]
    
    alpha=trial.suggest_uniform('alpha',0.1,1)
    
    beta=trial.suggest_uniform('beta',0.1,1)
    
    parameter = {
      'max_depth':trial.suggest_int('depth', 3, 5), # show integer parameters between 3 and 5 for depth
      'min_child_weight':trial.suggest_int('childweight',0,5), # show integer parameters between 0 and 5 for childweight
      'learning_rate':trial.suggest_loguniform('ourlearning_rate',0.05,0.6), # set a log distribution between 0.05 and 0.5 for learning rate
      'colsample_bytree':trial.suggest_uniform('colsample_bytree',0.4,0.9), # set a uniformly distributed numbers between 0.4 and 0.9 for colsample_bytree
      'subsample':trial.suggest_uniform('sample',0.4,0.9)
    }
    
    # preprocessor to avoid data leakage
    preprocessor = ColumnTransformer(transformers = [('scaler1', MinMaxScaler((0,alpha)),['Longitude', 'Latitude']),('scaler2', MinMaxScaler((0,beta)),['Start_date']), ('onehot', OneHotEncoder(sparse=False,handle_unknown = 'ignore'), ['Industry'])])
    
    # Add a callback for pruning.
    pruning_callback = optuna.integration.XGBoostPruningCallback(trial, "validation-erorr")
    
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),('model', xgb.XGBRegressor(**parameter))])

    return np.mean(cross_val_score(pipeline, data, target, cv=5))


In [5]:
study_no_clusters_big_data = optuna.create_study(direction='maximize',study_name='No Clusters')
study_no_clusters_big_data.optimize(objective_no_clusters_big_data,n_trials=25)

[32m[I 2022-06-27 09:15:37,489][0m A new study created in memory with name: No Clusters[0m
[32m[I 2022-06-27 09:16:01,958][0m Trial 0 finished with value: -0.3069335761055729 and parameters: {'alpha': 0.979245718187961, 'beta': 0.5009059095735399, 'depth': 3, 'childweight': 1, 'ourlearning_rate': 0.18194606706362143, 'colsample_bytree': 0.7208838754447114, 'sample': 0.7912200304033714}. Best is trial 0 with value: -0.3069335761055729.[0m
[32m[I 2022-06-27 09:16:34,896][0m Trial 1 finished with value: -0.30230888008872353 and parameters: {'alpha': 0.7094782251065471, 'beta': 0.9808922375996135, 'depth': 4, 'childweight': 3, 'ourlearning_rate': 0.10011969839004746, 'colsample_bytree': 0.6527026807677803, 'sample': 0.6250901920344778}. Best is trial 1 with value: -0.30230888008872353.[0m
[32m[I 2022-06-27 09:18:08,056][0m Trial 2 finished with value: -0.2883779056457746 and parameters: {'alpha': 0.8834988750048103, 'beta': 0.8619957564085128, 'depth': 5, 'childweight': 4, 'ourl

[32m[I 2022-06-27 09:45:14,184][0m Trial 23 finished with value: -0.29085705122047767 and parameters: {'alpha': 0.16336096600848132, 'beta': 0.21256127082351778, 'depth': 5, 'childweight': 1, 'ourlearning_rate': 0.07936808707882156, 'colsample_bytree': 0.8327527421977594, 'sample': 0.8531009430073804}. Best is trial 11 with value: -0.28135067738523756.[0m
[32m[I 2022-06-27 09:46:47,711][0m Trial 24 finished with value: -0.28748810171190503 and parameters: {'alpha': 0.3201635391792185, 'beta': 0.3309541287280361, 'depth': 5, 'childweight': 2, 'ourlearning_rate': 0.06435361832702631, 'colsample_bytree': 0.6973469392709181, 'sample': 0.8915303531182757}. Best is trial 11 with value: -0.28135067738523756.[0m


In [3]:
def objective_no_clusters_big_data_no_scaling(trial):
    
    data = df[['Start_date','Longitude','Latitude','License Status','Industry']]
    target = df[['date_diffs']]
    
    parameter = {
      'max_depth':trial.suggest_int('depth', 3, 5), # show integer parameters between 3 and 5 for depth
      'min_child_weight':trial.suggest_int('childweight',0,5), # show integer parameters between 0 and 5 for childweight
      'learning_rate':trial.suggest_loguniform('ourlearning_rate',0.05,0.6), # set a log distribution between 0.05 and 0.5 for learning rate
      'colsample_bytree':trial.suggest_uniform('colsample_bytree',0.4,0.9), # set a uniformly distributed numbers between 0.4 and 0.9 for colsample_bytree
      'subsample':trial.suggest_uniform('sample',0.4,0.9)
    }
    
    # preprocessor to avoid data leakage
    preprocessor = ColumnTransformer(transformers = [('onehot', OneHotEncoder(sparse=False,handle_unknown = 'ignore'), ['Industry'])])
    
    # Add a callback for pruning.
    pruning_callback = optuna.integration.XGBoostPruningCallback(trial, "validation-erorr")
    
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),('model', xgb.XGBRegressor(**parameter))])

    return np.mean(cross_val_score(pipeline, data, target, cv=5))


In [6]:
study_no_clusters_big_data_no_scaling = optuna.create_study(direction='maximize',study_name='No Clusters no scaling')
study_no_clusters_big_data_no_scaling.optimize(objective_no_clusters_big_data_no_scaling,n_trials=25)

[32m[I 2022-07-05 17:11:52,007][0m A new study created in memory with name: No Clusters no scaling[0m
[32m[I 2022-07-05 17:12:26,543][0m Trial 0 finished with value: -0.062258849761357496 and parameters: {'depth': 5, 'childweight': 4, 'ourlearning_rate': 0.14082558383101496, 'colsample_bytree': 0.8144564132502653, 'sample': 0.6305033627843379}. Best is trial 0 with value: -0.062258849761357496.[0m
[32m[I 2022-07-05 17:12:54,493][0m Trial 1 finished with value: -0.05913645074222684 and parameters: {'depth': 4, 'childweight': 1, 'ourlearning_rate': 0.1072077489962983, 'colsample_bytree': 0.6558115011119723, 'sample': 0.7587863922918476}. Best is trial 1 with value: -0.05913645074222684.[0m
[32m[I 2022-07-05 17:13:32,951][0m Trial 2 finished with value: -0.06361902965660435 and parameters: {'depth': 4, 'childweight': 4, 'ourlearning_rate': 0.5430143065912019, 'colsample_bytree': 0.4605887707678615, 'sample': 0.7989097654455678}. Best is trial 1 with value: -0.05913645074222684.

In [None]:
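# So we see that the run without MinMax scaling scores noticeably better than the scaled runs.
# Caveat: XGBoost's tree splits do not depend on a monotonic rescaling of a feature, so scaling alone
# should not change the model; verify that both pipelines pass the same columns to the model before
# attributing the gap to scaling.
# There is still some improvement from using clustering, though not much.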

In [None]:
# Finally, let's train the final model.

In [7]:
final_data=df[['Start_date','Longitude','Latitude','License Status','Industry']]
final_target = df[['date_diffs']]

parameter = {
    'depth': 4, 
    'childweight': 0, 
    'ourlearning_rate': 0.06515214309789943, 
    'colsample_bytree': 0.834490669484953, 
    'sample': 0.4579443309559088
    }

preprocessor = ColumnTransformer(transformers = [('scaler1', MinMaxScaler((0, 0.8624729890566708)),['Longitude', 'Latitude']),('scaler2', MinMaxScaler((0,0.20189836604616812)),['Start_date']), ('onehot', OneHotEncoder(sparse=False,handle_unknown = 'ignore'), ['Industry'])])

pipeline = Pipeline(steps=[('preprocessor', preprocessor),('clusterer',Cluster_Adder_With_Deletion(10)),('model', xgb.XGBRegressor(**parameter))])

pipeline.fit(final_data, final_target)

Parameters: { "childweight", "depth", "ourlearning_rate", "sample" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('scaler1',
                                                  MinMaxScaler(feature_range=(0,
                                                                              0.8624729890566708)),
                                                  ['Longitude', 'Latitude']),
                                                 ('scaler2',
                                                  MinMaxScaler(feature_range=(0,
                                                                              0.20189836604616812)),
                                                  ['Start_date']),
                                                 ('onehot',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse=False),
                                                  ['Industry'])])),
                ('clusterer',
           

In [10]:
# In-sample check: scores the pipeline on the same rows it was fitted on,
# so this is not a held-out performance estimate.
pipeline.score(final_data, final_target)

0.07071675335497696

In [17]:
final_data=df[['Start_date','Longitude','Latitude','License Status','Industry']]
final_target = df[['date_diffs']]

parameter = {
    'depth': 4, 
    'childweight': 1, 
    'ourlearning_rate': 0.057231642143027714, 
    'colsample_bytree': 0.49901169726396305, 
    'sample': 0.8712121041455485
    }

preprocessor = ColumnTransformer(transformers = [('onehot', OneHotEncoder(sparse=False,handle_unknown = 'ignore'), ['Industry'])])

pipeline_no_clusters = Pipeline(steps=[('preprocessor', preprocessor),('model', xgb.XGBRegressor(**parameter))])

pipeline_no_clusters.fit(final_data, final_target)

Parameters: { "childweight", "depth", "ourlearning_rate", "sample" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('onehot',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse=False),
                                                  ['Industry'])])),
                ('model',
                 XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
                              childweight=1, colsample_bylevel=1,
                              colsample_bynode=1,
                              colsample_bytree=0.49901169726396305, depth=4,
                              early_stopping_rounds=None,
                              enable_catego...
                              gamma=0, gpu_id=-1, grow_policy='depthwise',
                              importance_type=None, interaction_constraints='',
                              learning_rate=0.300000012, max_bin=256,
                              max_cat_to_o

In [18]:
# In-sample check: scores the pipeline on the same rows it was fitted on,
# so this is not a held-out performance estimate.
pipeline_no_clusters.score(final_data, final_target)

0.05402779077730091