In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
from pandas.io.formats.style import Styler
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
%matplotlib inline

import os
import sys
import datetime
from tqdm import tqdm
import copy

import optuna
import wandb

import xgboost as xgb
from xgboost.callback import EarlyStopping
import lightgbm as lgbm

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold
from sklearn.metrics import mean_squared_error, log_loss
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

from umap import UMAP

import warnings; warnings.filterwarnings('ignore')

sys.path.append('..')
from config import CFG
from src.data_processing.fi import get_fi
# Call the imported CFG and keep the result under the same name; the settings
# below (NCOLS, NROWS) are read from it as attributes.
CFG = CFG()

# pandas display: thousands separators with two decimals, plus the column/row
# limits taken from CFG.
pd.set_option('display.float_format', '{:,.2f}'.format)
pd.set_option('display.max_columns', CFG.NCOLS)
pd.set_option('display.max_rows', CFG.NROWS)

In [3]:
from colorama import Style, Fore
# Bright ANSI colour prefixes for console output; `res` resets the style.
red, blu, mgt, grn, gld = (
    Style.BRIGHT + hue
    for hue in (Fore.RED, Fore.BLUE, Fore.MAGENTA, Fore.GREEN, Fore.YELLOW)
)
res = Style.RESET_ALL

# Matplotlib rc overrides handed to seaborn: off-white backgrounds, black
# edges/labels/ticks, a serif font and a faint grid.
rc = {
    **dict.fromkeys(("axes.facecolor", "figure.facecolor"), "#FFFEF8"),
    **dict.fromkeys(
        ("axes.edgecolor", "axes.labelcolor", "xtick.color", "ytick.color"),
        "#000000",
    ),
    "grid.color": "#EBEBE7" + "30",
    "grid.alpha": 0.4,
    "font.family": "serif",
}
sns.set(rc=rc)

# Colour palette shared by the plots in this notebook.
palette = [
    '#302c36', '#037d97', '#E4591E', '#C09741',
    '#EC5B6D', '#90A6B1', '#6ca957', '#D8E3E2',
]

In [4]:
# Standardise the feature columns; the "Class" target is carried over unscaled.
# Reads `train`, `orig`, `test` and `config`, which the data-loading cell creates,
# so that cell has to run before this one.
config['scaler'] = "standard"

# Fit the scaler on the competition training features only.
train_features = train.drop("Class", axis=1)
scaler = StandardScaler()
scaled_tr = pd.DataFrame(scaler.fit_transform(train_features), columns=train_features.columns)
scaled_tr["Class"] = train["Class"]

# The test set must be transformed with the statistics learned from `train`.
# Previously the same scaler object was refitted on `orig` first, so the test
# set ended up scaled with the original dataset's mean/std instead.
scaled_tst = pd.DataFrame(scaler.transform(test), columns=test.columns)

# The original dataset keeps its own, separately fitted scaler so that `scaler`
# stays tied to the training data.
orig_features = orig.drop("Class", axis=1)
orig_scaler = StandardScaler()
scaled_orig = pd.DataFrame(orig_scaler.fit_transform(orig_features), columns=orig_features.columns)
scaled_orig["Class"] = orig["Class"]

In [5]:
# Read the three raw CSV files from CFG.RAW_DATA. The `id` column is dropped
# from the train and test files; the original dataset is read as-is.
orig = pd.read_csv(os.path.join(CFG.RAW_DATA, 'Pulsar.csv'))
train, test = (
    pd.read_csv(os.path.join(CFG.RAW_DATA, csv_name)).drop(columns='id')
    for csv_name in ('train.csv', 'test.csv')
)

# Run configuration, filled in by later cells and passed to wandb.init.
config = dict()

In [6]:
# Standardise the feature columns; the "Class" target is carried over unscaled.
# Reads `train`, `orig`, `test` and `config` from the data-loading cell.
config['scaler'] = "standard"

# Fit the scaler on the competition training features only.
train_features = train.drop("Class", axis=1)
scaler = StandardScaler()
scaled_tr = pd.DataFrame(scaler.fit_transform(train_features), columns=train_features.columns)
scaled_tr["Class"] = train["Class"]

# The test set must be transformed with the statistics learned from `train`.
# Previously the same scaler object was refitted on `orig` first, so the test
# set ended up scaled with the original dataset's mean/std instead.
scaled_tst = pd.DataFrame(scaler.transform(test), columns=test.columns)

# The original dataset keeps its own, separately fitted scaler so that `scaler`
# stays tied to the training data.
orig_features = orig.drop("Class", axis=1)
orig_scaler = StandardScaler()
scaled_orig = pd.DataFrame(orig_scaler.fit_transform(orig_features), columns=orig_features.columns)
scaled_orig["Class"] = orig["Class"]

In [7]:
def get_n_components(df):
    """Fit a full-rank PCA on the feature columns of ``df`` and report explained variance.

    A ``Class`` column, when present, is treated as the label: it is left out of
    the PCA fit (so it neither leaks into the components nor adds an extra one)
    and is copied onto the returned frame afterwards.

    Args:
        df: DataFrame of numeric feature columns, optionally with a ``Class`` column.

    Returns:
        Tuple ``(components, exp_var, exp_var_cumsum)``:
        ``components`` is a DataFrame with columns ``PC0, PC1, ...`` (plus
        ``Class`` when ``df`` had it), ``exp_var`` is the explained-variance
        ratio of each component and ``exp_var_cumsum`` its cumulative sum.
    """
    has_target = 'Class' in df.columns
    features = df.drop(columns='Class') if has_target else df

    # PCA cannot produce more components than min(n_samples, n_features).
    n_components = min(features.shape)
    pca = PCA(n_components=n_components, random_state=CFG.SEED)

    components = pd.DataFrame(
        pca.fit_transform(features),
        columns=[f'PC{i}' for i in range(n_components)],
        index=df.index,  # keep row labels so the label column aligns row-for-row
    )
    if has_target:
        components['Class'] = df['Class']

    exp_var = pca.explained_variance_ratio_
    exp_var_cumsum = np.cumsum(exp_var)
    return components, exp_var, exp_var_cumsum

In [8]:
# Explained variance per principal component (bars) with the cumulative curve on top.
components, exp_var, exp_var_cumsum = get_n_components(scaled_tr)

fig = go.Figure()

# x positions are 0-based, matching the PC0, PC1, ... column names.
fig.add_trace(
    go.Scatter(
        x=list(range(len(exp_var_cumsum))),
        y=exp_var_cumsum,
        name='Cumulative Explained Variance',
        line=dict(color=palette[0], width=2),
    )
)
fig.add_trace(
    go.Bar(
        x=list(range(len(exp_var_cumsum))),
        y=exp_var,
        name='Explained Variance',
        marker_color=palette[1],
        width=0.7,
    )
)

# Font sizes are set through the nested `title.font` objects: the flat
# `titlefont` properties are deprecated and no longer accepted by recent Plotly.
fig.update_layout(
    title=dict(text='Explained Variance by Principal Components', x=0.5, font=dict(size=24)),
    xaxis=dict(title=dict(text='Principal Components', font=dict(size=20)), tickfont=dict(size=16)),
    yaxis=dict(title=dict(text='Explained Variance', font=dict(size=20)), tickfont=dict(size=16)),
    height=500, width=1000,
)

In [9]:
def pca_transform(train, test, N):
    """Project the training features and the test set onto ``N`` principal components.

    The PCA is fitted on ``train`` without its ``Class`` column and then applied
    to ``test``.

    Args:
        train: DataFrame holding the feature columns and a ``Class`` column.
        test: Data with the same feature columns as ``train`` minus ``Class``.
        N: Number of principal components to keep.

    Returns:
        Tuple ``(test_components, X, y)``: the array returned by
        ``pca.transform(test)``, the training projection as a DataFrame with
        columns ``PC0..PC{N-1}``, and the ``Class`` column of ``train``.
    """
    reducer = PCA(n_components=N, random_state=CFG.SEED)
    train_features = train.drop("Class", axis=1)
    pc_names = [f'PC{idx}' for idx in range(N)]

    X = pd.DataFrame(reducer.fit_transform(train_features), columns=pc_names)
    y = train['Class']
    test_components = reducer.transform(test)
    return test_components, X, y

In [10]:
# Number of principal components kept for modelling; recorded in the run config.
N = 3
config.update(n_components=N)

# Project the scaled train/test data and split off the target.
X_test, X, y = pca_transform(scaled_tr, scaled_tst, N)

In [11]:
# Cross-validation splitter; later model cells reuse this same `k_fold` object.
k_fold = RepeatedStratifiedKFold(n_splits=CFG.NFOLDS, n_repeats=CFG.REPEATS, random_state=CFG.SEED)

# One fitted classifier and one vector of positive-class test probabilities per fold.
modelsXB = []
predsXB = []

# gpu_params = {'tree_method' : "gpu_hist", 'gpu_id' : 0}
# Keyword arguments passed to xgb.XGBClassifier for every fold.
xgbr_params = {
            'n_estimators':9999,
            'max_depth': 4,
            'learning_rate': 0.05333221939055333,
            'min_child_weight': 4,
            'gamma': 5.301218558776368e-08,
            'subsample': 0.41010429946197946,
            'colsample_bytree': 0.8298539920447499,
            'reg_alpha': 0.000517878113716743,
            'reg_lambda': 0.00030121415155097723,
            'n_jobs': -1,
            'objective': 'binary:logistic',
            'verbosity': 0,
            'eval_metric': 'logloss',
            'random_state': CFG.SEED}

# Add the hyper-parameters to the run config, leaving out the listed non-tuning keys,
# then start the W&B run with that config.
config.update({key:val for key, val in xgbr_params.items() if key not in ['random_state', 'eval_metric', 'verbosity', 'objective', 'n_jobs']})
wandb.init(project='S3E10', name='XGBoost', group='XGBoost', config=config)

for train_index, test_index in k_fold.split(X, y):
    # Positional split of the PCA features and the target for this fold.
    X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[test_index]
    
    model = xgb.XGBClassifier(**xgbr_params)
    
    # The held-out fold is the evaluation set; CFG.XG_PATIENCE is passed as the
    # early-stopping patience and the W&B callback is attached to the fit.
    model.fit(X=X_train, y=y_train,
          eval_set=[(X_valid, y_valid)],
          early_stopping_rounds = CFG.XG_PATIENCE,
          verbose = 100, callbacks=[wandb.xgboost.WandbCallback(log_model=False)]
         )
    modelsXB.append(model)
    # Column 1 of predict_proba is kept as the positive-class probability.
    predsXB.append(model.predict_proba(X_test)[:, 1])

In [12]:
%load_ext autoreload
%autoreload 2

In [13]:
import pandas as pd
from pandas.io.formats.style import Styler
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
%matplotlib inline

import os
import sys
import datetime
from tqdm import tqdm
import copy

import optuna
import wandb

import xgboost as xgb
from xgboost.callback import EarlyStopping
import lightgbm as lgbm

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold
from sklearn.metrics import mean_squared_error, log_loss
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

from umap import UMAP

import warnings; warnings.filterwarnings('ignore')

sys.path.append('..')
from config import CFG
from src.data_processing.fi import get_fi
# Call the imported CFG and keep the result under the same name; the settings
# below (NCOLS, NROWS) are read from it as attributes.
CFG = CFG()

# pandas display: thousands separators with two decimals, plus the column/row
# limits taken from CFG.
pd.set_option('display.float_format', '{:,.2f}'.format)
pd.set_option('display.max_columns', CFG.NCOLS)
pd.set_option('display.max_rows', CFG.NROWS)

In [14]:
from colorama import Style, Fore
# Bright ANSI colour prefixes for console output; `res` resets the style.
red, blu, mgt, grn, gld = (
    Style.BRIGHT + hue
    for hue in (Fore.RED, Fore.BLUE, Fore.MAGENTA, Fore.GREEN, Fore.YELLOW)
)
res = Style.RESET_ALL

# Matplotlib rc overrides handed to seaborn: off-white backgrounds, black
# edges/labels/ticks, a serif font and a faint grid.
rc = {
    **dict.fromkeys(("axes.facecolor", "figure.facecolor"), "#FFFEF8"),
    **dict.fromkeys(
        ("axes.edgecolor", "axes.labelcolor", "xtick.color", "ytick.color"),
        "#000000",
    ),
    "grid.color": "#EBEBE7" + "30",
    "grid.alpha": 0.4,
    "font.family": "serif",
}
sns.set(rc=rc)

# Colour palette shared by the plots in this notebook.
palette = [
    '#302c36', '#037d97', '#E4591E', '#C09741',
    '#EC5B6D', '#90A6B1', '#6ca957', '#D8E3E2',
]

In [15]:
# Read the three raw CSV files from CFG.RAW_DATA. The `id` column is dropped
# from the train and test files; the original dataset is read as-is.
orig = pd.read_csv(os.path.join(CFG.RAW_DATA, 'Pulsar.csv'))
train, test = (
    pd.read_csv(os.path.join(CFG.RAW_DATA, csv_name)).drop(columns='id')
    for csv_name in ('train.csv', 'test.csv')
)

# Run configuration, filled in by later cells and passed to wandb.init.
config = dict()

In [16]:
# Standardise the feature columns; the "Class" target is carried over unscaled.
# Reads `train`, `orig`, `test` and `config` from the data-loading cell.
config['scaler'] = "standard"

# Fit the scaler on the competition training features only.
train_features = train.drop("Class", axis=1)
scaler = StandardScaler()
scaled_tr = pd.DataFrame(scaler.fit_transform(train_features), columns=train_features.columns)
scaled_tr["Class"] = train["Class"]

# The test set must be transformed with the statistics learned from `train`.
# Previously the same scaler object was refitted on `orig` first, so the test
# set ended up scaled with the original dataset's mean/std instead.
scaled_tst = pd.DataFrame(scaler.transform(test), columns=test.columns)

# The original dataset keeps its own, separately fitted scaler so that `scaler`
# stays tied to the training data.
orig_features = orig.drop("Class", axis=1)
orig_scaler = StandardScaler()
scaled_orig = pd.DataFrame(orig_scaler.fit_transform(orig_features), columns=orig_features.columns)
scaled_orig["Class"] = orig["Class"]

In [17]:
def get_n_components(df):
    """Fit a full-rank PCA on the feature columns of ``df`` and report explained variance.

    A ``Class`` column, when present, is treated as the label: it is left out of
    the PCA fit (so it neither leaks into the components nor adds an extra one)
    and is copied onto the returned frame afterwards.

    Args:
        df: DataFrame of numeric feature columns, optionally with a ``Class`` column.

    Returns:
        Tuple ``(components, exp_var, exp_var_cumsum)``:
        ``components`` is a DataFrame with columns ``PC0, PC1, ...`` (plus
        ``Class`` when ``df`` had it), ``exp_var`` is the explained-variance
        ratio of each component and ``exp_var_cumsum`` its cumulative sum.
    """
    has_target = 'Class' in df.columns
    features = df.drop(columns='Class') if has_target else df

    # PCA cannot produce more components than min(n_samples, n_features).
    n_components = min(features.shape)
    pca = PCA(n_components=n_components, random_state=CFG.SEED)

    components = pd.DataFrame(
        pca.fit_transform(features),
        columns=[f'PC{i}' for i in range(n_components)],
        index=df.index,  # keep row labels so the label column aligns row-for-row
    )
    if has_target:
        components['Class'] = df['Class']

    exp_var = pca.explained_variance_ratio_
    exp_var_cumsum = np.cumsum(exp_var)
    return components, exp_var, exp_var_cumsum

In [18]:
# Explained variance per principal component (bars) with the cumulative curve on top.
components, exp_var, exp_var_cumsum = get_n_components(scaled_tr)

fig = go.Figure()

# x positions are 0-based, matching the PC0, PC1, ... column names.
fig.add_trace(
    go.Scatter(
        x=list(range(len(exp_var_cumsum))),
        y=exp_var_cumsum,
        name='Cumulative Explained Variance',
        line=dict(color=palette[0], width=2),
    )
)
fig.add_trace(
    go.Bar(
        x=list(range(len(exp_var_cumsum))),
        y=exp_var,
        name='Explained Variance',
        marker_color=palette[1],
        width=0.7,
    )
)

# Font sizes are set through the nested `title.font` objects: the flat
# `titlefont` properties are deprecated and no longer accepted by recent Plotly.
fig.update_layout(
    title=dict(text='Explained Variance by Principal Components', x=0.5, font=dict(size=24)),
    xaxis=dict(title=dict(text='Principal Components', font=dict(size=20)), tickfont=dict(size=16)),
    yaxis=dict(title=dict(text='Explained Variance', font=dict(size=20)), tickfont=dict(size=16)),
    height=500, width=1000,
)

In [19]:
def pca_transform(train, test, N):
    """Project the training features and the test set onto ``N`` principal components.

    The PCA is fitted on ``train`` without its ``Class`` column and then applied
    to ``test``.

    Args:
        train: DataFrame holding the feature columns and a ``Class`` column.
        test: Data with the same feature columns as ``train`` minus ``Class``.
        N: Number of principal components to keep.

    Returns:
        Tuple ``(test_components, X, y)``: the array returned by
        ``pca.transform(test)``, the training projection as a DataFrame with
        columns ``PC0..PC{N-1}``, and the ``Class`` column of ``train``.
    """
    reducer = PCA(n_components=N, random_state=CFG.SEED)
    train_features = train.drop("Class", axis=1)
    pc_names = [f'PC{idx}' for idx in range(N)]

    X = pd.DataFrame(reducer.fit_transform(train_features), columns=pc_names)
    y = train['Class']
    test_components = reducer.transform(test)
    return test_components, X, y

In [20]:
# Number of principal components kept for modelling; recorded in the run config.
N = 3
config.update(n_components=N)

# Project the scaled train/test data and split off the target.
X_test, X, y = pca_transform(scaled_tr, scaled_tst, N)

In [21]:
# Cross-validation splitter; later model cells reuse this same `k_fold` object.
k_fold = RepeatedStratifiedKFold(n_splits=CFG.NFOLDS, n_repeats=CFG.REPEATS, random_state=CFG.SEED)

# One fitted classifier and one vector of positive-class test probabilities per fold.
modelsXB = []
predsXB = []

# gpu_params = {'tree_method' : "gpu_hist", 'gpu_id' : 0}
# Keyword arguments passed to xgb.XGBClassifier for every fold.
xgbr_params = {
            'n_estimators':9999,
            'max_depth': 4,
            'learning_rate': 0.05333221939055333,
            'min_child_weight': 4,
            'gamma': 5.301218558776368e-08,
            'subsample': 0.41010429946197946,
            'colsample_bytree': 0.8298539920447499,
            'reg_alpha': 0.000517878113716743,
            'reg_lambda': 0.00030121415155097723,
            'n_jobs': -1,
            'objective': 'binary:logistic',
            'verbosity': 0,
            'eval_metric': 'logloss',
            'random_state': CFG.SEED}

# Add the hyper-parameters to the run config, leaving out the listed non-tuning keys,
# then start the W&B run with that config.
config.update({key:val for key, val in xgbr_params.items() if key not in ['random_state', 'eval_metric', 'verbosity', 'objective', 'n_jobs']})
wandb.init(project='S3E10', name='XGBoost', group='XGBoost', config=config)

for train_index, test_index in k_fold.split(X, y):
    # Positional split of the PCA features and the target for this fold.
    X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[test_index]
    
    model = xgb.XGBClassifier(**xgbr_params)
    
    # The held-out fold is the evaluation set; CFG.XG_PATIENCE is passed as the
    # early-stopping patience and the W&B callback is attached to the fit.
    model.fit(X=X_train, y=y_train,
          eval_set=[(X_valid, y_valid)],
          early_stopping_rounds = CFG.XG_PATIENCE,
          verbose = 100, callbacks=[wandb.xgboost.WandbCallback(log_model=False)]
         )
    modelsXB.append(model)
    # Column 1 of predict_proba is kept as the positive-class probability.
    predsXB.append(model.predict_proba(X_test)[:, 1])

In [22]:
# Average the feature importances over every fitted XGBoost model and log them
# to W&B as a bar chart.
# The fitted models live in `modelsXB`; `model` is only the last classifier from
# the training loop and cannot be indexed.
feature_importance = [fitted.feature_importances_ for fitted in modelsXB]
feature_importance = np.average(feature_importance, axis=0)
data = [[label, value] for (label, value) in zip(X.columns, feature_importance)]

table = wandb.Table(data=data, columns=['label', 'value'])
# The chart helper is `wandb.plot.bar`; there is no `wandb.plot_bar`.
wandb.log({'XGBoost_Feature_Importance': wandb.plot.bar(table, 'label', 'value', title="Feature Importance")})
# feature_df = pd.DataFrame(feature_importance, index=X.columns)
# feature_df
# wandb.Table

# fig, ax = plt.subplots(figsize=(12, 4))
# sns.barplot(x=feature_df.values.squeeze(), y=feature_df.index,
#             color=palette[-3], linestyle="-", width=0.5, errorbar='sd',
#             linewidth=0.5, edgecolor="black", ax=ax)
# ax.set_title('Feature Importance', fontdict={'fontweight': 'bold'})
# ax.set(xlabel=None)

# for s in ['right', 'top']:
#     ax.spines[s].set_visible(False)

In [23]:
# Per-model feature importances. The fitted classifiers are stored in `modelsXB`;
# `model` is a single classifier and does not support indexing.
[fitted.feature_importances_ for fitted in modelsXB]

In [24]:
# Display the feature-importance vector of each fitted XGBoost model
# (CFG.NFOLDS * CFG.REPEATS entries of `modelsXB`).
[modelsXB[x].feature_importances_ for x in range(CFG.NFOLDS*CFG.REPEATS)]

[array([0.7111551 , 0.16333364, 0.12551117], dtype=float32),
 array([0.71454674, 0.16249926, 0.12295397], dtype=float32),
 array([0.70543176, 0.17237453, 0.12219369], dtype=float32),
 array([0.72080547, 0.16008782, 0.11910668], dtype=float32),
 array([0.6976138 , 0.17023507, 0.13215113], dtype=float32),
 array([0.718605  , 0.16489132, 0.11650369], dtype=float32),
 array([0.71374947, 0.16561152, 0.12063902], dtype=float32),
 array([0.7162871 , 0.1660631 , 0.11764979], dtype=float32),
 array([0.69904864, 0.17194287, 0.12900849], dtype=float32)]

In [25]:
# Average the feature importances over every fitted XGBoost model and log them
# to W&B as a bar chart.
feature_importance = [fitted.feature_importances_ for fitted in modelsXB]
feature_importance = np.average(feature_importance, axis=0)
data = [[label, value] for (label, value) in zip(X.columns, feature_importance)]

table = wandb.Table(data=data, columns=['label', 'value'])
# The chart helper is `wandb.plot.bar`; there is no `wandb.plot_bar`.
wandb.log({'XGBoost_Feature_Importance': wandb.plot.bar(table, 'label', 'value', title="Feature Importance")})
# feature_df = pd.DataFrame(feature_importance, index=X.columns)
# feature_df
# wandb.Table

# fig, ax = plt.subplots(figsize=(12, 4))
# sns.barplot(x=feature_df.values.squeeze(), y=feature_df.index,
#             color=palette[-3], linestyle="-", width=0.5, errorbar='sd',
#             linewidth=0.5, edgecolor="black", ax=ax)
# ax.set_title('Feature Importance', fontdict={'fontweight': 'bold'})
# ax.set(xlabel=None)

# for s in ['right', 'top']:
#     ax.spines[s].set_visible(False)

In [26]:
# Mean feature importance across the CFG.NFOLDS * CFG.REPEATS fitted XGBoost
# models, logged to W&B as a bar chart.
feature_importance = np.average(
    [modelsXB[x].feature_importances_ for x in range(CFG.NFOLDS*CFG.REPEATS)],
    axis=0,
)
data = [list(pair) for pair in zip(X.columns, feature_importance)]

table = wandb.Table(columns=['label', 'value'], data=data)
importance_chart = wandb.plot.bar(table, 'label', 'value', title="Feature Importance")
wandb.log({'XGBoost_Feature_Importance': importance_chart})
# feature_df = pd.DataFrame(feature_importance, index=X.columns)
# feature_df
# wandb.Table

# fig, ax = plt.subplots(figsize=(12, 4))
# sns.barplot(x=feature_df.values.squeeze(), y=feature_df.index,
#             color=palette[-3], linestyle="-", width=0.5, errorbar='sd',
#             linewidth=0.5, edgecolor="black", ax=ax)
# ax.set_title('Feature Importance', fontdict={'fontweight': 'bold'})
# ax.set(xlabel=None)

# for s in ['right', 'top']:
#     ax.spines[s].set_visible(False)

In [27]:
# Close the W&B run opened by the XGBoost training cell's wandb.init call.
wandb.finish()

In [28]:
# Average the feature importances over the fitted LightGBM models and log them
# to W&B as a bar chart.
# NOTE(review): in the visible notebook `modelsLB` is first assigned by the
# LightGBM training cell further down, and the cell just above calls
# wandb.finish(). On a top-to-bottom run this cell therefore has neither the
# models nor an active run; an identical cell follows the LightGBM training.
feature_importance =  [modelsLB[x].feature_importances_ for x in range(CFG.NFOLDS*CFG.REPEATS)]
feature_importance = np.average(feature_importance,axis=0)
data = [[label, value] for (label, value) in zip(X.columns, feature_importance)]

table = wandb.Table(data=data, columns=['label', 'value'])
wandb.log({'LGBM_Feature_Importance': wandb.plot.bar(table, 'label', 'value', title="Feature Importance")})

In [29]:
# One fitted classifier and one vector of positive-class test probabilities per fold.
modelsLB = []
predsLB = []

# Keyword arguments passed to lgbm.LGBMClassifier for every fold.
# NOTE(review): LightGBM only applies `subsample` when `subsample_freq` is
# non-zero, and it is not set here — confirm whether row subsampling is intended.
lgbr_params = {
    'n_estimators': 9999,
    'max_depth': 5,
    'learning_rate': 0.00693702575527996,
    'subsample': 0.20851841295589477,
    'colsample_bytree': 0.5784778854092203,
    'reg_alpha': 0.2622912287429849,
    'reg_lambda': 2.8702494234117617e-08,
    'objective': 'binary',
    'metric': 'logloss',
    'boosting_type': 'gbdt',
    'random_state': CFG.SEED,
}

# Start a fresh run config holding every LightGBM hyper-parameter.
config = dict(lgbr_params)
wandb.init(project='S3E10', name='LightGBM', group='LightGBM', config=config, reinit=True)

for train_index, test_index in k_fold.split(X, y):
    X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[test_index]

    model = lgbm.LGBMClassifier(**lgbr_params)

    # The LightGBM callback lives in the `wandb.lightgbm` integration module;
    # there is no top-level `wandb.wandb_callback`.
    model.fit(X=X_train, y=y_train,
              eval_set=[(X_valid, y_valid)],
              eval_metric='logloss',
              early_stopping_rounds=CFG.XG_PATIENCE,
              verbose=150,
              callbacks=[wandb.lightgbm.wandb_callback()],
              )
    modelsLB.append(model)
    # Column 1 of predict_proba is kept as the positive-class probability.
    predsLB.append(model.predict_proba(X_test)[:, 1])

In [30]:
# One fitted classifier and one vector of positive-class test probabilities per fold.
modelsLB = []
predsLB = []

# Keyword arguments passed to lgbm.LGBMClassifier for every fold.
# NOTE(review): LightGBM only applies `subsample` when `subsample_freq` is
# non-zero, and it is not set here — confirm whether row subsampling is intended.
lgbr_params = {
    'n_estimators': 9999,
    'max_depth': 5,
    'learning_rate': 0.00693702575527996,
    'subsample': 0.20851841295589477,
    'colsample_bytree': 0.5784778854092203,
    'reg_alpha': 0.2622912287429849,
    'reg_lambda': 2.8702494234117617e-08,
    'objective': 'binary',
    'metric': 'logloss',
    'boosting_type': 'gbdt',
    'random_state': CFG.SEED,
}

# Start a fresh run config holding every LightGBM hyper-parameter.
config = dict(lgbr_params)
wandb.init(project='S3E10', name='LightGBM', group='LightGBM', config=config, reinit=True)

for train_index, test_index in k_fold.split(X, y):
    X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[test_index]

    model = lgbm.LGBMClassifier(**lgbr_params)

    # The LightGBM callback lives in the `wandb.lightgbm` integration module;
    # there is no top-level `wandb.wandb_callback`.
    model.fit(X=X_train, y=y_train,
              eval_set=[(X_valid, y_valid)],
              eval_metric='logloss',
              early_stopping_rounds=CFG.XG_PATIENCE,
              verbose=150,
              callbacks=[wandb.lightgbm.wandb_callback()],
              )
    modelsLB.append(model)
    # Column 1 of predict_proba is kept as the positive-class probability.
    predsLB.append(model.predict_proba(X_test)[:, 1])

In [31]:
# One fitted classifier and one vector of positive-class test probabilities per fold.
modelsLB = []
predsLB = []

# Keyword arguments passed to lgbm.LGBMClassifier for every fold.
# NOTE(review): LightGBM only applies `subsample` when `subsample_freq` is
# non-zero, and it is not set here — confirm whether row subsampling is intended.
lgbr_params = {
    'n_estimators': 9999,
    'max_depth': 5,
    'learning_rate': 0.00693702575527996,
    'subsample': 0.20851841295589477,
    'colsample_bytree': 0.5784778854092203,
    'reg_alpha': 0.2622912287429849,
    'reg_lambda': 2.8702494234117617e-08,
    'objective': 'binary',
    'metric': 'logloss',
    'boosting_type': 'gbdt',
    'random_state': CFG.SEED,
}

# Start a fresh run config holding every LightGBM hyper-parameter.
config = dict(lgbr_params)
wandb.init(project='S3E10', name='LightGBM', group='LightGBM', config=config, reinit=True)

for train_index, test_index in k_fold.split(X, y):
    X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[test_index]

    model = lgbm.LGBMClassifier(**lgbr_params)

    # The callback is reached through `wandb.lightgbm`, so this cell does not
    # depend on a bare `wandb_callback` name that is only imported further down.
    model.fit(X=X_train, y=y_train,
              eval_set=[(X_valid, y_valid)],
              eval_metric='logloss',
              early_stopping_rounds=CFG.XG_PATIENCE,
              verbose=150,
              callbacks=[wandb.lightgbm.wandb_callback()],
              )
    modelsLB.append(model)
    # Column 1 of predict_proba is kept as the positive-class probability.
    predsLB.append(model.predict_proba(X_test)[:, 1])

In [32]:
import pandas as pd
from pandas.io.formats.style import Styler
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
%matplotlib inline

import os
import sys
import datetime
from tqdm import tqdm
import copy

import optuna
import wandb
from wandb.lightgbm import wandb_callback, log_summary

import xgboost as xgb
from xgboost.callback import EarlyStopping
import lightgbm as lgbm

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold
from sklearn.metrics import mean_squared_error, log_loss
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

from umap import UMAP

import warnings; warnings.filterwarnings('ignore')

sys.path.append('..')
from config import CFG
from src.data_processing.fi import get_fi
# Call the imported CFG and keep the result under the same name; the settings
# below (NCOLS, NROWS) are read from it as attributes.
CFG = CFG()

# pandas display: thousands separators with two decimals, plus the column/row
# limits taken from CFG.
pd.set_option('display.float_format', '{:,.2f}'.format)
pd.set_option('display.max_columns', CFG.NCOLS)
pd.set_option('display.max_rows', CFG.NROWS)

In [33]:
# One fitted classifier and one vector of positive-class test probabilities per fold.
modelsLB = []
predsLB = []

# Keyword arguments passed to lgbm.LGBMClassifier for every fold.
lgbr_params = dict(
    n_estimators=9999,
    max_depth=5,
    learning_rate=0.00693702575527996,
    subsample=0.20851841295589477,
    colsample_bytree=0.5784778854092203,
    reg_alpha=0.2622912287429849,
    reg_lambda=2.8702494234117617e-08,
    objective='binary',
    metric='logloss',
    boosting_type='gbdt',
    random_state=CFG.SEED,
)

# The run config is replaced by a copy of all LightGBM hyper-parameters.
config = dict(lgbr_params)
wandb.init(project='S3E10', name='LightGBM', group='LightGBM', config=config, reinit=True)

for train_index, test_index in k_fold.split(X, y):
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_valid, y_valid = X.iloc[test_index], y.iloc[test_index]

    model = lgbm.LGBMClassifier(**lgbr_params)

    # The held-out fold is the evaluation set; the W&B callback is attached to the fit.
    model.fit(
        X=X_train, y=y_train,
        eval_set=[(X_valid, y_valid)],
        eval_metric='logloss',
        early_stopping_rounds=CFG.XG_PATIENCE,
        verbose=150,
        callbacks=[wandb_callback()],
    )
    modelsLB.append(model)
    # Column 1 of predict_proba is kept as the positive-class probability.
    predsLB.append(model.predict_proba(X_test)[:, 1])

In [34]:
# Mean feature importance across the CFG.NFOLDS * CFG.REPEATS fitted LightGBM
# models, logged to W&B as a bar chart.
feature_importance = np.average(
    [modelsLB[x].feature_importances_ for x in range(CFG.NFOLDS*CFG.REPEATS)],
    axis=0,
)
data = [list(pair) for pair in zip(X.columns, feature_importance)]

table = wandb.Table(columns=['label', 'value'], data=data)
importance_chart = wandb.plot.bar(table, 'label', 'value', title="Feature Importance")
wandb.log({'LGBM_Feature_Importance': importance_chart})

In [35]:
import pandas as pd
from pandas.io.formats.style import Styler
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
%matplotlib inline

import os
import sys
import datetime
from tqdm import tqdm
import copy

import optuna
import wandb
from wandb.lightgbm import wandb_callback, log_summary

import xgboost as xgb
from xgboost.callback import EarlyStopping
import lightgbm as lgbm

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold
from sklearn.metrics import mean_squared_error, log_loss
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier, Ridge, BayesianRidge

from umap import UMAP

import warnings; warnings.filterwarnings('ignore')

sys.path.append('..')
from config import CFG
from src.data_processing.fi import get_fi
# Call the imported CFG and keep the result under the same name; the settings
# below (NCOLS, NROWS) are read from it as attributes.
CFG = CFG()

# pandas display: thousands separators with two decimals, plus the column/row
# limits taken from CFG.
pd.set_option('display.float_format', '{:,.2f}'.format)
pd.set_option('display.max_columns', CFG.NCOLS)
pd.set_option('display.max_rows', CFG.NROWS)

In [36]:
# Standardise the feature columns; the "Class" target is carried over unscaled.
# Reads `train`, `orig`, `test` and `config` from earlier cells.
config['scaler'] = "standard"

# Fit the scaler on the competition training features only.
train_features = train.drop("Class", axis=1)
scaler = StandardScaler()
scaled_tr = pd.DataFrame(scaler.fit_transform(train_features), columns=train_features.columns)
scaled_tr["Class"] = train["Class"]

# The test set must be transformed with the statistics learned from `train`.
# Previously the same scaler object was refitted on `orig` first, so the test
# set ended up scaled with the original dataset's mean/std instead.
scaled_tst = pd.DataFrame(scaler.transform(test), columns=test.columns)

# The original dataset keeps its own, separately fitted scaler so that `scaler`
# stays tied to the training data.
orig_features = orig.drop("Class", axis=1)
orig_scaler = StandardScaler()
scaled_orig = pd.DataFrame(orig_scaler.fit_transform(orig_features), columns=orig_features.columns)
scaled_orig["Class"] = orig["Class"]

In [37]:
def get_n_components(df):
    """Fit a full-rank PCA on the feature columns of ``df`` and report explained variance.

    A ``Class`` column, when present, is treated as the label: it is left out of
    the PCA fit (so it neither leaks into the components nor adds an extra one)
    and is copied onto the returned frame afterwards.

    Args:
        df: DataFrame of numeric feature columns, optionally with a ``Class`` column.

    Returns:
        Tuple ``(components, exp_var, exp_var_cumsum)``:
        ``components`` is a DataFrame with columns ``PC0, PC1, ...`` (plus
        ``Class`` when ``df`` had it), ``exp_var`` is the explained-variance
        ratio of each component and ``exp_var_cumsum`` its cumulative sum.
    """
    has_target = 'Class' in df.columns
    features = df.drop(columns='Class') if has_target else df

    # PCA cannot produce more components than min(n_samples, n_features).
    n_components = min(features.shape)
    pca = PCA(n_components=n_components, random_state=CFG.SEED)

    components = pd.DataFrame(
        pca.fit_transform(features),
        columns=[f'PC{i}' for i in range(n_components)],
        index=df.index,  # keep row labels so the label column aligns row-for-row
    )
    if has_target:
        components['Class'] = df['Class']

    exp_var = pca.explained_variance_ratio_
    exp_var_cumsum = np.cumsum(exp_var)
    return components, exp_var, exp_var_cumsum

In [38]:
# Explained variance per principal component (bars) with the cumulative curve on top.
components, exp_var, exp_var_cumsum = get_n_components(scaled_tr)

fig = go.Figure()

# x positions are 0-based, matching the PC0, PC1, ... column names.
fig.add_trace(
    go.Scatter(
        x=list(range(len(exp_var_cumsum))),
        y=exp_var_cumsum,
        name='Cumulative Explained Variance',
        line=dict(color=palette[0], width=2),
    )
)
fig.add_trace(
    go.Bar(
        x=list(range(len(exp_var_cumsum))),
        y=exp_var,
        name='Explained Variance',
        marker_color=palette[1],
        width=0.7,
    )
)

# Font sizes are set through the nested `title.font` objects: the flat
# `titlefont` properties are deprecated and no longer accepted by recent Plotly.
fig.update_layout(
    title=dict(text='Explained Variance by Principal Components', x=0.5, font=dict(size=24)),
    xaxis=dict(title=dict(text='Principal Components', font=dict(size=20)), tickfont=dict(size=16)),
    yaxis=dict(title=dict(text='Explained Variance', font=dict(size=20)), tickfont=dict(size=16)),
    height=500, width=1000,
)

In [39]:
def pca_transform(train, test, N):
    """Project the training features and the test set onto ``N`` principal components.

    The PCA is fitted on ``train`` without its ``Class`` column and then applied
    to ``test``.

    Args:
        train: DataFrame holding the feature columns and a ``Class`` column.
        test: Data with the same feature columns as ``train`` minus ``Class``.
        N: Number of principal components to keep.

    Returns:
        Tuple ``(test_components, X, y)``: the array returned by
        ``pca.transform(test)``, the training projection as a DataFrame with
        columns ``PC0..PC{N-1}``, and the ``Class`` column of ``train``.
    """
    reducer = PCA(n_components=N, random_state=CFG.SEED)
    train_features = train.drop("Class", axis=1)
    pc_names = [f'PC{idx}' for idx in range(N)]

    X = pd.DataFrame(reducer.fit_transform(train_features), columns=pc_names)
    y = train['Class']
    test_components = reducer.transform(test)
    return test_components, X, y

In [40]:
# Number of principal components kept for modelling; recorded in the run config.
N = 3
config.update(n_components=N)

# Project the scaled train/test data and split off the target.
X_test, X, y = pca_transform(scaled_tr, scaled_tst, N)

In [41]:
# One fitted forest and one vector of positive-class test probabilities per fold.
modelsRF = []
predsRF = []

# Keyword arguments for RandomForestClassifier. The keys must match the
# estimator's parameter names exactly (`min_samples_leaf`,
# `min_impurity_decrease`), otherwise the constructor raises a TypeError.
params = {
    'criterion': 'log_loss',
    'n_jobs': -1,
    'random_state': CFG.SEED,
    'verbose': False,
    'n_estimators': 100,
    'max_depth': None,
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'min_weight_fraction_leaf': 0.0,
    'max_features': 'sqrt',
    'max_leaf_nodes': None,
    'min_impurity_decrease': 0.0,
    'bootstrap': True,
    'max_samples': None,
}

# Start a fresh run config with the hyper-parameters, leaving out the listed keys.
config = {
    key: value for key, value in params.items()
    if key not in {'criterion', 'n_jobs', 'random_state', 'verbose', 'bootstrap'}
}
wandb.init(project='S3E10', name='RandomForest', group='RandomForest', config=config, reinit=True)

for train_index, test_index in k_fold.split(X, y):
    X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[test_index]

    model = RandomForestClassifier(**params)
    model.fit(X=X_train, y=y_train)
    yprobas = model.predict_proba(X_valid)
    ypred = model.predict(X_valid)
    # Log loss is defined on predicted probabilities, so it is scored on
    # `yprobas` rather than on the hard 0/1 labels in `ypred`.
    score = log_loss(y_valid, yprobas)
    print(f'Score: {score}')

    modelsRF.append(model)
    # Predict on the PCA-projected test set: the model was trained on the PC
    # columns of `X`, not on the raw `test` frame.
    predsRF.append(model.predict_proba(X_test)[:, 1])
    wandb.sklearn.plot_classifier(model,
                                  X_train, X_valid,
                                  y_train, y_valid,
                                  ypred, yprobas,
                                  [0, 1],
                                  is_binary=True,
                                  model_name='RandomForest')

In [42]:
# One fitted forest and one vector of positive-class test probabilities per fold.
modelsRF = []
predsRF = []

# Keyword arguments for RandomForestClassifier. The keys must match the
# estimator's parameter names exactly (`min_impurity_decrease`), otherwise the
# constructor raises a TypeError.
params = {
    'criterion': 'log_loss',
    'n_jobs': -1,
    'random_state': CFG.SEED,
    'verbose': False,
    'n_estimators': 100,
    'max_depth': None,
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'min_weight_fraction_leaf': 0.0,
    'max_features': 'sqrt',
    'max_leaf_nodes': None,
    'min_impurity_decrease': 0.0,
    'bootstrap': True,
    'max_samples': None,
}

# Start a fresh run config with the hyper-parameters, leaving out the listed keys.
config = {
    key: value for key, value in params.items()
    if key not in {'criterion', 'n_jobs', 'random_state', 'verbose', 'bootstrap'}
}
wandb.init(project='S3E10', name='RandomForest', group='RandomForest', config=config, reinit=True)

for train_index, test_index in k_fold.split(X, y):
    X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[test_index]

    model = RandomForestClassifier(**params)
    model.fit(X=X_train, y=y_train)
    yprobas = model.predict_proba(X_valid)
    ypred = model.predict(X_valid)
    # Log loss is defined on predicted probabilities, so it is scored on
    # `yprobas` rather than on the hard 0/1 labels in `ypred`.
    score = log_loss(y_valid, yprobas)
    print(f'Score: {score}')

    modelsRF.append(model)
    # Predict on the PCA-projected test set: the model was trained on the PC
    # columns of `X`, not on the raw `test` frame.
    predsRF.append(model.predict_proba(X_test)[:, 1])
    wandb.sklearn.plot_classifier(model,
                                  X_train, X_valid,
                                  y_train, y_valid,
                                  ypred, yprobas,
                                  [0, 1],
                                  is_binary=True,
                                  model_name='RandomForest')

In [43]:
# One fitted forest and one vector of positive-class test probabilities per fold.
modelsRF = []
predsRF = []

# Keyword arguments passed to RandomForestClassifier for every fold.
params = {
    'criterion': 'log_loss',
    'n_jobs': -1,
    'random_state': CFG.SEED,
    'verbose': False,
    'n_estimators': 100,
    'max_depth': None,
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'min_weight_fraction_leaf': 0.0,
    'max_features': 'sqrt',
    'max_leaf_nodes': None,
    'min_impurity_decrease': 0.0,
    'bootstrap': True,
    'max_samples': None,
}

# Start a fresh run config with the hyper-parameters, leaving out the listed keys.
config = {
    key: value for key, value in params.items()
    if key not in {'criterion', 'n_jobs', 'random_state', 'verbose', 'bootstrap'}
}
wandb.init(project='S3E10', name='RandomForest', group='RandomForest', config=config, reinit=True)

for train_index, test_index in k_fold.split(X, y):
    X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[test_index]

    model = RandomForestClassifier(**params)
    model.fit(X=X_train, y=y_train)
    yprobas = model.predict_proba(X_valid)
    ypred = model.predict(X_valid)
    # Log loss is defined on predicted probabilities, so it is scored on
    # `yprobas` rather than on the hard 0/1 labels in `ypred`.
    score = log_loss(y_valid, yprobas)
    print(f'Score: {score}')

    modelsRF.append(model)
    # Predict on the PCA-projected test set: the model was trained on the PC
    # columns of `X`, not on the raw `test` frame.
    predsRF.append(model.predict_proba(X_test)[:, 1])
    wandb.sklearn.plot_classifier(model,
                                  X_train, X_valid,
                                  y_train, y_valid,
                                  ypred, yprobas,
                                  [0, 1],
                                  is_binary=True,
                                  model_name='RandomForest')

In [44]:
# Number of principal components kept for modelling; recorded in the run config.
N = 3
config.update(n_components=N)

# Project the scaled train/test data and split off the target.
X_test, X, y = pca_transform(scaled_tr, scaled_tst, N)

In [45]:
# One fitted forest and one vector of positive-class test probabilities per fold.
modelsRF = []
predsRF = []

# Keyword arguments passed to RandomForestClassifier for every fold.
params = {
    'criterion': 'log_loss',
    'n_jobs': -1,
    'random_state': CFG.SEED,
    'verbose': False,
    'n_estimators': 100,
    'max_depth': None,
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'min_weight_fraction_leaf': 0.0,
    'max_features': 'sqrt',
    'max_leaf_nodes': None,
    'min_impurity_decrease': 0.0,
    'bootstrap': True,
    'max_samples': None,
}

# Start a fresh run config with the hyper-parameters, leaving out the listed keys.
config = {
    key: value for key, value in params.items()
    if key not in {'criterion', 'n_jobs', 'random_state', 'verbose', 'bootstrap'}
}
wandb.init(project='S3E10', name='RandomForest', group='RandomForest', config=config, reinit=True)

for train_index, test_index in k_fold.split(X, y):
    X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[test_index]

    model = RandomForestClassifier(**params)
    model.fit(X=X_train, y=y_train)
    yprobas = model.predict_proba(X_valid)
    ypred = model.predict(X_valid)
    # Log loss is defined on predicted probabilities, so it is scored on
    # `yprobas` rather than on the hard 0/1 labels in `ypred`.
    score = log_loss(y_valid, yprobas)
    print(f'Score: {score}')

    modelsRF.append(model)
    # Column 1 of predict_proba is kept as the positive-class probability.
    predsRF.append(model.predict_proba(X_test)[:, 1])
    wandb.sklearn.plot_classifier(model,
                                  X_train, X_valid,
                                  y_train, y_valid,
                                  ypred, yprobas,
                                  [0, 1],
                                  is_binary=True,
                                  model_name='RandomForest')

In [46]:
# Close the W&B run opened by the RandomForest training cell's wandb.init call.
wandb.finish()

In [47]:
from sklearn.linear_model import LogisticRegression, RidgeClassifier

In [48]:
# Show the estimator's repr string. `lr.__repr__` without the call would print
# the bound-method object instead of the text.
lr = LogisticRegression()
print(repr(lr))

In [49]:
# Print the repr string of a default-constructed LogisticRegression.
lr = LogisticRegression()
print(repr(lr))

In [50]:
# Print the type of the estimator's repr.
lr = LogisticRegression()
print(type(repr(lr)))

In [51]:
# Print the repr with its last two characters removed.
lr = LogisticRegression()
print(repr(lr)[:-2])

In [52]:
def fit_linear(X, y, test, model, params):
    """Fit a fresh `model(**params)` on every CV fold and collect test-set probabilities.

    Relies on notebook-level names: the `k_fold` splitter, `log_loss`, and `wandb`
    (`wandb.sklearn.plot_classifier` is called once per fold).

    Args:
        X: Training features; rows are selected positionally with `.iloc`.
        y: Training target aligned with `X`; also indexed with `.iloc`.
        test: Features scored by every fold model.
        model: Estimator class (not an instance) whose instances provide `fit`,
            `predict`, `predict_proba` and `classes_`.
        params: Keyword arguments used to build the estimator for each fold.

    Returns:
        tuple: `(models, preds)` — the fitted estimator of each fold and, per fold,
        column 1 of `predict_proba(test)`.
    """
    models = []
    preds = []
    # The class name is used as the model name on the W&B plots.
    model_name = model.__name__

    for train_index, test_index in k_fold.split(X, y):
        X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[test_index]

        # Build a new estimator per fold under its own name: rebinding `model` would
        # replace the class with an instance and break the next iteration.
        estimator = model(**params)
        estimator.fit(X=X_train, y=y_train)
        yprobas = estimator.predict_proba(X_valid)
        ypred = estimator.predict(X_valid)
        # log_loss is defined on probabilities, not on hard class labels.
        score = log_loss(y_valid, yprobas, labels=estimator.classes_)
        print(f'Score: {score}')

        models.append(estimator)
        preds.append(estimator.predict_proba(test)[:, 1])
        wandb.sklearn.plot_classifier(estimator,
                                      X_train, X_valid,
                                      y_train, y_valid,
                                      ypred, yprobas,
                                      [0, 1],
                                      is_binary=True,
                                      model_name=model_name)
    return models, preds

In [53]:
# Logistic-regression settings handed to fit_linear, which builds the estimator itself.
lr_params = dict(
    random_state=CFG.SEED,
    max_iter=10000,
    n_jobs=-1,
    solver='lbfgs',
    C=0.1,
    tol=1e-3,
)

modelsLR, predsLR = fit_linear(
    X=X, y=y, test=X_test, model=LogisticRegression, params=lr_params
)

In [54]:
def fit_linear(X, y, test, model, params):
    """Cross-validate an estimator class and return the fold models with their test scores.

    Uses the notebook-level `k_fold` splitter, `log_loss` and `wandb`; one set of
    W&B classifier plots is produced per fold.

    Args:
        X: Training features, indexed positionally via `.iloc`.
        y: Training target aligned with `X`, indexed via `.iloc`.
        test: Features every fold model predicts on.
        model: Estimator class (not an instance); instances must offer `fit`,
            `predict`, `predict_proba` and `classes_`.
        params: Keyword arguments for constructing the estimator in each fold.

    Returns:
        tuple: `(models, preds)` — one fitted estimator per fold and, per fold,
        column 1 of `predict_proba(test)`.
    """
    models = []
    preds = []
    # Name shown on the W&B plots; taken from the class so no instance has to be built.
    model_name = model.__name__

    for train_index, test_index in k_fold.split(X, y):
        X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[test_index]

        # Keep `model` bound to the class; each fold gets its own instance.
        estimator = model(**params)
        estimator.fit(X=X_train, y=y_train)
        yprobas = estimator.predict_proba(X_valid)
        ypred = estimator.predict(X_valid)
        # Score the probabilities: log_loss on hard labels is not meaningful.
        score = log_loss(y_valid, yprobas, labels=estimator.classes_)
        print(f'Score: {score}')

        models.append(estimator)
        preds.append(estimator.predict_proba(test)[:, 1])
        wandb.sklearn.plot_classifier(estimator,
                                      X_train, X_valid,
                                      y_train, y_valid,
                                      ypred, yprobas,
                                      [0, 1],
                                      is_binary=True,
                                      model_name=model_name)
    return models, preds

In [55]:
# Settings for the logistic-regression baseline; fit_linear instantiates the class per fold.
lr_params = dict(
    random_state=CFG.SEED,
    max_iter=10000,
    n_jobs=-1,
    solver='lbfgs',
    C=0.1,
    tol=1e-3,
)

modelsLR, predsLR = fit_linear(
    X=X, y=y, test=X_test, model=LogisticRegression, params=lr_params
)

In [56]:
def fit_linear(X, y, test, model):
    """Fit an independent copy of `model` on every CV fold and collect test-set probabilities.

    Uses the notebook-level `k_fold` splitter, `log_loss`, `copy` and `wandb`
    (`wandb.sklearn.plot_classifier` is called once per fold).

    Args:
        X: Training features; rows are selected positionally with `.iloc`.
        y: Training target aligned with `X`; also indexed with `.iloc`.
        test: Features scored by every fold model.
        model: Configured estimator instance providing `fit`, `predict`,
            `predict_proba` and (after fitting) `classes_`. It serves as a template
            only and is not fitted itself.

    Returns:
        tuple: `(models, preds)` — one separately fitted estimator per fold and, per
        fold, column 1 of `predict_proba(test)`.
    """
    models = []
    preds = []
    # An instance is not callable; its class name is what labels the W&B plots.
    model_name = type(model).__name__

    for train_index, test_index in k_fold.split(X, y):
        X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[test_index]

        # Refitting the one shared instance would leave `models` holding the same
        # object N times (all reflecting the last fold), so fit a copy instead.
        fold_model = copy.deepcopy(model)
        fold_model.fit(X=X_train, y=y_train)
        yprobas = fold_model.predict_proba(X_valid)
        ypred = fold_model.predict(X_valid)
        # log_loss is defined on probabilities, not on hard class labels.
        score = log_loss(y_valid, yprobas, labels=fold_model.classes_)
        print(f'Score: {score}')

        models.append(fold_model)
        preds.append(fold_model.predict_proba(test)[:, 1])
        wandb.sklearn.plot_classifier(fold_model,
                                      X_train, X_valid,
                                      y_train, y_valid,
                                      ypred, yprobas,
                                      [0, 1],
                                      is_binary=True,
                                      model_name=model_name)
    return models, preds

In [57]:
# Build the logistic-regression estimator inline and pass the instance to fit_linear.
lr_params = dict(
    random_state=CFG.SEED,
    max_iter=10000,
    n_jobs=-1,
    solver='lbfgs',
    C=0.1,
    tol=1e-3,
)

modelsLR, predsLR = fit_linear(
    X=X, y=y, test=X_test, model=LogisticRegression(**lr_params)
)

In [58]:
# Configure the logistic-regression estimator, keep a handle on it, then cross-validate.
lr_params = dict(
    random_state=CFG.SEED,
    max_iter=10000,
    n_jobs=-1,
    solver='lbfgs',
    C=0.1,
    tol=1e-3,
)
lr = LogisticRegression(**lr_params)
modelsLR, predsLR = fit_linear(X=X, y=y, test=X_test, model=lr)

In [59]:
def fit_linear(X, y, test, model):
    """Cross-validate a configured estimator, fitting a separate copy per fold.

    Uses the notebook-level `k_fold` splitter, `log_loss`, `copy` and `wandb`; one
    set of W&B classifier plots is produced per fold.

    Args:
        X: Training features, indexed positionally via `.iloc`.
        y: Training target aligned with `X`, indexed via `.iloc`.
        test: Features every fold model predicts on.
        model: Configured estimator instance with `fit`, `predict`, `predict_proba`
            and (after fitting) `classes_`. Used as a template; it is not fitted itself.

    Returns:
        tuple: `(models, preds)` — one separately fitted estimator per fold and, per
        fold, column 1 of `predict_proba(test)`.
    """
    models = []
    preds = []
    # `__repr` is not an attribute of the estimator; the class name labels the W&B plots.
    model_name = type(model).__name__

    for train_index, test_index in k_fold.split(X, y):
        X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[test_index]

        # A copy per fold keeps the returned models independent; refitting the shared
        # instance would make every list entry the same last-fold model.
        fold_model = copy.deepcopy(model)
        fold_model.fit(X=X_train, y=y_train)
        yprobas = fold_model.predict_proba(X_valid)
        ypred = fold_model.predict(X_valid)
        # Score the probabilities: log_loss on hard labels is not meaningful.
        score = log_loss(y_valid, yprobas, labels=fold_model.classes_)
        print(f'Score: {score}')

        models.append(fold_model)
        preds.append(fold_model.predict_proba(test)[:, 1])
        wandb.sklearn.plot_classifier(fold_model,
                                      X_train, X_valid,
                                      y_train, y_valid,
                                      ypred, yprobas,
                                      [0, 1],
                                      is_binary=True,
                                      model_name=model_name)
    return models, preds

In [60]:
# Logistic-regression baseline: configure the estimator, then run it through the CV helper.
lr_params = dict(
    random_state=CFG.SEED,
    max_iter=10000,
    n_jobs=-1,
    solver='lbfgs',
    C=0.1,
    tol=1e-3,
)
lr = LogisticRegression(**lr_params)
modelsLR, predsLR = fit_linear(X=X, y=y, test=X_test, model=lr)

In [61]:
# Displays the bound `__repr__` method itself (it is not called here), as the output below shows.
lr.__repr__

<bound method BaseEstimator.__repr__ of LogisticRegression(C=0.1, max_iter=10000, n_jobs=-1, random_state=69, tol=0.001)>

In [62]:
# `lr` is an estimator instance and cannot be called; take its representation directly.
repr(lr)

In [63]:
# String form of the configured estimator; per the output below it is the class name
# followed by the constructor arguments it reports.
str(lr)

'LogisticRegression(C=0.1, max_iter=10000, n_jobs=-1, random_state=69, tol=0.001)'

In [64]:
# lr = LogisticRegression()
# The class name is the label wanted here; trimming two characters off the repr only
# works for an estimator built with no arguments.
print(type(lr).__name__)

In [65]:
# lr = LogisticRegression()
# Read the class name directly instead of guessing how many characters of the repr
# to keep ("LogisticRegression" is longer than 12 characters).
print(type(lr).__name__)

In [66]:
# lr = LogisticRegression()
# Read the class name directly instead of slicing the repr at a fixed width
# ("LogisticRegression" is longer than 16 characters).
print(type(lr).__name__)

In [67]:
def fit_linear(X, y, test, model, name):
    """Cross-validate a configured estimator, fitting a separate copy per fold.

    Uses the notebook-level `k_fold` splitter, `log_loss`, `copy` and `wandb`
    (`wandb.sklearn.plot_classifier` is called once per fold).

    Args:
        X: Training features; rows are selected positionally with `.iloc`.
        y: Training target aligned with `X`; also indexed with `.iloc`.
        test: Features scored by every fold model.
        model: Configured estimator instance providing `fit`, `predict`,
            `predict_proba` and (after fitting) `classes_`. It serves as a template
            only and is not fitted itself.
        name: Model name passed to the W&B classifier plots.

    Returns:
        tuple: `(models, preds)` — one separately fitted estimator per fold and, per
        fold, column 1 of `predict_proba(test)`.
    """
    models = []
    preds = []

    for train_index, test_index in k_fold.split(X, y):
        X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[test_index]

        # Refitting the one shared instance would leave `models` holding the same
        # object N times (all reflecting the last fold), so fit a copy instead.
        fold_model = copy.deepcopy(model)
        fold_model.fit(X=X_train, y=y_train)
        yprobas = fold_model.predict_proba(X_valid)
        ypred = fold_model.predict(X_valid)
        # log_loss is defined on probabilities, not on hard class labels.
        score = log_loss(y_valid, yprobas, labels=fold_model.classes_)
        print(f'Score: {score}')

        models.append(fold_model)
        preds.append(fold_model.predict_proba(test)[:, 1])
        wandb.sklearn.plot_classifier(fold_model,
                                      X_train, X_valid,
                                      y_train, y_valid,
                                      ypred, yprobas,
                                      [0, 1],
                                      is_binary=True,
                                      model_name=name)
    return models, preds

In [68]:
# Logistic-regression baseline, labelled explicitly for the W&B plots.
lr_params = dict(
    random_state=CFG.SEED,
    max_iter=10000,
    n_jobs=-1,
    solver='lbfgs',
    C=0.1,
    tol=1e-3,
)
lr = LogisticRegression(**lr_params)
modelsLR, predsLR = fit_linear(
    X=X, y=y, test=X_test, model=lr, name='LogisticRegression'
)

In [69]:
# Logistic-regression baseline tracked in its own W&B run; the estimator settings
# double as the run config.
lr_params = dict(
    random_state=CFG.SEED,
    max_iter=10000,
    n_jobs=-1,
    solver='lbfgs',
    C=0.1,
    tol=1e-3,
)
wandb.init(
    project='S3E10',
    name='LogisticRegression',
    group='LogisticRegression',
    config=lr_params,
    reinit=True,
)
lr = LogisticRegression(**lr_params)
modelsLR, predsLR = fit_linear(
    X=X, y=y, test=X_test, model=lr, name='LogisticRegression'
)

In [70]:
from sklearn.calibration import CalibratedClassifierCV

# Ridge baseline tracked in its own W&B run.
r_params = {
    'random_state': CFG.SEED,
    'max_iter': 10000,
    'alpha': 0.1,
    'tol': 1e-3
}
wandb.init(project='S3E10', name='RidgeRegression', group='RidgeRegression', config=r_params, reinit=True)
# fit_linear needs a classifier with predict_proba. `Ridge` is a regressor and is not
# among the imports visible in this notebook, so use the imported RidgeClassifier and
# wrap it in a calibrator, which supplies predict_proba.
# The W&B labels are left as they were so existing run grouping is unchanged.
# Results get their own names so the LogisticRegression outputs are not overwritten.
r = CalibratedClassifierCV(RidgeClassifier(**r_params))
modelsR, predsR = fit_linear(X, y, X_test, r, 'RidgeRegression')

In [71]:
from sklearn.calibration import CalibratedClassifierCV

# Ridge baseline tracked in its own W&B run.
r_params = {
    'random_state': CFG.SEED,
    'max_iter': 10000,
    'alpha': 0.1,
    'tol': 1e-3
}
wandb.init(project='S3E10', name='RidgeRegression', group='RidgeRegression', config=r_params, reinit=True)
# `Ridge` is a regressor (and not among the imports visible in this notebook), while
# fit_linear calls predict_proba. Use the imported RidgeClassifier wrapped in a
# calibrator, which supplies predict_proba.
# The W&B labels are left as they were so existing run grouping is unchanged.
r = CalibratedClassifierCV(RidgeClassifier(**r_params))
modelsR, predsR = fit_linear(X, y, X_test, r, 'RidgeRegression')

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016668928650021068, max=1.0…

In [72]:
from sklearn.calibration import CalibratedClassifierCV

# RidgeClassifier baseline tracked in its own W&B run.
r_params = {
    'random_state': CFG.SEED,
    'max_iter': 10000,
    'alpha': 0.1,
    'tol': 1e-3
}
wandb.init(project='S3E10', name='RidgeClassifier', group='RidgeClassifier', config=r_params, reinit=True)
# RidgeClassifier exposes decision_function but no predict_proba, which fit_linear
# calls; wrapping it in a calibrator (default sigmoid method) supplies probabilities.
r = CalibratedClassifierCV(RidgeClassifier(**r_params))
modelsR, predsR = fit_linear(X, y, X_test, r, 'RidgeClassifier')
wandb.finish()