In [None]:
%%time
from google.colab import drive
drive.mount("/content/drive")
!unzip "/content/drive/MyDrive/Colab Notebooks/AMEX-data/amex-data.zip"

Mounted at /content/drive
Archive:  /content/drive/MyDrive/Colab Notebooks/AMEX-data/amex-data.zip
  inflating: test.csv                
  inflating: train.csv               
  inflating: train_labels.csv        
CPU times: user 2.82 s, sys: 472 ms, total: 3.29 s
Wall time: 4min 27s


In [None]:
!pip install colorama

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting colorama
  Downloading colorama-0.4.5-py2.py3-none-any.whl (16 kB)
Installing collected packages: colorama
Successfully installed colorama-0.4.5


In [None]:
import numpy as np
import pandas as pd
import gc
from colorama import Style, Fore
import sys
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Columns handed to СustomForest as `cat_features`: the forest fits one
# dedicated tree on exactly these columns and leaves them out of its
# per-statistic feature groups.
# Presumably these are one-hot indicator columns named <feature>_<category>
# -- confirm against the preprocessing that produced train.csv.
# 'D_64_-1' and 'D_68_0.0' are not listed; those two columns are removed from
# the training frame before modelling.
# NOTE(review): 'D_141_0.9' breaks the 0.0/1.0 pattern of its neighbours --
# verify that it matches the real column name.
cat_features = ['B_30_0.0', 'B_30_1.0', 'B_30_2.0','B_38_1.0','B_38_2.0','B_38_3.0','B_38_4.0',
            'B_38_5.0','B_38_6.0','B_38_7.0','D_114_0.0','D_114_1.0','D_116_0.0','D_116_1.0',
            'D_117_-1.0','D_117_1.0','D_117_2.0','D_117_3.0','D_117_4.0','D_117_5.0','D_117_6.0',
            'D_120_0.0','D_120_1.0','D_126_-1.0','D_126_0.0','D_126_1.0','D_63_CL','D_63_CO','D_63_CR',
            'D_63_XL','D_63_XM','D_63_XZ','D_64_O','D_64_R','D_64_U','D_68_1.0',
            'D_68_2.0','D_68_3.0','D_68_4.0','D_68_5.0','D_68_6.0','R_2_0.0','R_2_1.0','B_8_0.0',
            'B_8_1.0','S_6_0.0','S_6_1.0','D_54_0.0','D_54_1.0','R_4_0.0','R_4_1.0','P_4_0.0',
            'P_4_1.0','B_33_0.0','B_33_1.0','D_103_0.0','D_103_1.0','D_104_0.0','D_104_1.0',
            'R_27_0.0','R_27_1.0','D_112_0.0','D_112_1.0','D_123_0.0','D_123_1.0','D_127_0.0',
            'D_127_1.0','D_128_0.0','D_128_1.0','D_129_0.0','D_129_1.0','D_130_0.0','D_130_1.0',
            'D_131_0.0','D_131_1.0','D_139_0.0','D_139_1.0','D_141_0.0','D_141_0.9','D_143_0.0',
            'D_143_1.0']

In [None]:
# Number of StratifiedKFold splits used by the first stacking layer.
NUM_FOLDS=5

In [None]:
def amex_metric(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
    """Return the mean of the normalized weighted Gini and the top-4% capture rate.

    Parameters
    ----------
    y_true : pd.DataFrame
        Frame with a ``target`` column; rows with ``target == 0`` get weight 20,
        all other rows weight 1.
    y_pred : pd.DataFrame
        Frame with a ``prediction`` column. It is joined to ``y_true``
        column-wise, i.e. matched on the index.

    Returns
    -------
    float
        ``0.5 * (G + D)`` where ``G`` is the weighted Gini of the predictions
        divided by the weighted Gini of a perfect ranking, and ``D`` is the
        share of positives found within the first 4% of total weight when rows
        are ordered by descending prediction.
    """

    def ranked(truth: pd.DataFrame, scores: pd.DataFrame) -> pd.DataFrame:
        # Rows ordered from highest to lowest prediction, with the row weights attached.
        frame = pd.concat([truth, scores], axis='columns')
        frame = frame.sort_values('prediction', ascending=False)
        frame['weight'] = frame['target'].apply(lambda x: 20 if x==0 else 1)
        return frame

    def top_four_percent_captured(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        frame = ranked(y_true, y_pred)
        weight_budget = int(0.04 * frame['weight'].sum())
        within_budget = frame['weight'].cumsum() <= weight_budget
        is_positive = frame['target'] == 1
        return (within_budget & is_positive).sum() / is_positive.sum()

    def weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        frame = ranked(y_true, y_pred)
        weight = frame['weight']
        weighted_pos = frame['target'] * weight
        # Cumulative weight share ("random" curve) vs. cumulative share of positives found.
        random_share = (weight / weight.sum()).cumsum()
        lorentz = weighted_pos.cumsum() / weighted_pos.sum()
        return ((lorentz - random_share) * weight).sum()

    def normalized_weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        # Using the labels themselves as predictions gives the best attainable Gini.
        perfect = y_true.rename(columns={'target': 'prediction'})
        return weighted_gini(y_true, y_pred) / weighted_gini(y_true, perfect)

    gini_part = normalized_weighted_gini(y_true, y_pred)
    capture_part = top_four_percent_captured(y_true, y_pred)

    return 0.5 * (gini_part + capture_part)

In [None]:
# Read the ';'-separated training table, join the labels from train_labels.csv
# on customer_ID, and display the size of the merged frame in MiB.
labels = pd.read_csv("train_labels.csv")
df = pd.read_csv('train.csv', sep=';', encoding='utf-8').merge(labels, on='customer_ID')
sys.getsizeof(df) / 2**20

2959.9750328063965

In [None]:
# Separate the label from the model inputs, then release the merged frame.
# Besides the ID and the label, the columns 'D_64_-1' and 'D_68_0.0' are left
# out of the inputs.
non_feature_columns = ['customer_ID', 'target', 'D_64_-1', 'D_68_0.0']
y = df['target']
X = df.drop(columns=non_feature_columns)
del df
_ = gc.collect()

In [None]:
class СustomForest:
    """Ensemble of decision trees, each trained on one named group of columns.

    The groups are derived from the column names of the frame passed to
    ``fit``:

    * one group per ``<class>_<stat>`` pair, built from the columns that are
      neither in ``cat_features`` nor ``'S_2'`` and are named
      ``<class>_<number>_<stat>`` (e.g. ``D_39_mean`` goes to ``D_mean``);
    * one group per feature class ``D``, ``S``, ``P``, ``B``, ``R`` holding
      every column whose name starts with that class;
    * one group (key ``'cat_fwatures'``) holding ``cat_features``.

    ``predict_proba`` returns the unweighted mean of the per-tree
    probabilities.

    Parameters
    ----------
    criterion, max_depth, random_state, min_samples_split, min_samples_leaf
        Forwarded unchanged to every ``DecisionTreeClassifier``.
    cat_features : list of str, optional
        Columns forming the categorical group. ``None`` (default) means none.
    verbose : bool
        Print the group name before each tree is fitted.

    Attributes
    ----------
    fitted_trees : dict
        Group name -> fitted tree, set by ``fit``.
    feature_groups_ : dict
        Group name -> list of column names used by that tree, set by ``fit``.
    """

    def __init__(self, criterion='gini', max_depth=None,
                 random_state=None, min_samples_split=2,
                 min_samples_leaf=1, cat_features=None, verbose=False):
        self.criterion = criterion
        self.max_depth = max_depth
        self.random_state = random_state
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        # Copy instead of sharing a mutable default / the caller's list.
        self.cat_features = [] if cat_features is None else list(cat_features)
        self.verbose = verbose

    def __str__(self) -> str:
        return "СustomForest(criterion={0}, random_state={1}, verbose={2})".format(
            self.criterion, self.random_state, self.verbose
        )

    @staticmethod
    def __get_feature_classes(X):
        """Map every feature class to the columns of ``X`` whose name starts with it.

        The class is the part of the column name before the first ``'_'``.
        A substring test would be wrong here: ``'R' in 'D_63_CR'`` is true
        although that column belongs to class ``D``.
        """
        feature_classes = ['D', 'S', 'P', 'B', 'R']
        columns = X.columns.to_list()
        return {
            classf: [feature for feature in columns if feature.split('_')[0] == classf]
            for classf in feature_classes
        }

    def __get_grouped_features(self, X):
        """Group the non-categorical columns of ``X`` by ``<class>_<stat>``.

        Columns listed in ``cat_features`` and the column ``'S_2'`` are
        skipped (they need not be present). Every remaining column must be
        named ``<class>_<number>_<stat>``; a name with fewer than three
        ``'_'``-separated parts raises ``IndexError``.

        Groups are returned in order of first appearance, so the result is
        the same on every run. Only the column index is read; the frame
        itself is not copied.
        """
        excluded = set(self.cat_features) | {'S_2'}
        groups = {}
        for feature in X.columns:
            if feature in excluded:
                continue
            parts = feature.split('_')
            key = "_".join([parts[0], parts[2]])
            groups.setdefault(key, []).append(feature)
        return groups

    def _build_feature_groups(self, X):
        """Return all non-empty feature groups for ``X`` as ``{name: columns}``."""
        groups = self.__get_grouped_features(X)
        groups.update(self.__get_feature_classes(X))
        # The key spelling is kept as is: it is also the key used in `fitted_trees`.
        groups.update({'cat_fwatures': list(self.cat_features)})
        # A tree cannot be fitted on zero columns, so empty groups are left out.
        return {key: cols for key, cols in groups.items() if len(cols) > 0}

    def fit(self, X, y):
        """Fit one decision tree per feature group.

        Parameters
        ----------
        X : pd.DataFrame
            Training features; the column names define the groups.
        y : array-like
            Training labels, passed to every tree.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If no feature group can be derived from ``X``.
        """
        groups = self._build_feature_groups(X)
        if not groups:
            raise ValueError("no feature groups could be derived from the columns of X")

        trees = {}
        for key, cols in groups.items():
            if self.verbose:
                print("building tree on {0}".format(key))
            tree = DecisionTreeClassifier(criterion=self.criterion, max_depth=self.max_depth,
                                          random_state=self.random_state,
                                          min_samples_split=self.min_samples_split,
                                          min_samples_leaf=self.min_samples_leaf)
            tree.fit(X[cols], y)
            trees[key] = tree

        # Remember the exact columns of every tree so that prediction does not
        # depend on which other columns the prediction frame happens to carry.
        self.feature_groups_ = groups
        self.fitted_trees = trees
        return self

    def predict_proba(self, X):
        """Average the class probabilities of all fitted trees.

        Parameters
        ----------
        X : pd.DataFrame
            Must contain the columns every tree was fitted on; extra columns
            are ignored.

        Returns
        -------
        np.ndarray of shape (n_samples, 2)
            Column 0 / column 1 hold the mean of the trees' first / second
            probability column.
        """
        groups = getattr(self, 'feature_groups_', None)
        if groups is None:
            # `fitted_trees` was set without `fit` recording the groups:
            # fall back to deriving them from the given frame.
            groups = self._build_feature_groups(X)

        proba_sum = None
        for key, tree in self.fitted_trees.items():
            # One call per tree; both probability columns come from the same result.
            proba = np.asarray(tree.predict_proba(X[groups[key]]))[:, [0, 1]]
            proba_sum = proba if proba_sum is None else proba_sum + proba
        return proba_sum / len(self.fitted_trees)

In [None]:
from sklearn.naive_bayes import GaussianNB

# Base models of the first stacking layer. The order of `models` at the bottom
# fixes the row order of the prediction arrays produced by the stacking loop.

gnb = GaussianNB()

# Two group-wise forests that differ in split criterion and seed.
forest1 = СustomForest(criterion='gini', cat_features=cat_features, verbose=True, random_state=13)
forest2 = СustomForest(criterion='entropy', cat_features=cat_features, verbose=True, random_state=21)

# Four random forests: more trees go together with larger minimum leaf sizes,
# alternating entropy/'sqrt' and gini/'log2'.
rf_shared = dict(verbose=5, n_jobs=-1)
rfc1 = RandomForestClassifier(n_estimators=50, criterion='entropy', max_features='sqrt',
                              min_samples_leaf=2, random_state=34, **rf_shared)
rfc2 = RandomForestClassifier(n_estimators=100, criterion='gini', max_features='log2',
                              min_samples_leaf=4, random_state=55, **rf_shared)
rfc3 = RandomForestClassifier(n_estimators=150, criterion='entropy', max_features='sqrt',
                              min_samples_leaf=8, random_state=89, **rf_shared)
rfc4 = RandomForestClassifier(n_estimators=400, criterion='gini', max_features='log2',
                              min_samples_leaf=16, random_state=144, **rf_shared)

# Two L2-penalised logistic regressions that differ in C and seed.
lr_shared = dict(solver='sag', penalty='l2', n_jobs=-1)
lr1 = LogisticRegression(C=7, random_state=233, **lr_shared)
lr2 = LogisticRegression(C=1, random_state=377, **lr_shared)

models = [gnb, rfc1, rfc2, rfc3, rfc4, lr1, lr2, forest1, forest2]

In [None]:
#first layer of Stacking
#
# For every base model, run stratified K-fold cross-validation and collect
#   * train_answers: one out-of-fold prediction per training row, stored at the
#     row's own position so that it lines up with `y`;
#   * test_answers: predictions for test.csv, averaged over the folds.

skf = StratifiedKFold(n_splits=NUM_FOLDS)
# The test chunks are reduced to exactly the training columns (same order):
# the training frame had more columns removed than 'customer_ID' alone.
feature_columns = X.columns
train_answers, test_answers = [], []
for i, model in enumerate(models):
    print(Style.BRIGHT+"{0}th model fitting. Model: {1}".format(i+1, str(model)) + Style.RESET_ALL)
    oof_answers = np.zeros(len(X))
    fold_test_answers = []
    for fold, (train_index, test_index) in enumerate(skf.split(X, y)):
        print(Fore.BLUE + "#" * 10, f"Fold {fold+1}", "#" * 10 + Style.RESET_ALL)
        model.fit(X.iloc[train_index], y.iloc[train_index])
        # Write by position: the test indices of successive folds are not
        # guaranteed to concatenate back into the original row order, so
        # appending fold after fold could misalign predictions and labels.
        oof_answers[test_index] = model.predict_proba(X.iloc[test_index])[:, 1]

        print('predict for test on current fold...')
        # test.csv is streamed in chunks to bound memory use.
        chunk_answers = []
        for test in pd.read_csv('test.csv', chunksize=20000, encoding = 'utf-8', sep = ';'):
            X_submit = test[feature_columns]
            chunk_answers.append(model.predict_proba(X_submit)[:, 1])
            del test, X_submit
            _ = gc.collect()
        # Join the chunks once instead of re-copying a growing array per chunk.
        fold_test_answers.append(np.concatenate(chunk_answers))

    test_answers.append(np.mean(fold_test_answers, axis=0))
    train_answers.append(oof_answers)
    print(Fore.BLUE + "#" * 28, "\n" + Style.RESET_ALL)
train_answers = np.array(train_answers)
test_answers = np.array(test_answers)

[1m1th model fitting. Model: LogisticRegression(C=7, n_jobs=-1, random_state=233, solver='sag')[0m
[34m########## Fold 1 ##########[0m




predict for test on current fold...
[34m########## Fold 2 ##########[0m




predict for test on current fold...
[34m########## Fold 3 ##########[0m




predict for test on current fold...
[34m########## Fold 4 ##########[0m




predict for test on current fold...
[34m########## Fold 5 ##########[0m




predict for test on current fold...
[34m############################ 
[0m
[1m2th model fitting. Model: LogisticRegression(C=1, n_jobs=-1, random_state=377, solver='sag')[0m
[34m########## Fold 1 ##########[0m




predict for test on current fold...
[34m########## Fold 2 ##########[0m




predict for test on current fold...
[34m########## Fold 3 ##########[0m




predict for test on current fold...
[34m########## Fold 4 ##########[0m




predict for test on current fold...
[34m########## Fold 5 ##########[0m




predict for test on current fold...
[34m############################ 
[0m
[1m3th model fitting. Model: СustomForest(criterion=gini, random_state=13, verbose=True)[0m
[34m########## Fold 1 ##########[0m
building tree on R_std
building tree on R_last
building tree on P_std
building tree on P_max
building tree on S_mean
building tree on P_first
building tree on B_max
building tree on D_std
building tree on D_last
building tree on D_min
building tree on B_first
building tree on S_min
building tree on S_max
building tree on B_min
building tree on P_min
building tree on S_last
building tree on R_min
building tree on R_first
building tree on D_first
building tree on D_mean
building tree on B_mean
building tree on R_mean
building tree on R_max
building tree on B_last
building tree on P_mean
building tree on S_first
building tree on B_std
building tree on P_last
building tree on S_std
building tree on D_max
building tree on D
building tree on S
building tree on P
building tree on B
buildi

KeyboardInterrupt: ignored

In [None]:
# Convert the collected per-model predictions to arrays. The stacking cell
# ends with these same two statements, so this cell only changes anything when
# that cell stopped before reaching them (its recorded output ends in a
# KeyboardInterrupt).
train_answers = np.array(train_answers)
test_answers = np.array(test_answers)

In [None]:
# Save the first-layer predictions for the training rows, one column per base
# model. Row i of `train_answers` belongs to the i-th entry of the `models`
# list, which is the order of the names below.
model_names = ['gnb', 'rfc1', 'rfc2', 'rfc3', 'rfc4', 'lr1', 'lr2', 'forest1', 'forest2']
df_train_answ = pd.DataFrame(
    {name: train_answers[position] for position, name in enumerate(model_names)}
)
df_train_answ.to_csv('train_answers.csv', index=False)

In [None]:
# Save the fold-averaged first-layer predictions for the test rows, one column
# per base model. Row i of `test_answers` belongs to the i-th entry of the
# `models` list, which is the order of the names below.
model_names = ['gnb', 'rfc1', 'rfc2', 'rfc3', 'rfc4', 'lr1', 'lr2', 'forest1', 'forest2']
df_test_answ = pd.DataFrame(
    {name: test_answers[position] for position, name in enumerate(model_names)}
)
df_test_answ.to_csv('test_answers.csv', index=False)