In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from category_encoders import BinaryEncoder
from category_encoders.one_hot import OneHotEncoder
from sklearn.base import TransformerMixin, ClassifierMixin
from sklearn.metrics import roc_auc_score

In [2]:
def import_data():
    train = pd.read_csv('data/_transacao_train.csv', index_col='ID')
    train.rename(columns={'data': 'data_2', 'var10': 'var10_'}, inplace=True)

    test = pd.read_csv('data/_transacao_test.csv', index_col='ID')
    test.rename(columns={'data': 'data_2', 'var10': 'var10_'}, inplace=True)
    
    pessoas = pd.read_csv('data/_pessoas.csv', index_col='ID')
    
    train_ = train.join(pessoas)
    X_train = train_.drop(columns='retorno')
    y_train = train.retorno
    
    test_ = test.join(pessoas)
    
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2)
    
    return X_train, y_train, X_val, y_val, test

In [3]:
X_train, y_train, X_val, y_val, X_test = import_data()

In [4]:
print('X_train shape:', X_train.shape)
print('X_val shape:', X_val.shape)
print('X_test shape:', X_test.shape)

X_train shape: (1551740, 43)
X_val shape: (387936, 43)
X_test shape: (100000, 3)


In [5]:
X_train.head()

Unnamed: 0_level_0,data_2,categoria,var10_,var1,grupo,var2,data,var3,var4,var5,...,var29,var30,var31,var32,var33,var34,var35,var36,var37,var38
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
cli_370459,2022-08-03,tipo2,type 1,tipo2,group 17304,tipo2,2020-11-26,tipo2,tipo7,tipo2,...,False,False,False,False,False,False,False,False,False,0
cli_111738,2023-03-23,tipo2,type 1,tipo2,group 22753,tipo3,2021-11-03,tipo2,tipo7,tipo2,...,False,False,False,False,False,False,False,False,False,66
cli_171061,2022-09-30,tipo4,type 166,tipo2,group 4742,tipo3,2022-08-09,tipo7,tipo6,tipo7,...,True,False,False,True,False,False,True,True,False,94
cli_295643,2023-03-02,tipo2,type 1,tipo2,group 17577,tipo3,2022-08-27,tipo40,tipo25,tipo9,...,False,False,False,True,False,True,False,False,False,88
cli_64887,2022-09-22,tipo3,type 23,tipo2,group 20472,tipo3,2020-11-06,tipo40,tipo25,tipo9,...,False,True,True,False,True,False,False,True,True,66


In [6]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1551740 entries, cli_370459 to cli_276964
Data columns (total 43 columns):
data_2       1551740 non-null object
categoria    1551740 non-null object
var10_       1551740 non-null object
var1         1551740 non-null object
grupo        1551740 non-null object
var2         1551740 non-null object
data         1551740 non-null object
var3         1551740 non-null object
var4         1551740 non-null object
var5         1551740 non-null object
var6         1551740 non-null object
var7         1551740 non-null object
var8         1551740 non-null object
var9         1551740 non-null object
var10        1551740 non-null bool
var11        1551740 non-null bool
var12        1551740 non-null bool
var13        1551740 non-null bool
var14        1551740 non-null bool
var15        1551740 non-null bool
var16        1551740 non-null bool
var17        1551740 non-null bool
var18        1551740 non-null bool
var19        1551740 non-null bool
var20       

In [7]:
cat_columns = ['var' + str(n) for n in range(1, 10)] + ['grupo', 'var10_', 'categoria']

In [8]:
for col in cat_columns:
    print(col, 'number of categories:', X_train[col].nunique())

var1 number of categories: 2
var2 number of categories: 3
var3 number of categories: 43
var4 number of categories: 25
var5 number of categories: 9
var6 number of categories: 7
var7 number of categories: 25
var8 number of categories: 8
var9 number of categories: 9
grupo number of categories: 26659
var10_ number of categories: 6049
categoria number of categories: 6


In [9]:
cat_columns_ = [
    'var1',
    'var2',
    'var5',
    'var6',
    'var8',
    'var9',
    'categoria'
]

In [10]:
class Categorical(TransformerMixin):
    def __init__(self, columns):
        self.columns = columns
    
    def fit(self, df, _):
        self.ohe = OneHotEncoder()
        self.ohe.fit(df[self.columns])
        return self
    
    def transform(self, df):
        df_: pd.DataFrame = df.copy()
        encoded = self.ohe.transform(df_[self.columns])
        df_.drop(columns=self.columns, inplace=True)
        df_ = pd.concat([df_, encoded], axis=1, sort=False)
        return df_
    

class Binary(TransformerMixin):
    def __init__(self):
        self.columns = ['var' + str(n) for n in range(10, 38)]
    
    def fit(self, df, _):
        return self
    
    def transform(self, df):
        df_: pd.DataFrame = df.copy()
        df_[self.columns] = df_[self.columns].astype('int')
        return df_

    
class Normalize(TransformerMixin):
    def __init__(self):
        self.columns = ['var38']
    
    def fit(self, df, _):
        self.norm = MinMaxScaler()
        self.norm.fit(df[self.columns])
        return self
    
    def transform(self, df):
        df_: pd.DataFrame = df.copy()
        df_[self.columns] = self.norm.transform(df_[self.columns])
        return df_
    
    
class Model_RF(ClassifierMixin):
    def __init__(self):
        self.model = RandomForestClassifier(
            n_estimators=2000, 
            max_depth=10,
            n_jobs=-1)

    def fit(self, X_train, y_train):
        self.model.fit(X_train, y_train)
        return self
    
    def predict(self, X_test):
        return self.model.predict(X_test)

In [11]:
pipeline = Pipeline([
    ('one_hot_encoder', Categorical(cat_columns_)),
    ('binary_encoder', Binary()),
    ('norm', Normalize()),
    ('model_rf', Model_RF())] 
)

In [12]:
pipeline.fit(
    X_train.drop(columns=[
        'data',
        'data_2',
        'var3',
        'var4',
        'var7',
        'var10_',
        'grupo'
    ]), 
    y_train)

Pipeline(memory=None,
         steps=[('one_hot_encoder',
                 <__main__.Categorical object at 0x12f43a610>),
                ('binary_encoder', <__main__.Binary object at 0x12f43ae90>),
                ('norm', <__main__.Normalize object at 0x1340ca890>),
                ('model_rf', <__main__.Model_RF object at 0x130efee10>)],
         verbose=False)

In [13]:
pred_log = pipeline.predict(X_val.drop(columns=[
        'data',
        'data_2',
        'var3',
        'var4',
        'var7',
        'var10_',
        'grupo'
    ]))
roc_auc_log = roc_auc_score(y_val, pred_log)
roc_auc_log

0.5037574044595248