In [1]:
import warnings
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer

#supressing warnings for readability
warnings.filterwarnings("ignore")

# To plot pretty figures directly within Jupyter
%matplotlib inline

# choose your own style: https://matplotlib.org/3.1.0/gallery/style_sheets/style_sheets_reference.html
plt.style.use('seaborn-whitegrid')

# Go to town with https://matplotlib.org/tutorials/introductory/customizing.html
# plt.rcParams.keys()
mpl.rc('axes', labelsize=14, titlesize=14)
mpl.rc('figure', titlesize=20)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# contants for figsize
S = (8,8)
M = (12,12)
L = (14,14)

# pandas options
pd.set_option("display.max.columns", None)
pd.set_option("display.max.rows", None)
pd.set_option("display.precision", 2)

In [2]:
# Import dataset
# Alternative kept for reference: build the merged frame from the separate
# client and invoice tables by joining them on client_id.
# df_client = pd.read_csv('../data/processed/client_train.csv')
# df_invoice = pd.read_csv('../data/processed/invoice_train.csv')
# df_merged = pd.merge(df_client, df_invoice, on='client_id')
# Active path: read the already-merged table from disk.
df_merged = pd.read_csv('../data/processed/merged_train.csv')

In [3]:
# Column overview of the freshly loaded frame (dtypes as read from the CSV).
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4452681 entries, 0 to 4452680
Data columns (total 33 columns):
 #   Column                        Dtype 
---  ------                        ----- 
 0   Unnamed: 0                    int64 
 1   Unnamed: 0_x                  int64 
 2   district                      int64 
 3   client_id                     object
 4   client_cat                    int64 
 5   region                        int64 
 6   creation_date                 object
 7   target                        bool  
 8   Unnamed: 0_y                  int64 
 9   invoice_date                  object
 10  tarif_type                    int64 
 11  counter_number                int64 
 12  counter_status                int64 
 13  counter_code                  int64 
 14  counter_score                 int64 
 15  counter_coefficient           int64 
 16  consommation_level_1          int64 
 17  consommation_level_2          int64 
 18  consommation_level_3          int64 
 19  

In [4]:
# Reduce memory in two passes over df_merged (columns are replaced in place).

# Pass 1: int64 -> smallest unsigned integer type
int64_columns = df_merged.select_dtypes(include='int64').columns
for name in int64_columns:
    df_merged[name] = pd.to_numeric(df_merged[name], downcast='unsigned')

# Pass 2: strings (object dtype) -> categories
object_columns = df_merged.select_dtypes(include='object').columns
for name in object_columns:
    df_merged[name] = df_merged[name].astype('category')

# Compact summary: dtype counts and total memory usage after the conversion
df_merged.info(verbose=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4452681 entries, 0 to 4452680
Columns: 33 entries, Unnamed: 0 to same_month
dtypes: bool(6), category(6), uint16(3), uint32(10), uint64(1), uint8(7)
memory usage: 336.9 MB


In [5]:
# Per-column dtypes after the downcast / category conversion above.
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4452681 entries, 0 to 4452680
Data columns (total 33 columns):
 #   Column                        Dtype   
---  ------                        -----   
 0   Unnamed: 0                    uint32  
 1   Unnamed: 0_x                  uint32  
 2   district                      uint8   
 3   client_id                     category
 4   client_cat                    uint8   
 5   region                        uint16  
 6   creation_date                 category
 7   target                        bool    
 8   Unnamed: 0_y                  uint32  
 9   invoice_date                  category
 10  tarif_type                    uint8   
 11  counter_number                uint64  
 12  counter_status                uint8   
 13  counter_code                  uint16  
 14  counter_score                 uint8   
 15  counter_coefficient           uint8   
 16  consommation_level_1          uint32  
 17  consommation_level_2          uint32  
 18  co

In [6]:
# Target column and the key shared by the client and invoice tables.
# NOTE(review): `commont_col` looks like a typo for "common_col"; the name is
# kept as-is so existing references keep working.
y = 'target'
commont_col = 'client_id'
id_cols = ['client_id']

# --- client-level columns ---
client_cat_col = [
    'district',
    'client_id',
    'client_cat',
    'region',
]
client_dt_col = ['creation_date']
client_bool_col = ['target']

# --- consumption columns ---
consom_col = [
    'consommation_level_1',
    'consommation_level_2',
    'consommation_level_3',
    'consommation_level_4',
]
# Every consumption level except the first.
consom_without1_col = consom_col[1:]

# --- invoice-level columns ---
invoice_cat_col = [
    'client_id',
    'tarif_type',
    'counter_status',
    'counter_code',
    'month',
    'counter_type',
]
invoice_ordinal_col = ['counter_status']
invoice_dt_col = ['invoice_date']
invoice_num_col = (
    ['counter_number', 'counter_score', 'counter_coefficient']
    + consom_col
    + ['old_index', 'new_index']
)

In [7]:
def bin_data(df, column, num_bins):
    """Quantile-bin ``df[column]`` and label each bin with its edge values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to bin.
    column : str
        Name of the numeric column to bin.
    num_bins : int
        Number of quantiles passed to ``pd.qcut``.

    Returns
    -------
    pandas.Series
        Ordered categorical series aligned with ``df``; each category is the
        string ``"<lower edge> - <upper edge>"`` of its bin.

    Raises
    ------
    ValueError
        Propagated from ``pd.qcut`` when the quantile edges are not unique.
    """
    # One qcut call yields both the bin assignment and the edges, so the
    # quantiles are not computed a second time just to attach labels.
    binned, edges = pd.qcut(df[column], q=num_bins, retbins=True)

    # One label per consecutive pair of edges.
    labels = [f'{lower} - {upper}' for lower, upper in zip(edges[:-1], edges[1:])]

    # Swap the interval categories for the edge labels; order is preserved.
    return binned.cat.rename_categories(labels)

In [8]:
from sklearn.base import BaseEstimator, TransformerMixin

class ConsoAttributesAdder(BaseEstimator, TransformerMixin):
    """Derive features from the consumption columns passed in ``X``.

    Only the derived columns are returned; the input columns themselves are
    not part of the output.

    Parameters
    ----------
    add_total : bool, default True
        Add ``total_consom``, the row-wise sum of all input columns.
    bin : bool, default False
        Add ``consommation_level_1bin``, the 9-quantile bin of
        ``consommation_level_1`` (only if that column is present in ``X``).
    above_100 : bool, default False
        Add ``<col>_above_100`` for every input column, True where the value
        is greater than 100.
    """

    def __init__(self, add_total=True, bin=False, above_100=False):
        self.add_total = add_total
        self.bin = bin
        self.above_100 = above_100

    def fit(self, X, y=None):
        """Nothing is learned; present for the scikit-learn transformer API."""
        return self

    def transform(self, X):
        """Return a DataFrame (indexed like ``X``) with the requested features."""
        # Always derive from the *input* columns. Iterating over a frame that
        # already contains derived columns made add_total=True combined with
        # above_100=True fail with KeyError on X['total_consom'].
        source_cols = list(X.columns)
        features = pd.DataFrame(index=X.index)

        if self.add_total:
            features['total_consom'] = X[source_cols].sum(axis=1)

        if self.above_100:
            for col in source_cols:
                # Vectorized comparison instead of a per-element Python lambda.
                features[col + '_above_100'] = X[col] > 100

        bin_source = 'consommation_level_1'
        if self.bin and bin_source in source_cols:
            # NOTE(review): the quantile edges are computed from whatever frame
            # is passed to transform(), so two different frames get different
            # bins; learning the edges in fit() would make them consistent.
            features[bin_source + 'bin'] = bin_data(X, bin_source, 9)

        return features

class DateAttributesAdder(BaseEstimator, TransformerMixin):
    """Turn every column of ``X`` into a month feature and a year feature.

    For an input column ``c`` the output contains ``c + 'month'`` followed by
    ``c + 'year'``; the parsed date columns themselves are not returned.
    """

    def fit(self, X, y=None):
        """Nothing is learned; present for the scikit-learn transformer API."""
        return self

    def transform(self, X):
        """Parse each column with ``pd.to_datetime`` and return its month/year."""
        frame = X.copy()
        derived_names = []
        for name in frame.columns:
            month_name = name + 'month'
            year_name = name + 'year'
            # Parse once, store it back, then read the calendar parts from it.
            frame[name] = pd.to_datetime(frame[name])
            frame[month_name] = frame[name].dt.month
            frame[year_name] = frame[name].dt.year
            derived_names.extend([month_name, year_name])
        return frame[derived_names]
    
class NumericalToCategoricalTransformer(BaseEstimator, TransformerMixin):
    """Label-encode every column of a DataFrame independently.

    ``fit`` trains one ``LabelEncoder`` per column and stores them in
    ``self.encoders`` (column name -> encoder). ``transform`` returns a 2-D
    array with one encoded column per input column, in input column order.
    """

    def __init__(self):
        # Kept so the attribute exists before fit(); fit() rebuilds it.
        self.encoders = {}

    def fit(self, X, y=None):
        """Fit a fresh encoder per column of ``X``."""
        # Rebuild the mapping on every fit so encoders from an earlier fit on
        # different columns do not linger.
        self.encoders = {col: LabelEncoder().fit(X[col]) for col in X.columns}
        return self

    def transform(self, X):
        """Encode each column of ``X`` with the encoder fitted for that column.

        Raises ``KeyError`` for a column that was not seen during ``fit``.
        """
        encoded = [self.encoders[col].transform(X[col]) for col in X.columns]
        if not encoded:
            # Stacking an empty list raises; return an (n_rows, 0) array instead.
            return np.empty((len(X), 0))
        return np.column_stack(encoded)

In [9]:
# --- Consumption features ---------------------------------------------------
# Row-wise total of the consumption levels (ConsoAttributesAdder defaults).
consom_col_num = Pipeline([
    ('attribs_adder', ConsoAttributesAdder()),
])

# One boolean "> 100" flag per consumption level.
consom_col_cat = Pipeline([
    ('attribs_adder', ConsoAttributesAdder(add_total=False, above_100=True)),
])

# Quantile bin of consommation_level_1, one-hot encoded.
# `sparse_output=False` is the current spelling of the former `sparse=False`
# argument, which scikit-learn deprecated in 1.2 and removed in 1.4
# (requires scikit-learn >= 1.2).
consom_col_bin = Pipeline([
    ('attribs_adder', ConsoAttributesAdder(add_total=False, bin=True)),
    ('one_hot_encoder', OneHotEncoder(sparse_output=False)),
])

# --- Generic pipelines --------------------------------------------------------
num_pipe = Pipeline([
    ('std_scaler', StandardScaler())
])

date_pipe = Pipeline([
    ('attribs_adder', DateAttributesAdder())
])

one_hot_pipe = Pipeline([
    ('label_encoder', NumericalToCategoricalTransformer()),
    ('one_hot_encoder', OneHotEncoder(sparse_output=False)),
])

# Categorical columns to one-hot encode: all invoice/client categoricals except
# the ordinal ones (encoded separately) and the identifier columns.
one_hot_cols = [x for x in invoice_cat_col + client_cat_col if x not in invoice_ordinal_col + id_cols]

# ColumnTransformer on all included columns.
# Note columns that are not specified are dropped by default
transformers = {
    "consom_num": ("consom_num", consom_col_num, consom_col),
    "consom_cat": ("consom_cat", consom_col_cat, consom_col),
    "consom_bin": ("consom_bin", consom_col_bin, consom_col),
    "num_pipe": ("num_pipe", num_pipe, invoice_num_col),
    "date_pipe": ("date_pipe", date_pipe, invoice_dt_col+client_dt_col),
    "ord_cat_pipe": ("ord_cat_pipe", OrdinalEncoder(), invoice_ordinal_col),
    'one_hot_pipe': ('one_hot_pipe', one_hot_pipe, one_hot_cols),
}
# The dict keeps insertion order, which fixes the column order of the output.
prep = ColumnTransformer(transformers=list(transformers.values()))

In [10]:
# Fit the ColumnTransformer on the merged frame and build the feature matrix
# in one step (columns not listed in `prep` are dropped).
X_train = prep.fit_transform(df_merged)
# Show the first transformed row as a quick sanity check.
X_train[0]

array([ 8.20000000e+01,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  1.00000000e+00,
        0.00000000e+00,  0.00000000e+00, -7.42677148e-02,  5.01333668e-01,
       -9.71455712e-03, -4.43843763e-01, -8.89030198e-02, -1.60238307e-01,
       -6.09058051e-02, -8.59368943e-02, -9.60323709e-02,  3.00000000e+00,
        2.01400000e+03,  1.20000000e+01,  1.99400000e+03,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  1.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  

In [11]:
# Number of features (columns) produced by the preprocessing step.
X_train.shape[1]

131

In [12]:
# Target vector, followed by the share of each class in percent of all rows.
labels = df_merged['target']
labels.value_counts().div(len(df_merged)).mul(100)

target
False    92.11
True      7.89
Name: count, dtype: float64

In [13]:
from imblearn.under_sampling import RandomUnderSampler

# The target is heavily imbalanced (about 92% False vs 8% True above), so the
# rows are randomly under-sampled; random_state is fixed for reproducibility.
sampler = RandomUnderSampler(random_state=42)
# Resample the feature matrix and the labels together so they stay aligned.
X_resampled, y_resampled = sampler.fit_resample(X_train, labels)

In [14]:
import pickle

# Persist the resampled features and labels so later sessions can reload them
# instead of recomputing the preprocessing and resampling.
artifacts = (
    ('../data/var_store/X_resampled.pkl', X_resampled),
    ('../data/var_store/y_resampled.pkl', y_resampled),
)
for target_path, payload in artifacts:
    with open(target_path, 'wb') as f:
        pickle.dump(payload, f)

In [15]:
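# Uncomment this cell to reload the cached arrays written by the previous cell
# instead of recomputing them (only unpickle files you created yourself).
# import pickle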

# with open('../data/var_store/X_resampled.pkl', 'rb') as f:
#     X_resampled = pickle.load(f)

# with open('../data/var_store/y_resampled.pkl', 'rb') as f:
#     y_resampled = pickle.load(f)