In [5]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import FeatureUnion, Pipeline 
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.validation import check_is_fitted
import warnings
import glob

warnings.filterwarnings("ignore")

In [6]:
df = pd.read_csv("clean_dataset.csv", dtype = {"cnae": str})

In [7]:
# _h0, _h1, _h2
# _h0: history 0, here h0 means the year 2017 (historia 0, aquí h0 significa el año 2017)
# _h1: history -1, here h1 means the year 2016 (historia -1, aquí h1 significa el año 2016)
# _h2: history -2, here h2 means the year 2015 (historia -2, aquí h2 significa el año 2015)

# Ebita Margin - Ebitda / Turn over (Ventas)
# p49100: Profit (Resultado del ejercicio)
# p40800: Amortization (Amortización) 
# p40100: Sales Turnover (Ingresos de Explotación)
# p40500: Other sales (Otros Ingresos)
df['ebitda_income'] = (df.p49100_h1+df.p40800_h1)/(df.p40100_mas_40500_h1) 

# Total Debt / Ebita 
# p31200: Short Term Debt / Deuda a corto plazo
# p32300: Long Term Debt / Deuda a largo plazo
# p49100: Profit (Resultado del ejercicio)
# p40800: Amortization (Amortización) 
df['debt_ebitda'] =(df.p31200_h1 + df.p32300_h1) /(df.p49100_h1+df.p40800_h1) 

# rraa_rrpp: Financial leveraging / apalancamiento financiero 
# p10000: Total Assets / Total activos
# p20000: Own Capital / Patrimonio neto
df['rraa_rrpp'] = (df.p10000_h1 - df.p20000_h1) /df.p20000_h1

# Log of Operating Income
df['log_operating_income'] = np.log(df.p40100_mas_40500_h1)

In [8]:
df_clean = df[['ebitda_income','debt_ebitda','rraa_rrpp','log_operating_income','target_status', 'cnae']].replace([np.inf, -np.inf], np.nan).dropna()
X = df_clean[['ebitda_income','debt_ebitda','rraa_rrpp','log_operating_income', 'cnae']]
y = df_clean['target_status']

#X_train, X_test, y_train, y_test = train_test_split( X, y , test_size = 0.2 , random_state = 42, stratify = y)

In [9]:
df_clean.target_status.value_counts()

0    30247
1     3582
Name: target_status, dtype: int64

We have a clear problem of an imbalanced dataset, so we have to deal with some techniques in order to try to score better

### UPSAMPLING

In [10]:
from sklearn.utils import resample

X = df_clean[['ebitda_income','debt_ebitda','rraa_rrpp','log_operating_income', 'cnae']]
y = df_clean['target_status']

# setting up testing and training sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=27)

# concatenate our training data back together
X = pd.concat([X_train, y_train], axis=1)

# separate minority and majority classes
active = X[X.target_status==0]
closed = X[X.target_status==1]

# upsample minority
closed_upsampled = resample(closed,
                          replace=True, # sample with replacement
                          n_samples=len(active), # match number in majority class
                          random_state=27) # reproducible results

# combine majority and upsampled minority
upsampled = pd.concat([active, closed_upsampled])

# check new class counts
upsampled.target_status.value_counts()

1    22726
0    22726
Name: target_status, dtype: int64

In [11]:
X_train = upsampled.drop(columns = ["target_status"])
y_train = upsampled.target_status

In [8]:
#Categrical features to pass down the categorical pipeline 
categorical_features = ["sector"]

#Numerical features to pass down the numerical pipeline 
numerical_features = ['ebitda_income','debt_ebitda','rraa_rrpp','log_operating_income']

In [9]:
class CNAE_Transformer(BaseEstimator, TransformerMixin ):   

    #Return self nothing else to do here
    def fit( self, X, y = None  ):
        return self
    
    #Transformer method we wrote for this transformer 
    def transform(self, X , y = None ):  
        X = X.copy()
        X.loc[:, "sector"] = X.cnae.str[:2]
        X.sector = X.sector.str.strip()
        X = X.replace({"sector":""}, "missing")
        return X

In [10]:
class Mean_Imputer(BaseEstimator, TransformerMixin ):   

    #Return self nothing else to do here
    def fit( self, X, y = None  ):
        return self
    
    #Transformer method we wrote for this transformer 
    def transform(self, X , y = None ):  
        numeric_column_names = X.select_dtypes(include =["float64", "int"]).columns
        X = X.copy()
        X[numeric_column_names] = X[numeric_column_names].fillna(X.mean())
        return X

In [11]:
class GroupNormalizer(BaseEstimator, TransformerMixin):
    '''
    Class used for imputing missing values in a pd.DataFrame using either mean or median of a group.
    
    Parameters
    ----------    
    group_cols : list
        List of columns used for calculating the aggregated value 
    target : str
        The name of the column to impute
    metric : str

    Returns
    -------
    X : array-like
        The array with imputed values in the target column
    '''
    def __init__(self, group_cols, target):
        
        self.group_cols = group_cols
        self.target = target
    
    def fit(self, X, y=None):
        
        assert pd.isnull(X[self.group_cols]).any(axis=None) == False, 'There are missing values in group_cols'
        
        impute_map = X.groupby(self.group_cols)[self.target].agg([np.mean, np.std]) \
                                                            .reset_index(drop=False)
        self.impute_map_ = impute_map.fillna(impute_map.median())
        
        impute_map_total = X[self.target].agg([np.mean, np.std]) 
        self.impute_map_total = impute_map_total.fillna(impute_map_total.median())        

        return self
    
    
    def normalizer_sector(self, df):
        
        df_normalized = pd.DataFrame(columns = df.columns)        
        for group, x in df.groupby("sector"):
            if any(x.sector.isin(self.impute_map_.sector)):
                impute_sector = self.impute_map_.loc[self.impute_map_.sector.isin(x.sector)]

                mean = impute_sector.xs("mean", level = 1, axis = 1)
                std = impute_sector.xs("std", level = 1, axis = 1)
                x[self.target] = (x[self.target] - mean.iloc[0]) -(x[self.target] - std.iloc[0])
            else:
                x.loc[:, self.target] = (x[self.target] - self.impute_map_total.loc['mean']) / self.impute_map_total.loc['std']
            df_normalized = df_normalized.append(x)               
        return df_normalized
    
    def normalizer_total(self, df):
        
        df.loc[:, self.target] = (df[self.target] - self.impute_map_total.loc['mean']) / self.impute_map_total.loc['std']
        return df
        
    
    def transform(self, X, y=None):
        
        # make sure that the imputer was fitted
        check_is_fitted(self, 'impute_map_')
        
        X = X.copy()
        df_final = pd.DataFrame(columns = X.columns)
        
        # Primero vemos si el sector de la tupla que queremos trasnformar está en el atributo impute_map_, que contiene
        # la media y la desviación estándar por grupo de cuando se entrenó el modelo. Si no está, aplicamos la normalización
        # basándonos en la media y la desviación estándar de todo el dataset con el que se entrenó.

        df_final = self.normalizer_sector(X)
            
        return df_final

In [12]:
#Custom Transformer that extracts columns passed as argument to its constructor 
class FeatureSelector( BaseEstimator, TransformerMixin ):
    #Class Constructor 
    def __init__( self, feature_names ):
        self._feature_names = feature_names 
    
    #Return self nothing else to do here    
    def fit( self, X, y = None ):
        return self 
    
    #Method that describes what we need this transformer to do
    def transform( self, X, y = None ):
        return X[ self._feature_names ] 

In [13]:
preprocessing = Pipeline([("CNAE_Transformer", CNAE_Transformer()), ("Mean_Imputer", Mean_Imputer()), ("standarize", GroupNormalizer(["sector"], numerical_features))])
categorical_pipeline = Pipeline( steps = [ ( 'preprocessing', preprocessing ), 
                                  ( 'cat_selector', FeatureSelector(categorical_features)),
                                  ( 'one_hot_encoder', OneHotEncoder( sparse = False ) ) ] )

numerical_pipeline = Pipeline( steps = [( 'preprocessing', preprocessing ), 
                                  ( 'cat_selector', FeatureSelector(numerical_features))])


full_pipeline = FeatureUnion( transformer_list = [ ( 'categorical_pipeline', categorical_pipeline ), 
                                                  
                                                  ( 'numerical_pipeline', numerical_pipeline ) ] )

In [14]:
full_pipeline_m = Pipeline( steps = [ ( 'full_pipeline', full_pipeline),
                                  
                                  ( 'model', RandomForestClassifier(random_state=1234)) ] )


full_pipeline_m.fit(X_train, y_train )


y_pred = full_pipeline_m.predict( X_train ) 
y_pred_test = full_pipeline_m.predict( X_test ) 

from sklearn.metrics import accuracy_score
print("Accuracy train: {0}".format(accuracy_score(y_pred,y_train)))

print("Accuracy test: {0}".format(accuracy_score(y_pred_test,y_test)))

Accuracy train: 0.8911159024905395
Accuracy test: 0.41948451170489476


In [15]:
from sklearn.metrics import classification_report

print("classification report for train")
print(classification_report(y_train, y_pred))

print("classification report for test")
print(classification_report(y_test, y_pred_test))

classification report for train
              precision    recall  f1-score   support

           0       1.00      0.78      0.88     22726
           1       0.82      1.00      0.90     22726

    accuracy                           0.89     45452
   macro avg       0.91      0.89      0.89     45452
weighted avg       0.91      0.89      0.89     45452

classification report for test
              precision    recall  f1-score   support

           0       0.89      0.39      0.55      7521
           1       0.11      0.63      0.19       937

    accuracy                           0.42      8458
   macro avg       0.50      0.51      0.37      8458
weighted avg       0.81      0.42      0.51      8458



### DOWNSAMPLING

In [16]:
# downsample minority
active_downsample = resample(active,
                          replace=True, # sample with replacement
                          n_samples=len(closed), # match number in majority class
                          random_state=27) # reproducible results

# combine majority and downsample minority
downsample = pd.concat([active_downsample, closed])

# check new class counts
downsample.target_status.value_counts()

1    2645
0    2645
Name: target_status, dtype: int64

In [17]:
X_train = downsample.drop(columns = ["target_status"])
y_train = downsample.target_status

In [18]:
preprocessing = Pipeline([("CNAE_Transformer", CNAE_Transformer()), ("Mean_Imputer", Mean_Imputer()), ("standarize", GroupNormalizer(["sector"], numerical_features))])
categorical_pipeline = Pipeline( steps = [ ( 'preprocessing', preprocessing ), 
                                  ( 'cat_selector', FeatureSelector(categorical_features)),
                                  ( 'one_hot_encoder', OneHotEncoder( sparse = False ) ) ] )

numerical_pipeline = Pipeline( steps = [( 'preprocessing', preprocessing ), 
                                  ( 'cat_selector', FeatureSelector(numerical_features))])


full_pipeline = FeatureUnion( transformer_list = [ ( 'categorical_pipeline', categorical_pipeline ), 
                                                  
                                                  ( 'numerical_pipeline', numerical_pipeline ) ] )

In [19]:
full_pipeline_m = Pipeline( steps = [ ( 'full_pipeline', full_pipeline),
                                  
                                  ( 'model', RandomForestClassifier(random_state=1234)) ] )


full_pipeline_m.fit(X_train, y_train )


y_pred = full_pipeline_m.predict( X_train ) 
y_pred_test = full_pipeline_m.predict( X_test ) 

from sklearn.metrics import accuracy_score
print("Accuracy train: {0}".format(accuracy_score(y_pred,y_train)))

#print("Accuracy test: {0}".format(accuracy_score(y_pred_test,y_test)))

Accuracy train: 0.8982986767485822


In [20]:
print("classification report for train")
print(classification_report(y_train, y_pred))

print("classification report for test")
print(classification_report(y_test, y_pred_test))

classification report for train
              precision    recall  f1-score   support

           0       1.00      0.80      0.89      2645
           1       0.83      1.00      0.91      2645

    accuracy                           0.90      5290
   macro avg       0.92      0.90      0.90      5290
weighted avg       0.92      0.90      0.90      5290

classification report for test
              precision    recall  f1-score   support

           0       0.89      0.39      0.55      7521
           1       0.11      0.63      0.19       937

    accuracy                           0.42      8458
   macro avg       0.50      0.51      0.37      8458
weighted avg       0.81      0.42      0.51      8458



### SMOTE

In [21]:
from imblearn.over_sampling import SMOTE

ModuleNotFoundError: No module named 'imblearn'