In [6]:
pip install arff2pandas

Collecting arff2pandas
  Using cached arff2pandas-1.0.1-py3-none-any.whl
Collecting pandas
  Using cached pandas-1.2.0-cp39-cp39-macosx_10_9_x86_64.whl (10.7 MB)
Installing collected packages: pandas, arff2pandas
Successfully installed arff2pandas-1.0.1 pandas-1.2.0
You should consider upgrading via the '/usr/local/Cellar/jupyterlab/3.0.0_1/libexec/bin/python -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [8]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


In [61]:
from arff2pandas import a2p

def load(file):
    with open(file) as f:
        df = a2p.load(f)

        labels = """X1 net profit / total assets 
        X2 total liabilities / total assets 
        X3 working capital / total assets 
        X4 current assets / short-term liabilities 
        X5 ((cash + short-term securities + receivables - short-term liabilities) / (operating expenses - depreciation)) * 365 
        X6 retained earnings / total assets 
        X7 EBIT / total assets 
        X8 book value of equity / total liabilities 
        X9 sales / total assets 
        X10 equity / total assets 
        X11 (gross profit + extraordinary items + financial expenses) / total assets 
        X12 gross profit / short-term liabilities 
        X13 (gross profit + depreciation) / sales 
        X14 (gross profit + interest) / total assets 
        X15 (total liabilities * 365) / (gross profit + depreciation) 
        X16 (gross profit + depreciation) / total liabilities 
        X17 total assets / total liabilities 
        X18 gross profit / total assets 
        X19 gross profit / sales 
        X20 (inventory * 365) / sales 
        X21 sales (n) / sales (n-1) 
        X22 profit on operating activities / total assets 
        X23 net profit / sales 
        X24 gross profit (in 3 years) / total assets 
        X25 (equity - share capital) / total assets 
        X26 (net profit + depreciation) / total liabilities 
        X27 profit on operating activities / financial expenses 
        X28 working capital / fixed assets 
        X29 logarithm of total assets 
        X30 (total liabilities - cash) / sales 
        X31 (gross profit + interest) / sales 
        X32 (current liabilities * 365) / cost of products sold 
        X33 operating expenses / short-term liabilities 
        X34 operating expenses / total liabilities 
        X35 profit on sales / total assets 
        X36 total sales / total assets 
        X37 (current assets - inventories) / long-term liabilities 
        X38 constant capital / total assets 
        X39 profit on sales / sales 
        X40 (current assets - inventory - receivables) / short-term liabilities 
        X41 total liabilities / ((profit on operating activities + depreciation) * (12/365)) 
        X42 profit on operating activities / sales 
        X43 rotation receivables + inventory turnover in days 
        X44 (receivables * 365) / sales 
        X45 net profit / inventory 
        X46 (current assets - inventory) / short-term liabilities 
        X47 (inventory * 365) / cost of products sold 
        X48 EBITDA (profit on operating activities - depreciation) / total assets 
        X49 EBITDA (profit on operating activities - depreciation) / sales 
        X50 current assets / total liabilities 
        X51 short-term liabilities / total assets 
        X52 (short-term liabilities * 365) / cost of products sold) 
        X53 equity / fixed assets 
        X54 constant capital / fixed assets 
        X55 working capital 
        X56 (sales - cost of products sold) / sales 
        X57 (current assets - inventory - short-term liabilities) / (sales - gross profit - depreciation) 
        X58 total costs /total sales 
        X59 long-term liabilities / equity 
        X60 sales / inventory 
        X61 sales / receivables 
        X62 (short-term liabilities *365) / sales 
        X63 sales / short-term liabilities 
        X64 sales / fixed assets""".split('\n')

        df.rename(columns = {'Attr'+str(i)+'@NUMERIC':labels[i-1] for i in range(65)}, inplace=True)
        df.rename(columns = {'class@{0,1}': 'Y'}, inplace=True)

        return df

In [62]:
df_1year = load('data/1year.arff')
df_2year = load('data/2year.arff')
df_3year = load('data/3year.arff')
df_4year = load('data/4year.arff')
df_5year = load('data/5year.arff')

In [50]:
def impute_missing_values(df):
    from sklearn.impute import SimpleImputer
    imp=SimpleImputer(missing_values=np.NaN)
    idf=pd.DataFrame(imp.fit_transform(df))
    idf.columns=df.columns
    idf.index=df.index
    return idf

def split_df(df):
    from sklearn.model_selection import train_test_split
    df = impute_missing_values(df)
    
    X = df[df.columns[:-2]]
    Y = df[df.columns[-1]]
    return train_test_split(X,Y)

In [70]:
def RandomForest_grid_search(df):
    """
    Réglage des hyper-paramètres du classifieur Random Forest
    """
    import sys, pickle
    from sklearn.ensemble          import RandomForestClassifier
    from sklearn.model_selection   import GridSearchCV
    from sklearn.metrics           import make_scorer
    from sklearn.preprocessing     import label_binarize
    from sklearn.multiclass        import OneVsRestClassifier
    #grid search :

    parameters = {  "n_estimators"         : [ 10, 100, 200 ], 
                    "min_samples_leaf"     : [  3,   1,   7 ],
                    "criterion"            : ["gini"],#["gini", "entropy"]         ,
                    "max_features"         : ["auto"]      ,
                    "max_depth"            : [None]         , 
                    "n_jobs"               : [ -1]             ,  
                    "class_weight"         : ["auto"]          ,
                    "warm_start"           : [False]           }
    
    grid_type = "RandomForestClassifier"
    
    print ("-"*30, "\n %s"%grid_type); sys.stdout.flush()
    """
        Search 
    """
    
    # Exhaustive Grid Search : will try all parameters combinations
    # vs RandomizedSeachCV where the model test random combinations.
    
    df = impute_missing_values(df)
    X = df[df.columns[:-2]]
    Y = df[df.columns[-1]]
    
    grid         = GridSearchCV(RandomForestClassifier(), {})#, parameters, verbose=1, scoring='f1_weighted')
    
    try :grid.fit(X, Y)
    except Exception as e : print ("[%s - 2] : %s"%(grid_type,e) )
    
    print (" best_score_ %s with =  %s "%( grid.best_score_,  grid.best_estimator_ ) ); sys.stdout.flush()    
    return grid.best_estimator_


In [71]:
RandomForest_grid_search(df_1year)

------------------------------ 
 RandomForestClassifier
 best_score_ 0.9728196898903023 with =  RandomForestClassifier() 


RandomForestClassifier()

In [53]:

def random_forest(df):
    from sklearn.ensemble import RandomForestClassifier

    x_train, x_test, y_train, y_test = split_df(df)
    
    algo   = RandomForestClassifier()
    modele = algo.fit(x_train, y_train)
    print(modele.score(x_test, y_test))
    
random_forest(df_1year)

0.977233921457029


In [65]:
def xgb(df):
    from xgboost import XGBClassifier
    
    x_train, x_test, y_train, y_test = split_df(df)
    
    algo   = XGBClassifier(use_label_encoder=False)t
    modele = algo.fit(x_train, y_train)
    print(modele.score(x_test, y_test))

xgb(df_1year)

0.9778030734206034


In [66]:
def catboost(df):
    from catboost import CatBoostClassifier
    
    x_train, x_test, y_train, y_test = split_df(df)
    
    algo   = CatBoostClassifier()
    modele = algo.fit(x_train, y_train)
    print(modele.score(x_test, y_test))

catboost(df_1year)

Learning rate set to 0.020948
0:	learn: 0.6595033	total: 5.54ms	remaining: 5.53s
1:	learn: 0.6176885	total: 12.2ms	remaining: 6.1s
2:	learn: 0.5877864	total: 17.3ms	remaining: 5.74s
3:	learn: 0.5531123	total: 23.2ms	remaining: 5.79s
4:	learn: 0.5267925	total: 28.7ms	remaining: 5.72s
5:	learn: 0.5032264	total: 34.5ms	remaining: 5.72s
6:	learn: 0.4812985	total: 39.8ms	remaining: 5.65s
7:	learn: 0.4608732	total: 45.4ms	remaining: 5.62s
8:	learn: 0.4411393	total: 50.8ms	remaining: 5.6s
9:	learn: 0.4222547	total: 55.7ms	remaining: 5.51s
10:	learn: 0.4056474	total: 63ms	remaining: 5.66s
11:	learn: 0.3901568	total: 68.7ms	remaining: 5.65s
12:	learn: 0.3748720	total: 74.1ms	remaining: 5.63s
13:	learn: 0.3594270	total: 81ms	remaining: 5.71s
14:	learn: 0.3461683	total: 86ms	remaining: 5.65s
15:	learn: 0.3302245	total: 90.6ms	remaining: 5.57s
16:	learn: 0.3135944	total: 97.3ms	remaining: 5.63s
17:	learn: 0.2995522	total: 103ms	remaining: 5.62s
18:	learn: 0.2890159	total: 108ms	remaining: 5.59s
19

185:	learn: 0.0553563	total: 1.22s	remaining: 5.32s
186:	learn: 0.0552402	total: 1.23s	remaining: 5.33s
187:	learn: 0.0551830	total: 1.24s	remaining: 5.33s
188:	learn: 0.0550147	total: 1.24s	remaining: 5.33s
189:	learn: 0.0549201	total: 1.25s	remaining: 5.32s
190:	learn: 0.0547516	total: 1.25s	remaining: 5.31s
191:	learn: 0.0546867	total: 1.26s	remaining: 5.3s
192:	learn: 0.0546327	total: 1.27s	remaining: 5.29s
193:	learn: 0.0545661	total: 1.27s	remaining: 5.29s
194:	learn: 0.0544686	total: 1.28s	remaining: 5.28s
195:	learn: 0.0543892	total: 1.28s	remaining: 5.27s
196:	learn: 0.0542908	total: 1.29s	remaining: 5.27s
197:	learn: 0.0540153	total: 1.3s	remaining: 5.26s
198:	learn: 0.0539166	total: 1.3s	remaining: 5.25s
199:	learn: 0.0538812	total: 1.31s	remaining: 5.25s
200:	learn: 0.0538015	total: 1.32s	remaining: 5.24s
201:	learn: 0.0536759	total: 1.32s	remaining: 5.23s
202:	learn: 0.0536213	total: 1.33s	remaining: 5.23s
203:	learn: 0.0535393	total: 1.34s	remaining: 5.22s
204:	learn: 0.0

371:	learn: 0.0376085	total: 2.42s	remaining: 4.09s
372:	learn: 0.0375607	total: 2.43s	remaining: 4.09s
373:	learn: 0.0375036	total: 2.44s	remaining: 4.09s
374:	learn: 0.0374347	total: 2.45s	remaining: 4.08s
375:	learn: 0.0373751	total: 2.46s	remaining: 4.08s
376:	learn: 0.0373142	total: 2.46s	remaining: 4.07s
377:	learn: 0.0372650	total: 2.47s	remaining: 4.07s
378:	learn: 0.0371611	total: 2.48s	remaining: 4.06s
379:	learn: 0.0370749	total: 2.48s	remaining: 4.05s
380:	learn: 0.0370456	total: 2.49s	remaining: 4.04s
381:	learn: 0.0369562	total: 2.49s	remaining: 4.04s
382:	learn: 0.0369139	total: 2.5s	remaining: 4.03s
383:	learn: 0.0368806	total: 2.51s	remaining: 4.02s
384:	learn: 0.0368134	total: 2.51s	remaining: 4.01s
385:	learn: 0.0367754	total: 2.52s	remaining: 4.01s
386:	learn: 0.0367316	total: 2.53s	remaining: 4s
387:	learn: 0.0365556	total: 2.53s	remaining: 4s
388:	learn: 0.0364568	total: 2.54s	remaining: 3.99s
389:	learn: 0.0363742	total: 2.54s	remaining: 3.98s
390:	learn: 0.03628

553:	learn: 0.0272920	total: 3.63s	remaining: 2.93s
554:	learn: 0.0272372	total: 3.64s	remaining: 2.92s
555:	learn: 0.0272222	total: 3.65s	remaining: 2.92s
556:	learn: 0.0272041	total: 3.66s	remaining: 2.91s
557:	learn: 0.0271767	total: 3.66s	remaining: 2.9s
558:	learn: 0.0271502	total: 3.67s	remaining: 2.9s
559:	learn: 0.0271177	total: 3.68s	remaining: 2.89s
560:	learn: 0.0270433	total: 3.68s	remaining: 2.88s
561:	learn: 0.0270205	total: 3.69s	remaining: 2.88s
562:	learn: 0.0269847	total: 3.69s	remaining: 2.87s
563:	learn: 0.0269420	total: 3.7s	remaining: 2.86s
564:	learn: 0.0269318	total: 3.71s	remaining: 2.85s
565:	learn: 0.0268365	total: 3.71s	remaining: 2.85s
566:	learn: 0.0268193	total: 3.72s	remaining: 2.84s
567:	learn: 0.0267263	total: 3.73s	remaining: 2.83s
568:	learn: 0.0266869	total: 3.73s	remaining: 2.83s
569:	learn: 0.0266696	total: 3.74s	remaining: 2.82s
570:	learn: 0.0266162	total: 3.75s	remaining: 2.81s
571:	learn: 0.0265870	total: 3.75s	remaining: 2.81s
572:	learn: 0.0

735:	learn: 0.0201880	total: 4.83s	remaining: 1.73s
736:	learn: 0.0201231	total: 4.84s	remaining: 1.73s
737:	learn: 0.0200882	total: 4.85s	remaining: 1.72s
738:	learn: 0.0200760	total: 4.86s	remaining: 1.72s
739:	learn: 0.0200399	total: 4.86s	remaining: 1.71s
740:	learn: 0.0200309	total: 4.87s	remaining: 1.7s
741:	learn: 0.0199864	total: 4.88s	remaining: 1.7s
742:	learn: 0.0199566	total: 4.88s	remaining: 1.69s
743:	learn: 0.0199316	total: 4.89s	remaining: 1.68s
744:	learn: 0.0199139	total: 4.89s	remaining: 1.68s
745:	learn: 0.0198286	total: 4.9s	remaining: 1.67s
746:	learn: 0.0197808	total: 4.91s	remaining: 1.66s
747:	learn: 0.0197603	total: 4.91s	remaining: 1.66s
748:	learn: 0.0197179	total: 4.92s	remaining: 1.65s
749:	learn: 0.0196932	total: 4.93s	remaining: 1.64s
750:	learn: 0.0196661	total: 4.93s	remaining: 1.64s
751:	learn: 0.0196388	total: 4.94s	remaining: 1.63s
752:	learn: 0.0196133	total: 4.95s	remaining: 1.62s
753:	learn: 0.0195721	total: 4.95s	remaining: 1.62s
754:	learn: 0.0

924:	learn: 0.0149253	total: 6.04s	remaining: 490ms
925:	learn: 0.0148893	total: 6.05s	remaining: 484ms
926:	learn: 0.0148744	total: 6.06s	remaining: 477ms
927:	learn: 0.0148525	total: 6.07s	remaining: 471ms
928:	learn: 0.0148174	total: 6.07s	remaining: 464ms
929:	learn: 0.0147729	total: 6.08s	remaining: 457ms
930:	learn: 0.0147301	total: 6.08s	remaining: 451ms
931:	learn: 0.0147147	total: 6.09s	remaining: 444ms
932:	learn: 0.0147068	total: 6.09s	remaining: 438ms
933:	learn: 0.0146649	total: 6.1s	remaining: 431ms
934:	learn: 0.0146580	total: 6.11s	remaining: 425ms
935:	learn: 0.0146170	total: 6.11s	remaining: 418ms
936:	learn: 0.0145745	total: 6.12s	remaining: 411ms
937:	learn: 0.0145618	total: 6.12s	remaining: 405ms
938:	learn: 0.0145551	total: 6.13s	remaining: 398ms
939:	learn: 0.0145370	total: 6.13s	remaining: 392ms
940:	learn: 0.0145184	total: 6.14s	remaining: 385ms
941:	learn: 0.0145083	total: 6.15s	remaining: 378ms
942:	learn: 0.0145038	total: 6.15s	remaining: 372ms
943:	learn: 0

In [None]:
scatter_matrix = pd.plotting.scatter_matrix(df_1year, alpha=0.2)