## Import Packages

In [None]:
import time
import pandas as pd
import numpy as np
import datetime as dt
from math import *
import logging




# And some Machine Learning modules from scikit-learn
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from copy import copy
from sklearn import preprocessing
from sklearn.metrics import cohen_kappa_score
from category_encoders import *
from sklearn.metrics import roc_curve, precision_recall_curve, auc, make_scorer, recall_score, accuracy_score, precision_score, confusion_matrix
from sklearn import ensemble
from sklearn import metrics

from sklearn.pipeline import Pipeline

# Some modules for plotting and visualizing
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display


# In[232]:


from collections import Counter
from tempfile import mkdtemp
from shutil import rmtree
from sklearn.datasets import make_classification
import category_encoders
import functools
from sklearn.base import BaseEstimator, TransformerMixin
from imblearn.over_sampling import ADASYN, SMOTE, RandomOverSampler
from imblearn.pipeline import make_pipeline
from imblearn.metrics import classification_report_imbalanced


## Load data files

In [None]:
# Input tables, read from the local `chassis/` directory.
df_mixed = pd.read_csv("chassis/df_mixed.csv")
clus_var = pd.read_csv("chassis/cluster_de_variante.csv")

# Feature columns handed to the binary encoder (default `col` argument of
# CalculClassification).
col = [
    "flex",
    "pays",
    "fab",
    "vds",
    "chass",
    "checkD",
    "year",
    "plant",
]

## Filtering

In [None]:
# Threshold on the number of rows a variant must exceed to be kept.
min_instances = 20

# One row per variant id; column 0 holds its number of occurrences, ascending.
b = pd.DataFrame.from_dict(Counter(df_mixed['idvariante']), orient='index')
b = b.sort_values(by=0)

# Keep only rows whose variant occurs strictly more than `min_instances` times.
df_mixed = df_mixed.loc[df_mixed['idvariante'].isin(b.index[b[0] > min_instances])]

## Useful Functions

In [None]:
def _import_joblib():
    """Return the joblib module, wherever the current environment provides it."""
    try:
        import joblib
    except ImportError:
        # Fall back to the copy vendored by older scikit-learn releases when
        # the standalone package is not installed.
        from sklearn.externals import joblib
    return joblib


def _select_training_frame(df, cluster, y_col, filtering):
    """Return ``(frame, target)``: the data to learn from and its target column."""
    if isinstance(cluster, str) and cluster == 'clus':
        # Global model: always predicts the cluster, whatever y_col was given.
        return df.drop(columns='idvariante'), 'class'

    if not isinstance(cluster, (int, np.integer)):
        raise ValueError(
            "cluster must be 'clus' or an integer class id, got {!r}".format(cluster))

    # The label column that is NOT the target must be removed from the features.
    other_label = {'class': 'idvariante', 'idvariante': 'class'}
    if y_col not in other_label:
        raise ValueError(
            "y_col must be 'class' or 'idvariante', got {!r}".format(y_col))

    rows = df[df['class'] == cluster] if filtering else df
    return rows.drop(columns=other_label[y_col]), y_col


def _plot_class_distribution(labels):
    """Show a bar chart of the number of instances per (encoded) class."""
    from IPython import get_ipython

    # Equivalent of the `%matplotlib inline` magic, but valid plain Python and
    # skipped when the code does not run inside IPython.
    shell = get_ipython()
    if shell is not None:
        shell.run_line_magic('matplotlib', 'inline')

    counts = pd.DataFrame.from_dict(Counter(labels), orient='index').sort_values(by=0)
    classes = counts.index.tolist()
    positions = range(len(classes))

    fig = plt.figure(figsize=(15, 10))
    ax = fig.add_subplot(111)
    ax.bar(positions, counts[0].tolist())
    ax.axhline(500)

    plt.xlabel('Classe', fontsize=15)
    plt.ylabel('Nb instances', fontsize=15)
    plt.xticks(positions, classes, fontsize=10, rotation=30)
    plt.show()


def _select_best_ensemble(X_train, y_train):
    """Grid-search four ensemble families and return the best one, refit on the training set."""
    from sklearn.model_selection import GridSearchCV

    pipeline = Pipeline([('est', ensemble.RandomForestClassifier())])
    grid = [
        {'est': [ensemble.BaggingClassifier()]},
        {'est': [ensemble.RandomForestClassifier()]},
        {'est': [ensemble.GradientBoostingClassifier()]},
        {'est': [ensemble.AdaBoostClassifier()]},
    ]
    gs = GridSearchCV(pipeline, grid, scoring='accuracy', n_jobs=-1)
    gs.fit(X_train, y_train)
    return gs.best_estimator_.steps[0][1]


def CalculClassification(cluster='clus', df=df_mixed, y_col='class', resample = True, cv= False, col=col, name = None, filtering=True, model_search=False):
    """Encode, optionally oversample, train, evaluate and persist one classifier.

    Parameters
    ----------
    cluster : 'clus' or int
        'clus' trains the global model that predicts the `class` column
        (`y_col` and `filtering` are then ignored). An integer selects one
        cluster.
    df : pandas.DataFrame
        Data containing the feature columns plus `class` and `idvariante`.
    y_col : {'class', 'idvariante'}
        Target column when `cluster` is an integer; the other label column is
        dropped from the features.
    resample : bool
        Oversample the encoded data with SMOTE before the train/test split.
    cv : bool
        Wrap the classifier in a GridSearchCV over a small parameter grid.
    col : list of str
        Columns passed to the binary encoder.
    name : str, optional
        Suffix of the saved artefacts; defaults to `cluster`.
    filtering : bool
        With an integer `cluster`, keep only the rows whose `class` equals it.
    model_search : bool
        When True, pick the estimator by grid search over Bagging,
        RandomForest, GradientBoosting and AdaBoost (default settings) instead
        of using the fixed classifier; takes precedence over `cv`.

    Returns
    -------
    The fitted estimator (a GridSearchCV instance when `cv` is used).

    Raises
    ------
    ValueError
        If `cluster` is neither 'clus' nor an integer, or if `y_col` is not
        one of the two label columns.

    Side effects: writes to the log file 'calcul_chassis-17-09.log', displays
    the data shape, the class counts and a bar chart, prints a classification
    report, and dumps the encoder, the label encoder and the model under
    'chassi__s/'.
    """
    joblib = _import_joblib()

    # A single basicConfig call: a second call is ignored once the root logger
    # is configured, so the format has to be given together with the file.
    logging.basicConfig(filename='calcul_chassis-17-09.log',
                        level=logging.DEBUG,
                        format='%(asctime)s : %(message)s',
                        datefmt='%m/%d/%Y %I:%M:%S %p')

    # Filtering
    DF, y_col = _select_training_frame(df, cluster, y_col, filtering)

    if not name:
        name = cluster

    # Encoding
    logging.info('{}/{}'.format(name,'encodage') )
    y = DF[y_col]
    X = DF[DF.columns.drop(y_col)]

    display(DF.shape)
    display(Counter(y))

    ce_x = BinaryEncoder(cols=col).fit(X, y)
    X_num = ce_x.transform(X)
    joblib.dump(ce_x, 'chassi__s/encoder/enc_clus%s' % name )

    # Label encoding
    logging.info('{}/{}'.format(name,'label_encoding') )
    le = preprocessing.LabelEncoder()
    le.fit(y)
    y_res = le.transform(y)
    joblib.dump(le, 'chassi__s/label/label_clus%s' % name)

    # Resampling
    logging.info('{}/{}'.format(name,'resampling') )
    if resample:
        # NOTE(review): the `kind` argument only exists in older
        # imbalanced-learn releases; newer ones expose BorderlineSMOTE
        # instead - confirm against the installed version.
        sampler = SMOTE(n_jobs=-1, kind='borderline1')
        # `fit_sample` was renamed `fit_resample`; use whichever is available.
        do_resample = getattr(sampler, 'fit_resample', None) or sampler.fit_sample
        X_res, Y_res = do_resample(X_num, y_res)
    else:
        X_res, Y_res = X_num, y_res

    # Display the per-class distribution
    _plot_class_distribution(Y_res)

    # Split train and test
    X_train, X_test, y_train, y_test = train_test_split(
        X_res, Y_res, test_size=0.2, random_state=20, stratify=Y_res)

    # Training
    logging.info('{}/{}'.format(name,'training') )
    if model_search:
        clf = _select_best_ensemble(X_train, y_train)
    else:
        if isinstance(cluster, str) and cluster == 'clus':
            clf = ensemble.BaggingClassifier(n_estimators=100)
        else:
            clf = ensemble.RandomForestClassifier(n_estimators=100,
                                                  criterion='entropy',
                                                  min_samples_split=3,
                                                  random_state=42,
                                                  max_depth=25,
                                                  n_jobs=-1,
                                                  class_weight=None)

        if cv:
            from sklearn.model_selection import GridSearchCV

            params = clf.get_params()
            if 'max_depth' in params:
                grid = [{'max_depth': [None, 15, 20]}]
            elif 'max_samples' in params:
                grid = [{'n_estimators': [100, 200], 'max_features': [1.0]}]
            else:
                raise ValueError(
                    'no parameter grid defined for {}'.format(type(clf).__name__))
            search = GridSearchCV(clf, grid, scoring='accuracy', cv=None)
            search.fit(X_train, y_train)
            clf = search
        else:
            clf.fit(X_train, y_train)

    # Evaluation: predict once, reuse for both the log entry and the report.
    y_pred = clf.predict(X_test)
    logging.info('{}/{}'.format(cluster, metrics.accuracy_score(y_test, y_pred)) )
    print(metrics.classification_report( y_test, y_pred ))

    # Saving model
    joblib.dump(clf, 'chassi__s/model/cls_%s' % name)

    return clf

## Run Classification

In [None]:
# Number of distinct variants observed in each cluster. The count is the true
# number of (class, idvariante) pairs, so the first variant of a cluster counts
# as 1 rather than 0.
_count = df_mixed.groupby('class')['idvariante'].nunique().to_dict()

# Clusters holding more than 100 distinct variants.
highvar = [cluster_id for cluster_id, n_variants in _count.items() if n_variants > 100]

# The global model ('clus') followed by every cluster that is not in `highvar`,
# in order of first appearance in the data so the run order is reproducible.
_highvar_set = set(highvar)
__list_clus = ['clus'] + [c for c in df_mixed['class'].unique() if c not in _highvar_set]

In [None]:
# Train one model per entry: the global 'clus' model is fitted without
# resampling, every other entry predicts the variant id inside its cluster.
for i in __list_clus:
    run_args = {'resample': False} if i == 'clus' else {'y_col': 'idvariante'}
    CalculClassification(cluster=i, **run_args)

In [None]:
# Cluster id -> ordered list of `clus_var` attribute columns handed to
# ClassiMultiNiveau as `attr_list` for that cluster.
__att = {
    7: ['modele', 'vitessesnbr'],
    9: ['modele', 'energie'],
    17: ['carosserie'],
    11: ['energie'],
    29: ['modele', 'carosserie'],
}

def ClassiMultiNiveau(DF, attr_list, cluster, clus_var_loc, name=None ):
    """Train a hierarchy of classifiers for one cluster, one attribute per level.

    At each level the rows of `DF` are relabelled in the `class` column with
    the position of their variant's value for the first attribute of
    `attr_list` (positions follow ``clus_var_loc[attribute].unique()``). A
    classifier predicting that label is trained, then every resulting group is
    handled in turn: a group with more than 90 distinct variants is split
    again with the next attribute (when one is left), any other group gets a
    classifier predicting `idvariante` directly.

    Parameters
    ----------
    DF : pandas.DataFrame
        Rows of the cluster, with `class` and `idvariante` columns. It is
        copied, so the caller's frame is left untouched.
    attr_list : list of str
        Columns of `clus_var_loc` to split on, outermost level first. The
        list is not modified.
    cluster : int
        Cluster id forwarded to CalculClassification.
    clus_var_loc : pandas.DataFrame
        Variant description table restricted to the rows relevant for `DF`;
        must hold an `id` column and the columns named in `attr_list`.
    name : str, optional
        Prefix of the saved artefacts; defaults to ``str(cluster)``.

    Raises
    ------
    IndexError
        If `attr_list` is empty.
    """
    # Slice instead of pop(): sibling branches of the recursion must all see
    # the same remaining attributes, and the caller's list must stay intact.
    col, remaining = attr_list[0], attr_list[1:]

    attr = clus_var_loc[col].unique().tolist()

    # Work on a private copy so the relabelling below never writes into a
    # slice of the caller's frame.
    DF = DF.copy()
    # NOTE(review): rows whose idvariante matches no attribute value keep their
    # previous `class` value, which could collide with a new label - confirm
    # that every variant of DF is listed in clus_var_loc.
    for j, x in enumerate(attr):
        variante_list = clus_var_loc[ clus_var_loc[col] == x ]['id'].tolist()
        DF.loc[ DF['idvariante'].isin(variante_list), 'class' ] = j

    rad = name if name else str(cluster)

    CalculClassification(cluster=cluster, df=DF, y_col='class', filtering=False, name=rad)

    for j in Counter(DF['class']):
        display('%s------%s' % (rad,j))
        DF_x = DF[ DF['class'] == j ]
        child_name = rad + '_%s'%j

        if remaining and len( DF_x['idvariante'].unique()) > 90:
            # Filter into a new variable: reassigning clus_var_loc here would
            # leave nothing to select from for the following groups.
            sub_clus_var = clus_var_loc[ clus_var_loc[col] == attr[j] ]
            ClassiMultiNiveau( DF=DF_x, attr_list=remaining, cluster=cluster, clus_var_loc=sub_clus_var, name=child_name)
        else:
            CalculClassification(cluster=cluster, df=DF_x, y_col='idvariante', filtering=False, name=child_name)
            




# Run the multi-level classification for every cluster listed in `__att`.
# The loop variable is deliberately left as `i`: rename it only after checking
# that nothing called from this loop reads the module-level `i`.
for i,k in __att.items():

    DF = df_mixed[ df_mixed['class'] == i ]
    # Pass a copy of the attribute list so that `__att` is never consumed by
    # the callee and this cell can be re-run.
    ClassiMultiNiveau(DF=DF, attr_list=list(k), cluster=i, clus_var_loc = clus_var[ clus_var['cluster'] == i ])
