# Import des librairies

In [10]:
from sklearn.externals import joblib
import numpy as np
import pandas as pd
import datetime as dt
import logging
import pickle
from collections import Counter
import operator
from sklearn.pipeline import Pipeline

# Définition des fonctions de transformation

In [2]:
def fn(_str):
    """Split a 17-character VIN into its positional fields.

    Returns a ``(1, 7)`` object array holding, in order, characters 0-2,
    3-5, 6-7, 8, 9, 10 and 11-16 of ``_str``. Any input whose length is not
    17 yields an empty list instead.
    """
    if len(_str) != 17:
        return []

    fields = [
        _str[:3],
        _str[3:6],
        _str[6:8],
        _str[8],
        _str[-8],
        _str[-7],
        _str[-6:],
    ]
    return np.array([fields], dtype='object')

In [3]:
from sklearn.base import BaseEstimator, TransformerMixin

# Split the VIN into several columns
def define_column(df):
    """Expand the ``vin`` column of ``df`` into one column per VIN field.

    Rows whose ``vin`` is not exactly 17 characters long are dropped. The
    returned frame keeps the other input columns and adds ``vds``,
    ``chass``, ``checkD``, ``year``, ``plant``, ``seq``, ``pays``, ``fab``
    and ``flex``; the ``vin`` column and the intermediate ``wmi`` column are
    removed. The input frame is not modified.
    """
    # Explicit copy: the assignments below must not write into a slice of
    # the caller's frame (this is what triggers SettingWithCopyWarning).
    df = df[df['vin'].str.len() == 17].copy()

    vin = df['vin']
    df['wmi'] = vin.str[:3]
    df['vds'] = vin.str[3:6]
    df['chass'] = vin.str[6:8]
    df['checkD'] = vin.str[8]
    df['year'] = vin.str[-8]
    df['plant'] = vin.str[-7]
    df['seq'] = vin.str[-6:]

    # Fields derived from the first three characters of the VIN
    wmi = df['wmi']
    df['pays'] = wmi.str[0]
    df['fab'] = wmi.str[1:]
    df['flex'] = wmi.str[1:] + df['chass'].str[-1]

    # Keyword form: pandas 2.x no longer accepts the axis of ``drop`` as a
    # positional argument.
    return df.drop(columns=['vin', 'wmi'])

class VinTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer that applies ``func`` to a pandas DataFrame.

    ``func`` is stored as given and called with the DataFrame passed to
    ``transform``; whatever it returns is the transformed output.
    """

    def __init__(self, func):
        self.func = func

    def fit(self, X, y=None):
        """Learn nothing and return ``self``."""
        return self

    def transform(self, X):
        """Return ``self.func(X)``.

        Raises:
            TypeError: if ``X`` is not a ``pandas.DataFrame``.
        """
        # Explicit check instead of ``assert``: assertions are stripped when
        # Python runs with -O, which would let bad input reach ``func``.
        if not isinstance(X, pd.DataFrame):
            raise TypeError(
                'VinTransformer expects a pandas DataFrame, got {}'.format(
                    type(X).__name__
                )
            )
        return self.func(X)

# Prediction

##### Testing Loop

In [72]:
# Load the models and encoders for one level
# The latest model is stored in the folder .jupyter/chassi__s/*.*
def model(s):
    """Build the prediction pipeline for the model identified by ``s``.

    Loads with joblib, in this order, the classifier, the feature encoder
    and the label encoder whose file names end with ``s``. Returns a tuple
    ``(pipeline, label_encoder)`` where the pipeline chains
    ``VinTransformer(define_column)``, the encoder and the classifier.
    """
    classifier = joblib.load('chassi__s/model/cls_%s' % s)
    encoder = joblib.load('chassi__s/encoder/enc_clus%s' % s)
    label_encoder = joblib.load('chassi__s/label/label_clus%s' % s)

    steps = [
        ('vintransform', VinTransformer(define_column)),
        ('encoder', encoder),
        ('clf', classifier),
    ]
    return Pipeline(steps), label_encoder

In [11]:
# Log file definition
# Everything is configured in ONE basicConfig call: a second call does nothing
# once the root logger has a handler, so the message format and date format
# have to be given together with the file name and level.
logging.basicConfig(
    filename='test_chassis-{}.log'.format(str(dt.datetime.now().date())),
    level=logging.DEBUG,
    format='%(asctime)s : %(message)s',
    datefmt='%m/%d/%Y %I:%M:%S %p'
)

## Different tests we can run

#### VIN Français 

In [107]:
_testFR = pd.read_csv('chassis/df_vin_new.csv')

# Rebuild the VIN as a single string by appending its fields from left to
# right; 'year' and 'seq' are converted to str before being appended.
_vin = _testFR['pays']
for _piece in (
    _testFR['fab'],
    _testFR['vds'],
    _testFR['chass'],
    _testFR['checkD'],
    _testFR['year'].map(str),
    _testFR['plant'],
    _testFR['seq'].map(str),
):
    _vin = _vin + _piece
_testFR['vin'] = _vin

# Keep only the VIN and the expected variant id
_testFR = _testFR.loc[:, ['vin', 'idvariante']]

# Shuffle the rows and renumber them from 0
_all = _testFR.sample(frac=1)
_all = _all.reset_index(drop=True)

#### VIN Espagnols 

In [None]:
_all = pd.read_csv('testESP.csv')

# regex=True is stated explicitly: since pandas 2.0 Series.str.replace treats
# the pattern as a literal string by default, which would leave every value
# unchanged. Each match of "any char, digits, any char" is replaced by the
# digits alone.
# NOTE(review): presumably this strips a one-character wrapper around the id
# (e.g. "[123]") - confirm against the CSV content.
_all['resultat'] = _all['resultat'].str.replace(r'.([0-9]*).', r'\1', regex=True)
_all = _all.rename(columns={"resultat": "idvariante"})

# Convert the whole column at once instead of value by value
_all['idvariante'] = pd.to_numeric(_all['idvariante'])

# Shuffle the rows and renumber them from 0
_all = _all.sample(frac=1).reset_index(drop=True)

#### VIN Suédois 

In [17]:
_all = pd.read_csv('testSE.csv')

# regex=True is stated explicitly: since pandas 2.0 Series.str.replace treats
# the pattern as a literal string by default, which would leave every value
# unchanged. Each match of "any char, digits, any char" is replaced by the
# digits alone.
# NOTE(review): presumably this strips a one-character wrapper around the id
# (e.g. "[123]") - confirm against the CSV content.
_all['resultat'] = _all['resultat'].str.replace(r'.([0-9]*).', r'\1', regex=True)
_all = _all.rename(columns={"resultat": "idvariante"})

# Convert the whole column at once instead of value by value
_all['idvariante'] = pd.to_numeric(_all['idvariante'])

# Shuffle the rows and renumber them from 0
_all = _all.sample(frac=1).reset_index(drop=True)

##### After choosing a country, run the test

In [110]:
# Keep exactly two columns, VIN first and expected variant id second: the
# test loop reads them by position.
to_test = _all.loc[:, ['vin', 'idvariante']]

In [None]:
import warnings

# NOTE(review): this silences every warning for the rest of the session,
# including deprecation notices from pandas / scikit-learn.
warnings.filterwarnings('ignore')

model_clus, labelenc = model('clus')
cpt = 0        # total number of VINs processed
true = 0       # VINs whose expected variant is the prediction or in its top 5
cpt_san = 0    # VINs whose first-level cluster is not 15
_di = {}       # first-level cluster -> number of wrong predictions
wrong = []     # one-row DataFrames of the VINs that were predicted wrongly
_loaded = {}   # target_class -> (pipeline, label encoder), loaded only once

# At most 20000 rows, but never past the end of the test set
for i in range(min(20000, len(to_test))):
    _in_ = to_test.iloc[i:i+1, [0]]   # one-row DataFrame holding the VIN
    out = int(to_test.iloc[i, 1])     # expected variant id

    # inverse_transform is given a 1-d array (not a scalar), which is the
    # input shape LabelEncoder documents.
    clus = labelenc.inverse_transform(model_clus.predict(_in_)[:1])[0]
    target_class = clus
    n = clus
    cpt += 1

    # NOTE(review): rows predicted as cluster 15 are neither scored nor
    # logged; the reason is not visible in this notebook.
    if n == 15:
        continue
    cpt_san += 1

    # Start from the first-level model so that the top-5 fallback below always
    # uses a model evaluated on THIS VIN, even if the refinement loop does not
    # run (previously it could reuse the model of an earlier row).
    _model, label = model_clus, labelenc

    # Keep descending into sub-models while the predicted label is <= 35.
    # NOTE(review): presumably labels <= 35 are intermediate cluster levels
    # and larger values are variant ids - confirm.
    while n <= 35:
        # Each sub-model is read from disk once and then reused
        if target_class not in _loaded:
            _loaded[target_class] = model(target_class)
        _model, label = _loaded[target_class]

        niv = label.inverse_transform(_model.predict(_in_)[:1])[0]
        target_class = str(target_class) + '_' + str(int(niv))
        n = niv

    if n == out:
        true += 1
        logging.info('{}/{}'.format(_in_.iloc[0, 0], n))
        continue

    # Wrong first guess: accept the row if the expected variant is among the
    # five most probable classes of the last model used.
    proba = dict(zip(_model.classes_, _model.predict_proba(_in_)[0]))
    ranked = sorted(proba.items(), key=operator.itemgetter(1), reverse=True)
    pred_list = list(label.inverse_transform([cls for cls, _ in ranked[:5]]))

    if out in pred_list:
        true += 1
    else:
        wrong.append(_in_)
        display(pred_list)
        # Log the VIN itself, as the success branch does, followed by the
        # five candidates that were tried.
        logging.info('{}/{}'.format(_in_.iloc[0, 0], ','.join(str(p) for p in pred_list)))
        _di[clus] = _di.get(clus, 0) + 1


### Variables importantes à analyser après le test :<br/>
- **wrong** - Contient les VIN dont la prédiction est erronée<br/>
- **true** - Nombre de VIN prédits correctement<br/>
- **_di** - Dictionnaire contenant la répartition du nombre d'erreurs par cluster de variante<br/>
- **cpt** - Nombre total de VIN prédits