In [127]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy as sps
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot #Ajouter plot pour enregistrer un html

init_notebook_mode(connected=True)

traindata = pd.read_csv("input_train.csv", delimiter=",")
output = pd.read_csv("output_train.csv", delimiter=";")
input_test = pd.read_csv("input_test.csv", delimiter=",")
ID = input_test['ID']

In [11]:
output['CLAIM_TYPE'].unique()

array(['DAMAGED', '-', 'NOT_RECEIVED', 'WITHDRAWAL', 'UNDEFINED',
       'SELLER_CANCEL_POSTERIORI', 'DIFFERENT', 'FAKE'], dtype=object)

In [19]:
np.sum(output['CLAIM_TYPE']=="SELLER_CANCEL_POSTERIORI")

13782

In [128]:
dataAll = pd.concat((traindata, input_test), axis=0)

In [85]:
#Visualize Birthday dates
d = [go.Histogram(x=traindata['BUYER_BIRTHDAY_DATE'])]
iplot(d, filename='Birthday dates histogram')

In [129]:
dataAll['SHIPPING_MODE'] = dataAll['SHIPPING_MODE'].fillna('NORMAL')

dataAll['SHIPPING_PRICE'] = dataAll['SHIPPING_PRICE'].map({'<1': 0, '1<5':1, '5<10':2, '10<20':3, '>20': 4})
dataAll['SHIPPING_PRICE'] = dataAll['SHIPPING_PRICE'].fillna(0)

dataAll['WARRANTIES_PRICE'] = dataAll['WARRANTIES_PRICE'].map({'<5': 1, '5<20':2, '20<50':3, '50<100':4, '100<500':5})
dataAll['WARRANTIES_PRICE'] = dataAll['WARRANTIES_PRICE'].fillna(0)

dataAll['PRICECLUB_STATUS'] = dataAll['PRICECLUB_STATUS'].fillna("UNSUBSCRIBED")

dataAll['BUYER_BIRTHDAY_DATE'] = dataAll['BUYER_BIRTHDAY_DATE'].fillna(1977)
dataAll['BUYER_BIRTHDAY_DATE'][(dataAll['BUYER_BIRTHDAY_DATE']<1940) | (dataAll['BUYER_BIRTHDAY_DATE']>2000)] = 1977

dataAll['PURCHASE_COUNT'] = dataAll['PURCHASE_COUNT'].map({'<5':0, '5<20': 1, '20<50':2, '50<100':3, '100<500': 4, '>500':5})

dataAll['BUYING_DATE'] = dataAll['BUYING_DATE'].map({'1/2017':1, '2/2017':2,'3/2017':3,'4/2017':4,'5/2017':5, '6/2017':6, '7/2017':7, '8/2017':8, '9/2017':9, '10/2017':10})


dataAll['SELLER_SCORE_COUNT'] = dataAll['SELLER_SCORE_COUNT'].map({'<100':0, '100<1000':1, '1000<10000':2, '10000<100000':3, '100000<1000000':4})
dataAll['SELLER_SCORE_COUNT'] = dataAll['SELLER_SCORE_COUNT'].fillna(0)

dataAll['SELLER_SCORE_AVERAGE'] = dataAll['SELLER_SCORE_AVERAGE'].fillna(44)

dataAll['ITEM_PRICE'] = dataAll['ITEM_PRICE'].map({'<10':0, '10<20':1, '20<50':2, '50<100':3, '100<500':4, '500<1000':5, '1000<5000':6,'>5000':7})




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



In [130]:
dataAll = pd.concat((dataAll, pd.get_dummies(dataAll['SHIPPING_MODE'], prefix="SHIPPING_MODE"),\
                 pd.get_dummies(dataAll['PRICECLUB_STATUS'], prefix="PRICECLUB_STATUS"),\
                 pd.get_dummies(dataAll['PRODUCT_FAMILY'], prefix="PRODUCT_FAMILY"),\
                 pd.get_dummies(dataAll['PRODUCT_TYPE'], prefix="PRODUCT_TYPE"),\
                 pd.get_dummies(dataAll['SELLER_COUNTRY'], prefix="SELLER_COUNTRY"),\
                 pd.get_dummies(dataAll['SELLER_DEPARTMENT'], prefix="SELLER_DEPARTMENT"),\
                 pd.get_dummies(dataAll['BUYER_DEPARTMENT'], prefix="BUYER_DEPARTMENT")), axis=1)

In [131]:
del dataAll['ID'], dataAll['SELLER_COUNTRY'], dataAll['SELLER_DEPARTMENT'], dataAll['BUYER_DEPARTMENT'],dataAll['PRODUCT_TYPE'], dataAll['SHIPPING_MODE'], dataAll['PRICECLUB_STATUS'], dataAll['PRODUCT_FAMILY']

In [132]:
train = dataAll.iloc[0:100000,0:]
test = dataAll.iloc[100000:,0:]

In [125]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score

In [133]:
xtrain, xtest, ytrain, ytest = train_test_split(train, np.ravel(output['CLAIM_TYPE']))

In [134]:
ytrain2 = ytrain.copy()
ytrain2[ytrain=='-'] = 'NO_CLAIM'
ytrain2[ytrain!='-'] = 'CLAIM'
ytrain3 = ytrain[ytrain!='-']
xtrain3 = xtrain[ytrain!='-']

In [155]:
from functools import lru_cache

@lru_cache(maxsize=2000)
def run_method(method, *args, **kwargs):
    model = method(*args, **kwargs)
    model.fit(xtrain, ytrain)
    pred = model.predict(xtest)
    pred2 = model.predict(test)
    return pred, accuracy_score(ytest, pred)

In [81]:
from sklearn.ensemble import RandomForestClassifier

In [136]:
model = RandomForestClassifier(n_estimators=300, max_depth=80, min_samples_split=3, n_jobs=-1, random_state=0)
model.fit(xtrain, ytrain2)
model2 = RandomForestClassifier(n_estimators=300, max_depth=80, min_samples_split=3, n_jobs=-1, random_state=0)
model2.fit(xtrain3,ytrain3)
pred = model.predict(xtest)
xtest2 = xtest[pred!='NO_CLAIM']
pred2 = model2.predict(xtest2)

In [137]:
pred[pred=='NO_CLAIM'] = '-'

In [138]:
i=0
for l,elt in enumerate(pred):
    if elt!='-':
        pred[l] = pred2[i]
        i+=1

In [139]:
pred
accuracy_score(ytest, pred)

0.51363999999999999

In [140]:
from sklearn.metrics import confusion_matrix

In [148]:
confusion_matrix(ytest, pred, labels=['-', 'DAMAGED', 'NOT_RECEIVED', 'WITHDRAWAL', 'UNDEFINED',
       'SELLER_CANCEL_POSTERIORI', 'DIFFERENT', 'FAKE'])

array([[8471,  338, 1674,  412,  113, 1454,   88,    1],
       [ 536,  287,  316,   72,   47,  183,   21,    0],
       [1079,  125, 1947,  130,   41,  382,   22,    0],
       [ 554,  106,  296,  359,   45,  404,   11,    1],
       [ 261,  111,  212,   78,  185,  167,   10,    0],
       [1229,   70,  378,  148,   17, 1481,   20,    0],
       [ 471,   43,  229,   45,   15,  165,  106,    2],
       [   3,    9,   21,    3,    0,    0,    1,    5]], dtype=int64)

In [156]:
pred, score = run_method(RandomForestClassifier, n_estimators=300, max_depth=80, min_samples_split=3, n_jobs=-1, random_state=0)

In [157]:
score

0.5696

In [158]:
confusion_matrix(ytest, pred, labels=['-', 'DAMAGED', 'NOT_RECEIVED', 'WITHDRAWAL', 'UNDEFINED',
       'SELLER_CANCEL_POSTERIORI', 'DIFFERENT', 'FAKE'])

array([[11547,    92,   456,    67,    22,   346,    19,     2],
       [ 1115,   151,   114,    15,    21,    37,     9,     0],
       [ 2224,    50,  1252,    47,    23,   128,     2,     0],
       [ 1272,    48,   101,   191,    23,   138,     2,     1],
       [  703,    40,    76,    19,   140,    44,     2,     0],
       [ 2246,    21,   130,    44,     7,   883,    12,     0],
       [  856,    12,    81,     9,     6,    41,    70,     1],
       [   13,     6,    16,     1,     0,     0,     0,     6]], dtype=int64)

In [43]:
df = pd.DataFrame(pred, columns=['CLAIM_TYPE'])
df = pd.concat((ID, df), axis=1)

In [100]:
df.to_csv('output_test.csv', sep=';', columns=['ID', 'CLAIM_TYPE'], index=False) #Ecrire le fichier

In [51]:
run_method.cache_clear()