In [127]:
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
from keras.models import Sequential
from keras.layers import Dense, Dropout, Input, concatenate , LSTM
from keras import Model
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix , f1_score
from keras.callbacks import EarlyStopping
import xgboost as xgb
from xgboost import XGBClassifier
from openpyxl import load_workbook


In [2]:
def apply_proba_threshold(y_positive_proba_pred, threshold):
    apply_threshold = np.vectorize(lambda x:0 if x< threshold else 1)
    return apply_threshold(y_positive_proba_pred)
def optimize_binary_threshold(y_proba_pred, y_true, thresholds, metric = 'f1'):
    best_score = 0
    best_thresh = 0.1
    
    for thresh in thresholds : 
        y_pred = apply_proba_threshold(y_proba_pred, thresh)
        
        score = f1_score(y_pred = y_pred,y_true = y_true)
        
            
        if score > best_score:
            best_score = score
            best_thresh = thresh
    
    return best_thresh

In [3]:
### lecture des données ####
train = pd.read_csv("Data/train.csv")
test = pd.read_csv("Data/test.csv")

In [4]:
### embedding ###
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
print('embedding chargé')

embedding chargé


In [35]:
xtrain = train[['text','keyword','location']]
ytrain = train['target']
xtest = test[['text','keyword','location']]

In [18]:
dict_keyword = {}
j=0
for i in set(train['keyword']):
    dict_keyword[i] = j
    j+=1
dict_location = {}
j=0
for i in set(train['location']):
    dict_location[i] = j
    j+=1

In [6]:
### caractéristiques des données ###
print("Colonnes : ",list(train.columns))
print("Nombre de lignes du set de train : ", len(train))
print("Nombre de lignes du set de test : ", len(test))
print("Les classes sont : ",set(train['target']))

Colonnes :  ['id', 'keyword', 'location', 'text', 'target']
Nombre de lignes du set de train :  7613
Nombre de lignes du set de test :  3263
Les classes sont :  {0, 1}


In [36]:
### Split train into train and validation ###
xtrain, xvalid, ytrain, yvalid = train_test_split(xtrain, ytrain, test_size = 0.3)
xtrain_text = np.array(embed(xtrain['text']))
xtrain_location = np.array([dict_location[i] for i in xtrain['location']])
xtrain_keyword = np.array([dict_keyword[i] for i in xtrain['keyword']])

xvalid_text = np.array(embed(xvalid['text']))
xvalid_location = np.array([dict_location[i] for i in xvalid['location']])
xvalid_keyword = np.array([dict_keyword[i] for i in xvalid['keyword']]) 

xtest_text = np.array(embed(xtest['text']))
xtest_location = []
xtest_keyword = []

for i in xtest['location']:
    if i in dict_location:
        xtest_location.append(dict_location[i])
    else:
        xtest_location.append(0)

for i in xtest['keyword']:
    if i in dict_keyword:
        xtest_keyword.append(dict_keyword[i])
    else:
        xtest_keyword.append(0)
        
xtest_location = np.array(xtest_location)
xtest_keyword = np.array(xtest_keyword)

In [None]:
xtrain_xgb = []
for i in range(len(xtrain_text)):
    m = list(xtrain_text[i])
    m.append(xtrain_location[i])
    m.append(xtrain_keyword[i])
    xtrain_xgb.append(m)
xtrain_xgb=np.array(xtrain_xgb)

xvalid_xgb = []
for i in range(len(xvalid_text)):
    m = list(xvalid_text[i])
    m.append(xvalid_location[i])
    m.append(xvalid_keyword[i])
    xvalid_xgb.append(m)
xvalid_xgb=np.array(xvalid_xgb)

In [125]:
### xgboost model learning####
model = XGBClassifier(learning_rate = 0.20, n_estimators=250, max_depth=7)
model.fit(xtrain_xgb, ytrain)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.2, max_delta_step=0, max_depth=7,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=250, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [126]:
### xgboost model predict####


y_pred = model.predict(xvalid_xgb)
CM = confusion_matrix(y_pred = y_pred,y_true = yvalid)
f1 = f1_score(y_pred = y_pred,y_true = yvalid)
print(CM)
print(f1)

[[1165  152]
 [ 263  704]]
0.7723532638507954


In [147]:
es = EarlyStopping(monitor='val_loss',mode='min',verbose=1,patience=20)
cb_list=[es]
def base_line(n1,n2):
    inputs = Input(shape=(512,))
    
    first_layer = Dense(n1, activation='sigmoid')(inputs)
    
    first_layer_bis = Dense(n1, activation='relu')(inputs)
    
    inn = concatenate([first_layer,first_layer_bis,inputs])
    
    second_layer = Dense(n2, activation='sigmoid')(inn)
    
    second_layer_bis = Dense(n2, activation='sigmoid')(second_layer)
    
    out = Dense(1,activation='sigmoid')(second_layer_bis)
    
    loss_f = 'binary_crossentropy'
    
    model = Model(inputs=[inputs], outputs=[out])
    
    model.compile(optimizer='adam',loss=[loss_f],metrics=['accuracy'])
    
    model.fit(xtrain,ytrain, epochs=200,batch_size=64,callbacks=cb_list,validation_split=0.1)
    
    return model
def b():
    inputs_text = Input(shape=(512,))
    
    inputs_keyword = Input(shape=(1,))
    
    inputs_location = Input(shape=(1,))

    
    
    first_layer_text = Dense(150, activation='sigmoid')(inputs_text)
    
    first_layer_keyword = Dense(100, activation='sigmoid')(inputs_keyword)
    
    first_layer_location = Dense(100, activation='sigmoid')(inputs_location)
    
    
    pre_out = concatenate([first_layer_text,first_layer_keyword,first_layer_location])
    
    pre_out_out = Dense(100, activation='sigmoid')(pre_out)
    
    out = Dense(1,activation='sigmoid')(pre_out_out)
    
    loss_f = 'binary_crossentropy'
    
    model = Model(inputs=[inputs_text,inputs_keyword,inputs_location], outputs=[out])
    
    model.compile(optimizer='adam',loss=[loss_f],metrics=['accuracy'])
    
    model.fit([xtrain_text,xtrain_location,xtrain_keyword],ytrain, epochs=200,batch_size=64,callbacks=cb_list,validation_split=0.1)
    
    return model

In [148]:
model = b()
ypred = model.predict([xvalid_text,xvalid_location,xvalid_keyword])


ValueError: Layer dense_85 was called with an input that isn't a symbolic tensor. Received type: <class 'keras.layers.recurrent.LSTM'>. Full input: [<keras.layers.recurrent.LSTM object at 0x0000014841A64688>]. All inputs to the layer should be tensors.

In [142]:
thresholds = [0.1+i*0.001 for i in range(900)]
print(max(thresholds))
thresh = optimize_binary_threshold(ypred, yvalid, thresholds, metric = 'f1')
print(thresh)
y_pred = apply_proba_threshold(ypred, thresh)
CM = confusion_matrix(y_pred = y_pred,y_true = yvalid)
f1 = f1_score(y_pred = y_pred,y_true = yvalid)
print(CM)
print(f1)

0.999
0.394
[[1149  168]
 [ 248  719]]
0.7756202804746495


In [118]:
idd = test['id']
predictions = model.predict(xtest)

In [92]:
### pred for submission ###
idd = test['id']
predict = model.predict([xtest_text,xtest_location,xtest_keyword])
predictions = apply_proba_threshold(predict, thresh)

In [93]:
### save data ####
path = 'Submission/'
file_name = 'SubmissionV0.csv'
df = pd.DataFrame(idd)
df['target'] = predictions
print(df)
df.to_csv(path+file_name,index=False)

         id  target
0         0       1
1         2       1
2         3       1
3         9       1
4        11       1
...     ...     ...
3258  10861       1
3259  10865       1
3260  10868       1
3261  10874       1
3262  10875       1

[3263 rows x 2 columns]
