In [1]:
import numpy as np
import pandas as pd
import random
from sklearn.preprocessing import LabelEncoder

In [2]:
#loading train_data and divide them into different array: alertkey, custinfo, dp_5_arr
train_data = np.load("train_set_1.npy",allow_pickle = True)

alert_key_arr = train_data[:,0]

custinfo_arr = train_data[:,1]
custinfo_arr = np.array([np.array(i) for i in custinfo_arr]) #convert all into array

dp_5_arr = train_data[:,2]
dp_5_arr = np.array([np.array(i) for i in dp_5_arr]) #convert all into array

sarflag_arr = pd.read_csv("train_y_answer.csv")["sar_flag"].values
sarflag_arr_double = np.array([[0,1] if label == 1 else [1,0] for label in sarflag_arr])

In [3]:
#shuffle the data and do train test split

#generate random_indexes(23906)
random.seed(10)
shuffle_index = [i for i in range(len(alert_key_arr))]
random.shuffle(shuffle_index)

#shuffle all the data
shuffle_alert_key = alert_key_arr[shuffle_index]
shuffle_custinfo = custinfo_arr[shuffle_index]
shuffle_dp_5 = dp_5_arr[shuffle_index]
shuffle_sarflag = sarflag_arr_double[shuffle_index]
print(shuffle_index[:10])

print(shuffle_custinfo.shape)

[1482, 610, 8937, 410, 18718, 6198, 18647, 23110, 7603, 1065]
(23906, 42)


In [4]:
test_data = np.load("public_set_1.npy",allow_pickle = True)
print(test_data)
alert_key_arr = test_data[:,0]

custinfo_arr = test_data[:,1]
test_custinfo = np.array([np.array(i) for i in custinfo_arr]) #convert all into array

dp_5_arr = test_data[:,2]
test_dp_5 = np.array([np.array(i) for i in dp_5_arr]) #convert all into array
print(test_custinfo.shape)

[[352342
  array([0.        , 1.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 1.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 1.        , 0.        , 0.        ,
       0.        , 0.98920194, 0.08979592, 0.        , 0.        ,
       0.02967432, 0.        ])
  array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 1.        , 0.        ,
        1.        , 0.        , 1.        , 0.        , 0.        ,
        0. 

In [5]:
#calculate class weight:
neg = 0
pos = 0
total = len(shuffle_sarflag)

for label in shuffle_sarflag:
    if label[0] == 1:
        neg += 1
    else:
        pos += 1

weight_for_0 = (1 / neg) * (total / 2.0)
weight_for_1 = (1 / pos) * (total / 2.0)/(3/2)

class_weight = {0: weight_for_0, 1: weight_for_1}

print(class_weight)

{0: 0.5049425481581615, 1: 34.054131054131055}


In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pandas as pd
import math
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Dropout
from tensorflow.keras import Input
#https://keras.io/api/layers/core_layers/input/
from tensorflow.keras.layers import concatenate

In [7]:
METRICS = [
      tf.keras.metrics.Recall(name='recall'),
]


In [8]:
def model_phase():
    print(shuffle_custinfo.shape)
    #define input_shape
    custinfo_input = Input(shape=(shuffle_custinfo.shape[1]))
    dp_input = Input(shape=shuffle_dp_5.shape[1:])
    
    dp_embedding = LSTM(shuffle_dp_5.shape[2],activation="relu")(dp_input)
    dp_embedding = Dropout(0.5)(dp_embedding)
    
    concat_embedding = concatenate([custinfo_input,dp_embedding])
    
    out64 = Dense(64, activation="relu")(concat_embedding)
    
    #out32 = Dense(32, activation="relu")(out64)
    
    out16 = Dense(16, activation="relu")(out64)
    
    #out8 = Dense(8, activation="relu")(out16)
    
    #out4 = Dense(4, activation="relu")(out8)
    
    out2 = Dense(2, activation="softmax")(out16)

    model = Model(inputs=[custinfo_input, dp_input], outputs=out2)
    model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
            optimizer='adam',
            metrics=METRICS)
    print(model.summary())
    #callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=20)
    # 進行訓練
    history = model.fit([shuffle_custinfo,shuffle_dp_5], 
                        shuffle_sarflag, 
                        epochs = 5, 
                        batch_size = 64,
                        class_weight=class_weight)
                        #callbacks=[callback],
                        #validation_data=([test_custinfo,test_dp_5], test_sarflag))  

    
    
    y_pred = model.predict([test_custinfo,test_dp_5])
    
    return y_pred

In [9]:
pred = model_phase()

(23906, 42)
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 5, 89)]      0                                            
__________________________________________________________________________________________________
lstm (LSTM)                     (None, 89)           63724       input_2[0][0]                    
__________________________________________________________________________________________________
input_1 (InputLayer)            [(None, 42)]         0                                            
__________________________________________________________________________________________________
dropout (Dropout)               (None, 89)           0           lstm[0][0]                       
__________________________________________________________________________________

In [10]:
ans_csv = pd.DataFrame([])
ans_csv["alert_key"] = test_data[:,0]
display(ans_csv)
ans_csv["probability"] = pred[:,1]
display(ans_csv)
ans_csv.to_csv("ans_csv1204.csv",index=False)

handout_data = pd.read_csv("D:\Python_projects\DataMining\yushan\預測的案件名單及提交檔案範例.csv")
handout_data = handout_data.merge(ans_csv,on="alert_key",how="left")
display(handout_data)
handout_data =  handout_data.drop(columns="probability_x")
handout_data = handout_data.rename(columns={"probability_y":"probability"})
handout_data = handout_data.fillna(0)
display(handout_data)
handout_data.to_csv("handout_dataVer4.csv",index=False)


Unnamed: 0,alert_key
0,352342
1,352866
2,352696
3,352330
4,352683
...,...
1840,364472
1841,364788
1842,364673
1843,364626


Unnamed: 0,alert_key,probability
0,352342,8.493055e-06
1,352866,1.507370e-11
2,352696,1.241570e-01
3,352330,8.416706e-06
4,352683,6.008185e-01
...,...,...
1840,364472,5.707369e-01
1841,364788,7.887495e-02
1842,364673,6.990023e-01
1843,364626,6.352881e-01


Unnamed: 0,alert_key,probability_x,probability_y
0,357307,0.000017,9.657728e-02
1,376329,0.000324,
2,373644,0.000372,
3,357668,0.000489,9.501728e-07
4,354443,0.000526,1.392722e-03
...,...,...,...
3845,364485,0.997702,2.811011e-01
3846,363155,0.998987,1.329330e-01
3847,368710,0.999694,
3848,358067,0.999821,2.323399e-01


Unnamed: 0,alert_key,probability
0,357307,9.657728e-02
1,376329,0.000000e+00
2,373644,0.000000e+00
3,357668,9.501728e-07
4,354443,1.392722e-03
...,...,...
3845,364485,2.811011e-01
3846,363155,1.329330e-01
3847,368710,0.000000e+00
3848,358067,2.323399e-01


In [11]:
from sklearn.metrics import classification_report

y_pred = [0 if label[0]> label[1] else 1 for label in pred]
y_true = [0 if label[0]> label[1] else 1 for label in test_sarflag]

print(classification_report(y_true,y_pred))
print(len(pred))

y_pred_index = [i for i in range(len(y_pred)) if y_pred[i]==1]
y_true_index = [i for i in range(len(y_true)) if y_true[i]==1]
print("predict index:",y_pred_index)
print("true index:",y_true_index)

NameError: name 'test_sarflag' is not defined