In [1]:
import tensorflow as tf
from tensorflow import keras

import os
import tempfile

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import sklearn
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Fixed seed for the train/validation split so that a fresh
# "Restart Kernel -> Run All" reproduces the same partition (and therefore the
# same class counts and validation metrics further down).
SPLIT_SEED = 42

# Load the two CSV files used by the rest of the notebook and report their shapes.
train = pd.read_csv('tr_FE.csv')
print(train.shape)
test = pd.read_csv('ts_FE.csv')
print(test.shape)

# Hold out 20% of the training rows for validation.
train_df, val_df = train_test_split(train, test_size=0.2, random_state=SPLIT_SEED)

(2021448, 78)
(4577464, 78)


In [3]:
# Separate the target from the model inputs for both splits.
# 'click' is the label; 'device_ip' and 'Unnamed: 0' are excluded from the
# feature matrix as well.
y_train = train_df['click']
X_train = train_df.drop(columns=['click', 'device_ip', 'Unnamed: 0'])

y_val = val_df['click']
X_val = val_df.drop(columns=['click', 'device_ip', 'Unnamed: 0'])

In [4]:
# Build the training model: metrics reported by Keras during fit/evaluate.
# Only binary cross-entropy is tracked. It is computed on probabilities
# (from_logits=False) and without label smoothing.
METRICS = [
    keras.metrics.BinaryCrossentropy(
        name='binary_crossentropy',
        dtype=None,
        from_logits=False,
        label_smoothing=0,
    ),
]



def make_model(metrics=METRICS, output_bias=None, n_features=None):
  """Build and compile the feed-forward binary classifier.

  The network is Dense(16) -> Dense(128) -> Dense(128) -> Dense(16), each
  followed by Dropout(0.5), and ends in a single sigmoid unit. It is compiled
  with Adam (learning rate 0.001) and binary cross-entropy loss.

  Args:
    metrics: list of Keras metrics passed to `model.compile`.
    output_bias: optional initial value for the bias of the sigmoid output
      unit. When None, the bias starts at zero (the Keras default for Dense).
    n_features: number of input features. When None, it is taken from the
      notebook-level `X_train` (its last dimension), as before.

  Returns:
    The compiled `keras.Sequential` model.
  """
  if n_features is None:
    n_features = X_train.shape[-1]

  # Actually wire the requested bias into the output layer; previously the
  # initializer was built but never passed on, so `output_bias` had no effect.
  if output_bias is not None:
    bias_initializer = tf.keras.initializers.Constant(output_bias)
  else:
    bias_initializer = 'zeros'

  model = keras.Sequential([
      keras.layers.Dense(
          16, activation='relu',
          input_shape=(n_features,), kernel_initializer='he_normal'),
      keras.layers.Dropout(0.5),
      keras.layers.Dense(128, activation='relu', kernel_initializer='he_normal'),
      keras.layers.Dropout(0.5),
      keras.layers.Dense(128, activation='relu', kernel_initializer='he_normal'),
      keras.layers.Dropout(0.5),
      keras.layers.Dense(16, activation='relu', kernel_initializer='he_normal'),
      keras.layers.Dropout(0.5),
      keras.layers.Dense(1, activation='sigmoid', kernel_initializer='he_normal',
                         bias_initializer=bias_initializer),
  ])

  model.compile(
      # `learning_rate` is the supported argument name; `lr` is a deprecated
      # alias that newer Keras versions reject.
      optimizer=keras.optimizers.Adam(learning_rate=0.001),
      loss=keras.losses.BinaryCrossentropy(),
      metrics=metrics)

  return model

In [5]:
# Inspect the ratio of positive to negative samples in the training labels.
# np.bincount returns the count of label 0 first and label 1 second; the
# two-way unpacking therefore requires the labels to be exactly 0/1.
class_counts = np.bincount(y_train)
neg, pos = class_counts
total = neg + pos

summary_template = 'Examples:\n    Total: {}\n    Positive: {} ({:.2f}% of total)\n'
positive_pct = 100 * pos / total
print(summary_template.format(total, pos, positive_pct))

Examples:
    Total: 1617158
    Positive: 274482 (16.97% of total)



In [6]:
# Initial bias (a logit) later handed to make_model(output_bias=...).
# log(1) = 0, i.e. the output unit starts at sigmoid(0) = 0.5.
# The class-prior alternative is np.log([pos / neg]), which starts the output
# at the observed positive rate instead; it is not used here.
initial_bias = np.log([1])
initial_bias

array([0.])

In [7]:
# Training hyper-parameters.
EPOCHS = 5
BATCH_SIZE = 32

# Stop when the validation loss has not improved for 3 epochs and roll back
# to the best weights. A loss is "better" when it is lower, so the
# callback must run in mode='min' (mode='max' would treat a rising loss as
# improvement). The callback only takes effect when passed to
# model.fit(..., callbacks=[early_stopping]).
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    verbose=1,
    patience=3,
    mode='min',
    restore_best_weights=True)

# Build the network with the chosen initial output bias and show its layers.
model = make_model(output_bias=initial_bias)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 16)                1216      
_________________________________________________________________
dropout (Dropout)            (None, 16)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               2176      
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 128)               16512     
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 16)                2

In [None]:
# Train on the training split; Keras evaluates on the validation split at the
# end of every epoch. No callbacks and no class weights are passed here.
history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_val, y_val),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
)

Epoch 1/5

In [None]:
# Feature matrix for the test set: drop the same columns as for training.
X_test = test.drop(['click', 'device_ip', 'Unnamed: 0'], axis=1)

# The raw test file is needed only for its 'id' column, so read just that
# column. Ids are parsed as strings.
test_id = pd.read_csv('./test', dtype={'id': 'str'}, usecols=['id'])

# Predict click probabilities. The row count is inferred (-1) instead of
# being hard-coded, so the cell keeps working if the test set size changes.
test_labels = model.predict(X_test).reshape(1, -1)

# Build the submission frame.
s_id = test_id['id'].astype('str')
# Probabilities are stored as float64 in the result frame.
s_click = pd.Series(test_labels[0].astype('float64'))

# Fail loudly if ids and predictions do not line up; pairing them with zip()
# would silently truncate to the shorter of the two.
if len(s_id) != len(s_click):
    raise ValueError(
        'id/prediction length mismatch: {} ids vs {} predictions'.format(
            len(s_id), len(s_click)))

# Use the underlying arrays so the columns are paired by position.
result = pd.DataFrame({'id': s_id.to_numpy(), 'click': s_click.to_numpy()})
result.head()

In [None]:
# Write the submission file without the DataFrame index column.
# `index` is a boolean flag, so pass False explicitly instead of relying on
# None being falsy.
result.to_csv('./Result', index=False)