In [None]:
import warnings
# Install a global "ignore" filter: every warning raised after this point in the
# session is suppressed, including deprecation and pandas chained-assignment notices.
warnings.simplefilter('ignore')

import os
import re
import gc

import numpy as np
import pandas as pd
# Use fully qualified option keys. The short forms ('max_columns', 'max_rows') are
# treated as patterns and become ambiguous on pandas versions that also define
# 'styler.render.max_columns' / 'styler.render.max_rows', raising OptionError.
pd.set_option('display.max_columns', None)  # never truncate columns when displaying
pd.set_option('display.max_rows', 200)      # show at most 200 rows before truncating

from tqdm.notebook import tqdm

from sklearn.preprocessing import LabelEncoder, MinMaxScaler, minmax_scale
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import auc, roc_auc_score

from deepctr.models import DeepFM
from deepctr.feature_column import SparseFeat, DenseFeat, get_feature_names

In [None]:
# Read the training and test CSVs of the recommendation task.
train_data = pd.read_csv('Sohu2022_data/rec_data/train-dataset.csv')
test_data = pd.read_csv('Sohu2022_data/rec_data/test-dataset.csv')

# Print both (rows, columns) shapes, then preview the first rows of each frame.
print(train_data.shape, test_data.shape)
display(train_data.head(), test_data.head())

In [None]:
# Number of training rows per distinct value of the `label` column.
train_data['label'].value_counts()

In [None]:
# Stack training rows on top of test rows; each frame keeps its original index values.
data = pd.concat(objs=[train_data, test_data], axis=0)
print(data.shape)

In [None]:
# Load the feature table stored in senti_feats.csv and show it as the cell output.
senti_feats = pd.read_csv(filepath_or_buffer='senti_feats.csv')
senti_feats

In [None]:
# Left-join the senti_feats columns onto every row, matching data.itemId to senti_feats.id.
# Rows whose itemId has no match keep NaN in the joined columns.
data = pd.merge(
    left=data,
    right=senti_feats,
    how='left',
    left_on='itemId',
    right_on='id',
)
data

In [None]:
# How many rows have a missing `entity_count` value after the left join.
data.entity_count.isna().sum()

In [None]:
# Replace each listed column with integer codes 0..n_classes-1.
# A fresh encoder is fitted per column over all rows of `data`.
for feat in ('pvId', 'suv', 'itemId',
             'operator', 'browserType', 'deviceType', 'osType',
             'province', 'city'):
    lbe = LabelEncoder().fit(data[feat])
    data[feat] = lbe.transform(data[feat])

In [None]:
# Build some count / cardinality statistics, broadcast back to every row via transform().

# Statistics per pvId: row count, number of distinct items, number of distinct suv values.
pv_groups = data.groupby('pvId')
data['pvid_count'] = pv_groups['itemId'].transform('count')
data['pvid_item_nunique'] = pv_groups['itemId'].transform('nunique')
data['pvid_suv_nunique'] = pv_groups['suv'].transform('nunique')

# Statistics per itemId: row count and number of distinct suv values.
# The suv cardinality must be grouped by itemId; grouping it by pvId would make
# item_suv_nunique an exact duplicate of pvid_suv_nunique.
item_groups = data.groupby('itemId')
data['item_count'] = item_groups['itemId'].transform('count')
data['item_suv_nunique'] = item_groups['suv'].transform('nunique')

In [None]:
# Categorical columns that are fed to the model as embedded (sparse) features.
sparse_features = [
    'pvId', 'suv', 'itemId',
    'operator', 'browserType', 'deviceType', 'osType',
    'province', 'city',
]

# Numeric (dense) features: the engineered count statistics first ...
dense_features = [
    'pvid_count', 'pvid_item_nunique', 'pvid_suv_nunique',
    'item_count', 'item_suv_nunique',
]
# ... then senti_0..senti_4, each with max / min / mean / std, in that order ...
dense_features += [
    f'senti_{idx}_{stat}'
    for idx in range(5)
    for stat in ('max', 'min', 'mean', 'std')
]
# ... and finally the entity count.
dense_features.append('entity_count')

# Name of the target column, kept as a list so that frame[target] stays two-dimensional.
target = ['label']

In [None]:
# Replace missing values in the dense feature columns with 0.
data[dense_features] = data[dense_features].fillna(value=0)

In [None]:
# Rescale every dense feature to [0, 1]; the min/max are learned from all rows of `data`.
mms = MinMaxScaler(feature_range=(0, 1))
mms.fit(data[dense_features])
data[dense_features] = mms.transform(data[dense_features])

In [None]:
# One embedded column per sparse feature. The columns hold label-encoded codes starting
# at 0, so the vocabulary size is the largest code plus one (cast to a plain Python int
# rather than passing a NumPy scalar).
sparse_feature_columns = [
    SparseFeat(feat, vocabulary_size=int(data[feat].max()) + 1, embedding_dim=16)
    for feat in sparse_features
]

# One scalar (dimension 1) column per dense feature.
dense_feature_columns = [DenseFeat(feat, 1) for feat in dense_features]

# Sparse columns first, then dense columns.
fixlen_feature_columns = sparse_feature_columns + dense_feature_columns

In [None]:
# The linear part and the DNN part of the model share the same feature columns.
linear_feature_columns = dnn_feature_columns = fixlen_feature_columns

# Names of the model inputs derived from the combined column list.
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

In [None]:
# Remove the `userSeq` and `logTs` columns from `data` in place, then preview the result.
data.drop(columns=['userSeq', 'logTs'], inplace=True)
display(data.head())

In [None]:
# Rows with a label are training rows; rows whose label is NaN are test rows.
has_label = data['label'].notna()

# Build new frames with a non-inplace drop instead of mutating boolean-filtered slices
# of `data` in place, which is the pattern behind pandas' SettingWithCopyWarning.
train_data = data[has_label].drop(columns=['testSampleId'])
test_data = data[~has_label].drop(columns=['sampleId', 'label'])

print(train_data.shape, test_data.shape)

In [None]:
# Drop the reference to the combined frame and force a garbage-collection pass.
del data
gc.collect()

In [None]:
# Preview the first rows of the training frame and of the test frame.
display(train_data.head(), test_data.head())

In [None]:
# Data split: the first 80% of the training rows (in their current order) are used
# for fitting, the remaining 20% for validation. No shuffling is applied.
split_at = int(train_data.shape[0] * 0.8)
train = train_data.iloc[:split_at]
valid = train_data.iloc[split_at:]

# Work on an independent copy of the test frame.
test = test_data.copy()

train.shape, valid.shape, test.shape

In [None]:
def _as_model_input(frame):
    """Return a dict mapping each name in `feature_names` to that column of `frame`."""
    return {name: frame[name] for name in feature_names}


train_model_input = _as_model_input(train)
valid_model_input = _as_model_input(valid)
test_model_input = _as_model_input(test)

In [None]:
# DeepFM configured for binary classification over the shared feature columns.
model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary')

# Adam optimizer with log-loss; log-loss and accuracy are also tracked as metrics.
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=['binary_crossentropy', 'accuracy'],
)

In [None]:
# Label arrays for both splits; selecting with the `target` list yields a 2-D array.
train_labels = train[target].values
valid_labels = valid[target].values

# Train for 3 epochs with batches of 256 rows, evaluating on the validation split.
history = model.fit(
    x=train_model_input,
    y=train_labels,
    batch_size=256,
    epochs=3,
    verbose=1,
    validation_data=(valid_model_input, valid_labels),
)

In [None]:
# Score the validation split and report ROC AUC rounded to four decimals.
pred_ans = model.predict(valid_model_input, batch_size=256)
valid_auc = roc_auc_score(valid[target].values, pred_ans)
print("valid AUC", round(valid_auc, 4))

In [None]:
# Predict on the test inputs.
pred_ans = model.predict(test_model_input, batch_size=256)

# Load the tab-separated submission template, overwrite its `result` column with the
# predictions (row order of the template is relied upon), and write it out without the index.
sub = pd.read_csv(filepath_or_buffer='submission_orig/section2.txt', sep='\t')
sub['result'] = pred_ans
sub.to_csv(path_or_buf='section2.txt', sep='\t', index=False)