In [2]:
import pandas as pd 
import numpy as np 
import random

# Seed NumPy's legacy global RNG and Python's `random` module with the same
# value so that shuffles further down are repeatable from a fresh kernel.
for _seed_fn in (np.random.seed, random.seed):
    _seed_fn(42)

In [3]:
# Load the labelled training data from the Kaggle input directory and show
# its row count plus the first few rows.
train_path = "/kaggle/input/xunfei-dataset/train.csv"
train_df = pd.read_csv(train_path)
print("len(train_df):" + str(len(train_df)))
train_df.head()

len(train_df):620356


Unnamed: 0,uuid,eid,udmap,common_ts,x1,x2,x3,x4,x5,x6,x7,x8,target
0,0,26,"{""key3"":""67804"",""key2"":""650""}",1689673468244,4,0,41,107,206,1,0,1,0
1,1,26,"{""key3"":""67804"",""key2"":""484""}",1689082941469,4,0,41,24,283,4,8,1,0
2,2,8,unknown,1689407393040,4,0,41,71,288,4,7,1,0
3,3,11,unknown,1689467815688,1,3,41,17,366,1,6,1,0
4,4,26,"{""key3"":""67804"",""key2"":""650""}",1689491751442,0,3,41,92,383,4,8,1,0


In [4]:
# Load the unlabelled test data from the Kaggle input directory and show its
# row count plus the first few rows.
test_path = "/kaggle/input/xunfei-dataset/test.csv"
test_df = pd.read_csv(test_path)
print("len(test_df):" + str(len(test_df)))
test_df.head()

len(test_df):206785


Unnamed: 0,uuid,eid,udmap,common_ts,x1,x2,x3,x4,x5,x6,x7,x8
0,0,11,unknown,1689594441029,4,1,41,85,343,4,8,1
1,1,35,unknown,1689551329947,4,0,41,24,283,1,6,1
2,2,34,"{""key3"":""73457"",""key2"":""936""}",1688965066999,4,2,41,71,288,4,2,0
3,3,0,"{""key3"":""18771""}",1689308623754,1,0,41,104,37,4,8,1
4,4,26,"{""key3"":""67804"",""key2"":""650""}",1689516018904,0,1,41,115,217,4,8,1


In [5]:
# Stack the train and test rows into one frame so that every feature
# transformation below is applied to both in a single pass. The identifier
# column 'uuid' and the raw 'udmap' string are not used as features and are
# dropped. Test rows have no label, so their 'target' is NaN after the concat.
total_df = (
    pd.concat([train_df, test_df], axis=0)
    .drop(columns=['uuid', 'udmap'])
)

print("len(total_df):{}".format(len(total_df)))
total_df.head()

len(total_df):827141


Unnamed: 0,eid,common_ts,x1,x2,x3,x4,x5,x6,x7,x8,target
0,26,1689673468244,4,0,41,107,206,1,0,1,0.0
1,26,1689082941469,4,0,41,24,283,4,8,1,0.0
2,8,1689407393040,4,0,41,71,288,4,7,1,0.0
3,11,1689467815688,1,3,41,17,366,1,6,1,0.0
4,26,1689491751442,0,3,41,92,383,4,8,1,0.0


In [6]:
# Convert the millisecond Unix timestamp into a fractional position inside a
# fixed 365-day cycle (31,536,000 seconds).
# NOTE(review): the cycle is counted from the Unix epoch and ignores leap
# days, so for these 2023 timestamps it starts on 2022-12-19 00:00 UTC rather
# than on 1 January. The result is therefore "progress through 2023" shifted
# by a constant 13 days, not the exact calendar progress.
# This cell overwrites 'common_ts' in place; run it only once per kernel.
seconds_per_year = 365 * 24 * 60 * 60
ts_seconds = total_df['common_ts'] / 1000
total_df['common_ts'] = (ts_seconds % seconds_per_year) / seconds_per_year
total_df.head()

Unnamed: 0,eid,common_ts,x1,x2,x3,x4,x5,x6,x7,x8,target
0,26,0.579194,4,0,41,107,206,1,0,1,0.0
1,26,0.560469,4,0,41,24,283,4,8,1,0.0
2,8,0.570757,4,0,41,71,288,4,7,1,0.0
3,11,0.572673,1,3,41,17,366,1,6,1,0.0
4,26,0.573432,0,3,41,92,383,4,8,1,0.0


In [7]:
# Keep only the columns that are at least weakly linearly related to the label.
# Pearson correlation: +1 is perfectly positive, -1 perfectly negative, and
# values close to 0 indicate no clear linear relationship.
# The 'target' row is looked up by label instead of by position so the
# selection stays correct even if 'target' is not the last column.
# pandas excludes NaN pairs, so only the labelled (train) rows contribute.
pearson = total_df.corr(method='pearson').loc['target'].values
# Positions of the columns whose |r| with the label reaches the 0.01 cut-off;
# 'target' itself (r = 1) is always among them.
choose1 = np.where(np.abs(pearson) >= 0.01)[0]
print(f"len(choose1):{len(choose1)},choose1:{choose1}")
choose = total_df.columns.values[choose1]
total_df = total_df[choose]
total_df.head()

len(choose1):9,choose1:[ 0  1  3  5  6  7  8  9 10]


Unnamed: 0,eid,common_ts,x2,x4,x5,x6,x7,x8,target
0,26,0.579194,0,107,206,1,0,1,0.0
1,26,0.560469,0,24,283,4,8,1,0.0
2,8,0.570757,0,71,288,4,7,1,0.0
3,11,0.572673,3,17,366,1,6,1,0.0
4,26,0.573432,3,92,383,4,8,1,0.0


In [8]:
# Summary statistics for every column, used here to read off the mean and the
# standard deviation of 'common_ts' (describe() reports `std`, not variance).
total_df.describe()

Unnamed: 0,eid,common_ts,x2,x4,x5,x6,x7,x8,target
count,827141.0,827141.0,827141.0,827141.0,827141.0,827141.0,827141.0,827141.0,620356.0
mean,22.150853,0.567872,1.105287,82.89957,224.947866,2.902127,5.864469,0.855634,0.140566
std,12.139231,0.008717,1.173478,44.115095,114.293439,1.444678,2.576408,0.351461,0.347574
min,0.0,0.538215,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,11.0,0.560602,0.0,51.0,133.0,1.0,6.0,1.0,0.0
50%,26.0,0.569755,1.0,86.0,241.0,4.0,7.0,1.0,0.0
75%,34.0,0.575678,2.0,107.0,313.0,4.0,7.0,1.0,0.0
max,42.0,0.579908,3.0,151.0,413.0,4.0,9.0,1.0,1.0


In [9]:
# List the distinct 'eid' codes present across train and test, in sorted order.
np.unique(total_df['eid'].to_numpy())

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42])

In [10]:
# Target-encode 'eid': the mean label of each 'eid' value in the labelled
# training data becomes a new numeric feature.
# NOTE(review): the means use every labelled row, including the rows that the
# later hold-out split assigns to validation, so validation scores for this
# feature are optimistic. Confirm whether that is acceptable.
eid_target = (
    train_df.groupby('eid')['target']
    .mean()
    .rename('eid_target')
    .reset_index()
)
# Keep the plain arrays available under the names the notebook already uses.
eid = eid_target['eid'].values
target = eid_target['eid_target'].values
eid_target.head()

Unnamed: 0,eid,eid_target
0,0,0.069281
1,1,0.485054
2,2,0.139414
3,3,0.35264
4,4,0.486146


In [11]:
# Attach the per-'eid' mean label to every row. A left join keeps all rows of
# total_df, train and test alike.
# Run this cell only once per kernel: a second merge would add suffixed
# duplicates of the 'eid_target' column.
total_df = total_df.merge(eid_target, how="left", on="eid")
total_df.head()

Unnamed: 0,eid,common_ts,x2,x4,x5,x6,x7,x8,target,eid_target
0,26,0.579194,0,107,206,1,0,1,0.0,0.072707
1,26,0.560469,0,24,283,4,8,1,0.0,0.072707
2,8,0.570757,0,71,288,4,7,1,0.0,0.097401
3,11,0.572673,3,17,366,1,6,1,0.0,0.098421
4,26,0.573432,3,92,383,4,8,1,0.0,0.072707


In [12]:
# Add periodic (sine / cosine) encodings of 'common_ts'.
# Standardised variant: centre and scale by the column's own mean and standard
# deviation. Both are computed here instead of hard-coding rounded numbers
# copied from printed output, so they stay correct if the data changes.
ts = total_df['common_ts']
ts_mean = ts.mean()
ts_std = ts.std()  # sample standard deviation, the same statistic describe() shows
phase_norm = 2 * np.pi * (ts - ts_mean) / ts_std
total_df['sin_norm'] = np.sin(phase_norm)
total_df['cos_norm'] = np.cos(phase_norm)
# Raw variant: treat 'common_ts' directly as the fraction of one full cycle.
phase = 2 * np.pi * ts
total_df['sin'] = np.sin(phase)
total_df['cos'] = np.cos(phase)

In [13]:
# Cut the combined frame back into its train and test parts. The train rows
# were stacked first, so the first len(train_df) rows are the labelled ones.
# test_df keeps the 'target' column, which is NaN for all of its rows.
n_train = len(train_df)
train_df, test_df = total_df.iloc[:n_train], total_df.iloc[n_train:]

In [14]:
# Separate the label vector from the feature matrix as NumPy arrays.
y = train_df['target'].to_numpy()
X = train_df.drop(columns=['target']).to_numpy()

In [15]:
# Hold-out split helper for feature / label arrays.
def train_test_split(dataX,datay,shuffle=True,percentage=0.8):
    """
    Split features and labels into a leading and a trailing partition.

    Parameters
    ----------
    dataX : array-like
        Samples along the first axis. Converted with ``np.asarray``.
    datay : array-like
        Labels, one per sample; must have the same length as ``dataX``.
    shuffle : bool, default True
        If True, apply one random permutation to both arrays before
        splitting. The permutation is drawn from NumPy's global RNG, so
        ``np.random.seed`` controls it.
    percentage : float, default 0.8
        Fraction of the samples placed in the first (training) partition;
        the default gives an 8:2 split.

    Returns
    -------
    train_X, train_y, test_X, test_y : numpy.ndarray
        The first ``int(len(dataX) * percentage)`` samples and labels,
        followed by the remaining samples and labels.

    Raises
    ------
    ValueError
        If the two inputs differ in length or ``percentage`` is outside [0, 1].
    """
    dataX = np.asarray(dataX)
    datay = np.asarray(datay)

    # A length mismatch would otherwise pair samples with the wrong labels.
    if len(dataX) != len(datay):
        raise ValueError(
            f"dataX and datay must have the same length, got {len(dataX)} and {len(datay)}"
        )
    # Values outside [0, 1] would turn into a negative or oversized slice bound.
    if not 0 <= percentage <= 1:
        raise ValueError(f"percentage must be within [0, 1], got {percentage}")

    if shuffle:
        # One shared permutation keeps every sample aligned with its label.
        order = np.random.permutation(len(dataX))
        dataX = dataX[order]
        datay = datay[order]

    split_num = int(len(dataX) * percentage)
    train_X, test_X = dataX[:split_num], dataX[split_num:]
    train_y, test_y = datay[:split_num], datay[split_num:]
    return train_X,train_y,test_X,test_y

In [16]:
# Hold out 10% of the labelled rows for validation; the rest is used for training.
train_X, train_y, valid_X, valid_y = train_test_split(X, y, percentage=0.9)
print("train_X.shape:{},valid_X.shape:{}".format(train_X.shape, valid_X.shape))

train_X.shape:(558320, 13),valid_X.shape:(62036, 13)


In [None]:
!pip install FLAML
!pip install "ray[tune]<2.5.0"

In [None]:
# Spearman rank correlation: measures the monotonic relationship between two
# variables.
# The 'target' row is selected by label. By this point train_df ends with the
# engineered 'cos' column, so taking the last row of the matrix would give the
# correlations with 'cos' instead of with the label.
spearson = train_df.corr(method='spearman').loc['target'].values
# Positions (within train_df's columns) whose |rho| with the label is >= 0.01.
choose2 = np.where(np.abs(spearson) >= 0.01)[0]
print(f"len(choose2):{len(choose2)},choose2:{choose2}")
