In [1]:
import pandas as pd 
import numpy as np 
import random

np.random.seed(42)
random.seed(42)

In [2]:
# 导入模型训练数据
train_path = "data/data236342/train.csv"
train_df = pd.read_csv(train_path)
print(f"len(train_df):{(len(train_df))}")
train_df.head()

len(train_df):620356


Unnamed: 0,uuid,eid,udmap,common_ts,x1,x2,x3,x4,x5,x6,x7,x8,target
0,0,26,"{""key3"":""67804"",""key2"":""650""}",1689673468244,4,0,41,107,206,1,0,1,0
1,1,26,"{""key3"":""67804"",""key2"":""484""}",1689082941469,4,0,41,24,283,4,8,1,0
2,2,8,unknown,1689407393040,4,0,41,71,288,4,7,1,0
3,3,11,unknown,1689467815688,1,3,41,17,366,1,6,1,0
4,4,26,"{""key3"":""67804"",""key2"":""650""}",1689491751442,0,3,41,92,383,4,8,1,0


In [3]:
# 导入测试数据
test_path = "data/data236342/test.csv"
test_df = pd.read_csv(test_path)
print(f"len(test_df):{(len(test_df))}")
test_df.head()

len(test_df):206785


Unnamed: 0,uuid,eid,udmap,common_ts,x1,x2,x3,x4,x5,x6,x7,x8
0,0,11,unknown,1689594441029,4,1,41,85,343,4,8,1
1,1,35,unknown,1689551329947,4,0,41,24,283,1,6,1
2,2,34,"{""key3"":""73457"",""key2"":""936""}",1688965066999,4,2,41,71,288,4,2,0
3,3,0,"{""key3"":""18771""}",1689308623754,1,0,41,104,37,4,8,1
4,4,26,"{""key3"":""67804"",""key2"":""650""}",1689516018904,0,1,41,115,217,4,8,1


In [4]:
# 整合所有数据，按照之前建模的操作处理

total_df = pd.concat((train_df,test_df), axis = 0)

total_df.drop(['uuid','udmap'], axis = 1, inplace = True)

print(f"len(total_df):{len(total_df)}")
total_df.head()

len(total_df):827141


Unnamed: 0,eid,common_ts,x1,x2,x3,x4,x5,x6,x7,x8,target
0,26,1689673468244,4,0,41,107,206,1,0,1,0.0
1,26,1689082941469,4,0,41,24,283,4,8,1,0.0
2,8,1689407393040,4,0,41,71,288,4,7,1,0.0
3,11,1689467815688,1,3,41,17,366,1,6,1,0.0
4,26,1689491751442,0,3,41,92,383,4,8,1,0.0


In [5]:
# 计算相对于2023年（非闰年）的进度百分比
total_df['common_ts'] = ((total_df['common_ts'] / 1000) % 31536000) / 31536000 
total_df.head()

Unnamed: 0,eid,common_ts,x1,x2,x3,x4,x5,x6,x7,x8,target
0,26,0.579194,4,0,41,107,206,1,0,1,0.0
1,26,0.560469,4,0,41,24,283,4,8,1,0.0
2,8,0.570757,4,0,41,71,288,4,7,1,0.0
3,11,0.572673,1,3,41,17,366,1,6,1,0.0
4,26,0.573432,0,3,41,92,383,4,8,1,0.0


In [6]:
# 选取相关数据，以供后续训练
# 皮尔逊相关系数(+1正相关,-1负相关,接近没有明显相关性)
pearson = total_df.corr(method = 'pearson').values[-1]
choose1 = np.where(abs(pearson) >= 0.01)[0]
print(f"len(choose1):{len(choose1)},choose1:{choose1}")
choose = total_df.keys().values[choose1]
total_df = total_df[choose]
total_df.head()

len(choose1):9,choose1:[ 0  1  3  5  6  7  8  9 10]


Unnamed: 0,eid,common_ts,x2,x4,x5,x6,x7,x8,target
0,26,0.579194,0,107,206,1,0,1,0.0
1,26,0.560469,0,24,283,4,8,1,0.0
2,8,0.570757,0,71,288,4,7,1,0.0
3,11,0.572673,3,17,366,1,6,1,0.0
4,26,0.573432,3,92,383,4,8,1,0.0


In [7]:
# 得到'common_ts'列的均值和方差
total_df.describe()

Unnamed: 0,eid,common_ts,x2,x4,x5,x6,x7,x8,target
count,827141.0,827141.0,827141.0,827141.0,827141.0,827141.0,827141.0,827141.0,620356.0
mean,22.150853,0.567872,1.105287,82.89957,224.947866,2.902127,5.864469,0.855634,0.140566
std,12.139231,0.008717,1.173478,44.115095,114.293439,1.444678,2.576408,0.351461,0.347574
min,0.0,0.538215,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,11.0,0.560602,0.0,51.0,133.0,1.0,6.0,1.0,0.0
50%,26.0,0.569755,1.0,86.0,241.0,4.0,7.0,1.0,0.0
75%,34.0,0.575678,2.0,107.0,313.0,4.0,7.0,1.0,0.0
max,42.0,0.579908,3.0,151.0,413.0,4.0,9.0,1.0,1.0


In [8]:
np.unique(total_df['eid'].values)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42])

In [9]:
# 将eid处理后作为新特征传入

eid_target = train_df['target'].groupby([train_df['eid']]).mean()
eid = eid_target.keys().values
target = eid_target.values
eid_target = pd.DataFrame({"eid":eid,"eid_target":target})
eid_target.head()

Unnamed: 0,eid,eid_target
0,0,0.069281
1,1,0.485054
2,2,0.139414
3,3,0.35264
4,4,0.486146


In [10]:
total_df = pd.merge(total_df, eid_target, on = "eid", how = "left")
total_df.head()

Unnamed: 0,eid,common_ts,x2,x4,x5,x6,x7,x8,target,eid_target
0,26,0.579194,0,107,206,1,0,1,0.0,0.072707
1,26,0.560469,0,24,283,4,8,1,0.0,0.072707
2,8,0.570757,0,71,288,4,7,1,0.0,0.097401
3,11,0.572673,3,17,366,1,6,1,0.0,0.098421
4,26,0.573432,3,92,383,4,8,1,0.0,0.072707


In [11]:
# 用具有周期性的三角函数新增特征（用前面得到的均值和方差）
# 归一化
total_df['sin_norm'] = np.sin(2 * np.pi * (total_df['common_ts'] - 0.567872) / 0.008717)
total_df['cos_norm'] = np.cos(2 * np.pi * (total_df['common_ts'] - 0.567872) / 0.008717)
# 未归一化
total_df['sin'] = np.sin(2 * np.pi * total_df['common_ts']) 
total_df['cos'] = np.cos(2 * np.pi * total_df['common_ts'])

In [12]:
train_df = total_df[:len(train_df)]
test_df = total_df[len(train_df):]

In [13]:
y = train_df['target'].values
X = train_df.drop(['target'], axis = 1).values

In [14]:
# 划分训练集和测试集的函数
def train_test_split(dataX,datay,shuffle=True,percentage=0.8):
    """
    将训练数据X和标签y以numpy.array数组的形式传入
    划分的比例定为 训练集:测试集 = 8:2 
    """
    if shuffle :
        random_num=[index for index in range(len(dataX))]
        np.random.shuffle(random_num)
        dataX=dataX[random_num]
        datay=datay[random_num]

    split_num = int(len(dataX) * percentage)
    train_X = dataX[:split_num]
    train_y = datay[:split_num]
    test_X = dataX[split_num:]
    test_y = datay[split_num:]
    return train_X,train_y,test_X,test_y

In [15]:
train_X,train_y,valid_X,valid_y = train_test_split(X,y,percentage=0.9)
print(f"train_X.shape:{train_X.shape},valid_X.shape:{valid_X.shape}")

train_X.shape:(558320, 13),valid_X.shape:(62036, 13)


In [17]:
!pip install --upgrade pip
!pip install FLAML
!pip install "ray[tune]<2.5.0"


Looking in indexes: https://mirror.baidu.com/pypi/simple/, https://mirrors.aliyun.com/pypi/simple/, https://pypi.tuna.tsinghua.edu.cn/simple/
Collecting pip
  Downloading https://mirrors.aliyun.com/pypi/packages/50/c2/e06851e8cc28dcad7c155f4753da8833ac06a5c704c109313b8d5a62968a/pip-23.2.1-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 22.1.2
    Uninstalling pip-22.1.2:
      Successfully uninstalled pip-22.1.2
Successfully installed pip-23.2.1
Looking in indexes: https://mirror.baidu.com/pypi/simple/, https://mirrors.aliyun.com/pypi/simple/, https://pypi.tuna.tsinghua.edu.cn/simple/
Looking in indexes: https://mirror.baidu.com/pypi/simple/, https://mirrors.aliyun.com/pypi/simple/, https://pypi.tuna.tsinghua.edu.cn/simple/
Collecting importlib-metadata (from click>=7.

In [18]:
from flaml import AutoML
automl = AutoML()

In [19]:
automl.fit(train_X, train_y, task = "classification", metric = 'accuracy', time_budget = 3600)

[flaml.automl.logger: 08-24 23:39:10] {1679} INFO - task = classification
[flaml.automl.logger: 08-24 23:39:10] {1690} INFO - Evaluation method: holdout
[flaml.automl.logger: 08-24 23:39:11] {1788} INFO - Minimizing error metric: 1-accuracy
[flaml.automl.logger: 08-24 23:39:11] {1900} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'xgboost', 'extra_tree', 'xgb_limitdepth', 'lrl1']
[flaml.automl.logger: 08-24 23:39:11] {2218} INFO - iteration 0, current learner lgbm
[flaml.automl.logger: 08-24 23:39:11] {2345} INFO - Estimated sufficient time budget=60067s. Estimated necessary time budget=1385s.
[flaml.automl.logger: 08-24 23:39:11] {2397} INFO -  at 0.8s,	estimator lgbm's best error=0.1406,	best estimator lgbm's best error=0.1406
[flaml.automl.logger: 08-24 23:39:11] {2218} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 08-24 23:39:11] {2397} INFO -  at 1.0s,	estimator lgbm's best error=0.1406,	best estimator lgbm's best error=0.1406
[flaml.automl.logger: 08-2



[flaml.automl.logger: 08-24 23:40:16] {2397} INFO -  at 66.2s,	estimator lrl1's best error=0.1406,	best estimator lgbm's best error=0.1056
[flaml.automl.logger: 08-24 23:40:16] {2218} INFO - iteration 52, current learner lrl1
[flaml.automl.logger: 08-24 23:40:17] {2397} INFO -  at 66.6s,	estimator lrl1's best error=0.1406,	best estimator lgbm's best error=0.1056
[flaml.automl.logger: 08-24 23:40:17] {2218} INFO - iteration 53, current learner lrl1




[flaml.automl.logger: 08-24 23:40:18] {2397} INFO -  at 67.9s,	estimator lrl1's best error=0.1406,	best estimator lgbm's best error=0.1056
[flaml.automl.logger: 08-24 23:40:18] {2218} INFO - iteration 54, current learner lrl1




[flaml.automl.logger: 08-24 23:40:19] {2397} INFO -  at 69.3s,	estimator lrl1's best error=0.1406,	best estimator lgbm's best error=0.1056
[flaml.automl.logger: 08-24 23:40:19] {2218} INFO - iteration 55, current learner extra_tree
[flaml.automl.logger: 08-24 23:40:20] {2397} INFO -  at 69.8s,	estimator extra_tree's best error=0.1372,	best estimator lgbm's best error=0.1056
[flaml.automl.logger: 08-24 23:40:20] {2218} INFO - iteration 56, current learner lgbm




[flaml.automl.logger: 08-24 23:40:48] {2397} INFO -  at 97.5s,	estimator lgbm's best error=0.1041,	best estimator lgbm's best error=0.1041
[flaml.automl.logger: 08-24 23:40:48] {2218} INFO - iteration 57, current learner lrl1
[flaml.automl.logger: 08-24 23:40:49] {2397} INFO -  at 98.8s,	estimator lrl1's best error=0.1406,	best estimator lgbm's best error=0.1041
[flaml.automl.logger: 08-24 23:40:49] {2218} INFO - iteration 58, current learner xgb_limitdepth




[flaml.automl.logger: 08-24 23:40:52] {2397} INFO -  at 102.3s,	estimator xgb_limitdepth's best error=0.1302,	best estimator lgbm's best error=0.1041
[flaml.automl.logger: 08-24 23:40:52] {2218} INFO - iteration 59, current learner lrl1
[flaml.automl.logger: 08-24 23:41:22] {2397} INFO -  at 132.0s,	estimator lrl1's best error=0.1377,	best estimator lgbm's best error=0.1041
[flaml.automl.logger: 08-24 23:41:22] {2218} INFO - iteration 60, current learner rf




[flaml.automl.logger: 08-24 23:41:23] {2397} INFO -  at 132.4s,	estimator rf's best error=0.1388,	best estimator lgbm's best error=0.1041
[flaml.automl.logger: 08-24 23:41:23] {2218} INFO - iteration 61, current learner lgbm
[flaml.automl.logger: 08-24 23:41:55] {2397} INFO -  at 165.4s,	estimator lgbm's best error=0.1041,	best estimator lgbm's best error=0.1041
[flaml.automl.logger: 08-24 23:41:55] {2218} INFO - iteration 62, current learner rf
[flaml.automl.logger: 08-24 23:41:56] {2397} INFO -  at 165.7s,	estimator rf's best error=0.1374,	best estimator lgbm's best error=0.1041
[flaml.automl.logger: 08-24 23:41:56] {2218} INFO - iteration 63, current learner lgbm
[flaml.automl.logger: 08-24 23:42:10] {2397} INFO -  at 180.1s,	estimator lgbm's best error=0.1041,	best estimator lgbm's best error=0.1041
[flaml.automl.logger: 08-24 23:42:10] {2218} INFO - iteration 64, current learner rf
[flaml.automl.logger: 08-24 23:42:11] {2397} INFO -  at 180.5s,	estimator rf's best error=0.1358,	be



[flaml.automl.logger: 08-25 00:02:15] {2397} INFO -  at 1385.3s,	estimator rf's best error=0.1332,	best estimator lgbm's best error=0.0472
[flaml.automl.logger: 08-25 00:02:15] {2218} INFO - iteration 84, current learner rf
[flaml.automl.logger: 08-25 00:02:16] {2397} INFO -  at 1385.7s,	estimator rf's best error=0.1332,	best estimator lgbm's best error=0.0472
[flaml.automl.logger: 08-25 00:02:16] {2218} INFO - iteration 85, current learner rf
[flaml.automl.logger: 08-25 00:02:16] {2397} INFO -  at 1386.3s,	estimator rf's best error=0.1319,	best estimator lgbm's best error=0.0472
[flaml.automl.logger: 08-25 00:02:16] {2218} INFO - iteration 86, current learner extra_tree
[flaml.automl.logger: 08-25 00:02:17] {2397} INFO -  at 1386.7s,	estimator extra_tree's best error=0.1372,	best estimator lgbm's best error=0.0472
[flaml.automl.logger: 08-25 00:02:17] {2218} INFO - iteration 87, current learner rf
[flaml.automl.logger: 08-25 00:02:17] {2397} INFO -  at 1387.1s,	estimator rf's best err

In [24]:
#二分类任务的召回率
def recall(y_true,y_pred):
    #Positive(类别为1)，True:预测正确，即预测值为1
    TP = np.sum(y_true + y_pred == 2)
    #Negative(类别为0),True:预测正确,即预测值为0
    TN = np.sum(y_true + y_pred == 0)
    #False表示预测错误，预测为1，实际是0
    FP = np.sum(np.where((y_true == 0) & (y_pred == 1),1,0))
    #False表示预测错误，预测为0,实际为1
    FN = np.sum(np.where((y_true == 1) & (y_pred == 0),1,0))
    #召回率的计算:
    Recall_rate = TP / (TP + FN)
    return Recall_rate

In [25]:
def accuracy(y_true,y_pred):
    return np.sum(y_true == y_pred) / len(y_true)

In [26]:
def f1_score(y_true,y_pred):
    Recall_rate = recall(y_true,y_pred)
    Acc = accuracy(y_true,y_pred)
    return 2 * Acc * Recall_rate / (Acc + Recall_rate)

In [27]:
train_pred = automl.predict(train_X)
print(f"train_accuracy:{accuracy(train_y,train_pred)},train_f1_score:{f1_score(train_y,train_pred)}")
valid_pred = automl.predict(valid_X)
print(f"valid_accuracy:{accuracy(valid_y,valid_pred)},valid_f1_score:{f1_score(valid_y,valid_pred)}")

train_accuracy:0.995344963461814,train_f1_score:0.9868817926273911
valid_accuracy:0.9546714810755046,valid_f1_score:0.8585805309886235


In [42]:
test_X = test_df.drop(['target'],axis = 1).values
test_pred = automl.predict(test_X)
test_pred[:10]

array([0., 0., 0., 0., 0., 0., 0., 0., 1., 0.])

In [None]:
def save_predictions_to_csv(test_pred):
    # 创建一个新的DataFrame，其中包含uuid和target两个列
    # 这里我假设uuid是一个从0开始的递增整数序列
    uuids = list(range(len(test_pred)))
    df = pd.DataFrame({'uuid': uuids, 'target': test_pred})

    # 将DataFrame保存到当前文件夹下的一个名为"predictions.csv"的文件中
    df.to_csv("predictions.csv", index=False)

# 调用函数，保存预测结果到CSV文件
save_predictions_to_csv(test_pred)


In [43]:
# 创建一个只有预测结果的 DataFrame
submission_df = pd.DataFrame({'target': test_pred})

# 将 DataFrame 保存为 CSV 文件，不包含行索引
submission_df.to_csv('predictions.csv', index=False)