In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import f1_score

# 导入 imblearn 库中的 SMOTE 和 Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB

# 加载数据
events = pd.read_csv('events.csv')
app_events = pd.read_csv('app_events.csv')
app_labels = pd.read_csv('app_labels.csv')
label_categories = pd.read_csv('label_categories.csv')
phone_brand_device_model = pd.read_csv('phone_brand_device_model.csv')
training_data = pd.read_csv('training_data.csv')
test_data = pd.read_csv('test_data.csv')

# 合并数据集
df = training_data.merge(phone_brand_device_model, on='device_id', how='left')
df = df.merge(events, on='device_id', how='left')
df = df.merge(app_events, on='event_id', how='left')
df = df.merge(app_labels, on='app_id', how='left')
df = df.merge(label_categories, on='label_id', how='left')

# 按设备聚合应用和品牌/型号信息
df['apps'] = df.groupby('device_id')['category'].transform(lambda x: ' '.join(x.dropna().unique()))
df['phone_brand'] = df.groupby('device_id')['phone_brand'].transform('first')
df['device_model'] = df.groupby('device_id')['device_model'].transform('first')

# 删除重复的设备行
df = df.drop_duplicates(subset='device_id')

# 提取特征和目标变量
df['group'] = df['group'].astype(str)
X = df[['apps', 'phone_brand', 'device_model']]
y = df['group']

# 编码目标标签
le = LabelEncoder()
y = le.fit_transform(y)

# 数据预处理：对应用类别使用 Tfidf，对手机品牌和型号使用 OneHot
preprocessor = ColumnTransformer(
    transformers=[
        ('tfidf', TfidfVectorizer(max_features=500), 'apps'),
        ('brand', OneHotEncoder(handle_unknown='ignore'), ['phone_brand', 'device_model'])
    ])

# 使用 SMOTE 和模型构建管道
model = Pipeline([
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    #('classifier', LogisticRegression(max_iter=2000, C=5, solver='lbfgs', random_state=42))
    #('classifier', RandomForestClassifier(n_estimators=100, random_state=42)) #, class_weight="balanced"
    #('classifier', LinearSVC(random_state=42, max_iter=10000))
    ('classifier', MultinomialNB())
])

# 将数据分为训练集和验证集
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# 训练模型
model.fit(X_train, y_train)
#model.fit(X, y)

# 在验证集上进行预测
y_pred = model.predict(X_val)

# 评估模型
macro_f1 = f1_score(y_val, y_pred, average='macro')
print(f"验证集上的 Macro-F1 分数: {macro_f1:.4f}")

# 准备测试数据
test_df = test_data.merge(phone_brand_device_model, on='device_id', how='left')
test_df = test_df.merge(events, on='device_id', how='left')
test_df = test_df.merge(app_events, on='event_id', how='left')
test_df = test_df.merge(app_labels, on='app_id', how='left')
test_df = test_df.merge(label_categories, on='label_id', how='left')

# 按设备聚合应用和品牌/型号信息
test_df['apps'] = test_df.groupby('device_id')['category'].transform(lambda x: ' '.join(x.dropna().unique()))
test_df['phone_brand'] = test_df.groupby('device_id')['phone_brand'].transform('first')
test_df['device_model'] = test_df.groupby('device_id')['device_model'].transform('first')

# 删除重复的设备行
test_df = test_df.drop_duplicates(subset='device_id')

# 提取特征
X_test = test_df[['apps', 'phone_brand', 'device_model']]

# 在测试数据上进行预测
test_predictions = model.predict(X_test)

# 将预测结果映射回原始标签
test_predictions = le.inverse_transform(test_predictions)

# 将预测结果保存到 CSV 文件
submission = pd.DataFrame({'device_id': test_df['device_id'], 'group': test_predictions})
submission.to_csv('test_predictions.csv', index=False)

# 加载带有答案的测试数据以进行评估
test_data_with_ans = pd.read_csv('test_data_with_Ans.csv')

# 将预测结果与真实标签合并
test_data_with_ans = test_data_with_ans.merge(submission, on='device_id', how='left')

# 编码真实标签
y_true = le.transform(test_data_with_ans['group_x'])

# 编码预测标签
y_pred_test = le.transform(test_data_with_ans['group_y'])

# 在测试数据上评估模型
macro_f1_test = f1_score(y_true, y_pred_test, average='macro')
print(f"测试集上的 Macro-F1 分数: {macro_f1_test:.4f}")

验证集上的 Macro-F1 分数: 0.1319
测试集上的 Macro-F1 分数: 0.1314
