# 基线开发

> 徐皓玮


## 数据加载

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
# import xgboost as xgb
from imblearn.over_sampling import SMOTE


In [23]:
# 加载数据
train_res = pd.read_csv('Input/raw/train.csv', low_memory=False)
train_ans = pd.read_csv('Input/raw/labels.csv')
validation_res = pd.read_csv('Input/raw/val.csv')

# 合并训练数据和标签
train_data = pd.merge(train_res, train_ans, on='msisdn')

# 查看数据的基本信息
train_data.head()


Unnamed: 0,msisdn,start_time,end_time,call_event,other_party,ismultimedia,home_area_code,visit_area_code,called_home_code,called_code,...,phone1_type,phone2_type,phone1_loc_city,phone1_loc_province,phone2_loc_city,phone2_loc_province,update_time,date,date_c,is_sa
0,1203318,20231223095118,20231223095129,call_dst,1069460,0,351,351,29.0,29,...,2,9,太原,山西,西安,陕西,2023-12-25 09:30:30,2023-12-23,20240106,0
1,1203318,20231223135212,20231223135218,call_dst,1251008,0,351,351,21.0,21,...,2,1,太原,山西,上海,上海,2023-12-25 11:13:03,2023-12-23,20240106,0
2,1203318,20231224172817,20231224172839,call_dst,1296017,0,351,351,351.0,351,...,2,2,太原,山西,太原,山西,2023-12-25 21:46:48,2023-12-24,20240106,0
3,1203318,20231224191912,20231224192037,call_dst,1139033,0,351,351,351.0,351,...,2,2,太原,山西,太原,山西,2023-12-25 22:15:26,2023-12-24,20240106,0
4,1203318,20231225170350,20231225170358,call_dst,1299672,0,351,351,351.0,351,...,2,9,太原,山西,太原,山西,2023-12-26 06:12:35,2023-12-25,20240106,0


## 数据预处理

In [24]:
# 转换时间格式
train_data['start_time'] = pd.to_datetime(train_data['start_time'], format='%Y%m%d%H%M%S')
train_data['end_time'] = pd.to_datetime(train_data['end_time'], format='%Y%m%d%H%M%S')
train_data['open_datetime'] = pd.to_datetime(train_data['open_datetime'], format='%Y%m%d%H%M%S')
train_data['update_time'] = pd.to_datetime(train_data['update_time'])
train_data['date'] = pd.to_datetime(train_data['date'])
train_data['date_c'] = pd.to_datetime(train_data['date_c'], format='%Y%m%d')

# 检查并填充缺失值
train_data = train_data.fillna(-1)


## 特征工程

In [25]:
# 将混合类型字段转换为字符串类型
fields_to_convert = ['visit_area_code', 'called_code', 'phone1_loc_city', 'phone1_loc_province', 'phone2_loc_city', 'phone2_loc_province']
for field in fields_to_convert:
    train_data[field] = train_data[field].astype(str)

# 创建新的时间特征
train_data['call_duration_minutes'] = train_data['call_duration'] / 60
train_data['start_hour'] = train_data['start_time'].dt.hour
train_data['start_dayofweek'] = train_data['start_time'].dt.dayofweek

# 类别特征编码
categorical_features = ['call_event', 'ismultimedia', 'home_area_code', 'visit_area_code', 'called_home_code', 
                        'called_code', 'a_serv_type', 'long_type1', 'roam_type', 'a_product_id', 'phone1_type', 
                        'phone2_type', 'phone1_loc_city', 'phone1_loc_province', 'phone2_loc_city', 'phone2_loc_province']

train_data = pd.get_dummies(train_data, columns=categorical_features)

## 模型训练和评估

In [7]:
# 特征和标签
X = train_data.drop(columns=['msisdn', 'is_sa', 'start_time', 'end_time', 'open_datetime', 'update_time', 'date', 'date_c'])
y = train_data['is_sa']


In [8]:
# 划分训练集和测试集用于评估模型
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [26]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

# 训练随机森林模型
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# 预测和评估
y_pred = model.predict(X_test)
print('F1 Score:', f1_score(y_test, y_pred))


F1 Score: 0.8833125020030125


## 在测试集上进行预测

In [27]:
# 使用整个训练集重新训练模型
model.fit(X, y)

In [28]:
# 预处理验证集数据（同样的步骤）
validation_res['start_time'] = pd.to_datetime(validation_res['start_time'], format='%Y%m%d%H%M%S', errors='coerce')
validation_res['end_time'] = pd.to_datetime(validation_res['end_time'], format='%Y%m%d%H%M%S', errors='coerce')
validation_res['open_datetime'] = pd.to_datetime(validation_res['open_datetime'], format='%Y%m%d%H%M%S', errors='coerce')
validation_res['update_time'] = pd.to_datetime(validation_res['update_time'], errors='coerce')
validation_res['date'] = pd.to_datetime(validation_res['date'], errors='coerce')
validation_res['date_c'] = pd.to_datetime(validation_res['date_c'], format='%Y%m%d', errors='coerce')

validation_res['call_duration_minutes'] = validation_res['call_duration'] / 60
validation_res['start_hour'] = validation_res['start_time'].dt.hour
validation_res['start_dayofweek'] = validation_res['start_time'].dt.dayofweek

for field in fields_to_convert:
    validation_res[field] = validation_res[field].astype(str)

validation_res = pd.get_dummies(validation_res, columns=categorical_features)

# 确保训练集和验证集有相同的列
X_val = validation_res.reindex(columns=X.columns, fill_value=0)


In [29]:
# 预测
val_pred = model.predict(X_val)


In [30]:
np.bincount(val_pred)

array([218403,  13847])

In [31]:
# 保存预测结果
validation_res['is_sa'] = val_pred
validation_res[['msisdn', 'is_sa']].to_csv('Output/submissions/prediction.csv', index=False)