In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Load the dataset from a CSV file (path is relative to the notebook's working directory).
data = pd.read_csv('data/fraudulent.csv')

# Inspect column dtypes and non-null counts to see where values are missing.
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# must not be wrapped in print() -- that would emit a stray "None" line.
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10086 entries, 0 to 10085
Data columns (total 19 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   contain_IP             9996 non-null   float64
 1   is_long                9997 non-null   float64
 2   is_tinyurl             9998 non-null   float64
 3   contain_at             10004 non-null  float64
 4   contain_double_slash   9970 non-null   float64
 5   contain_dash           9992 non-null   float64
 6   contain_subdomain      9989 non-null   float64
 7   is_SSL                 9990 non-null   float64
 8   with_long_history      7291 non-null   float64
 9   contain_icon           8728 non-null   float64
 10  contain_ext_domain     8559 non-null   float64
 11  contain_email_to       8007 non-null   float64
 12  allow_right_click      6679 non-null   float64
 13  contain_pop_up_window  9807 non-null   float64
 14  contain_Iframe         9427 non-null   float64
 15  ha

In [2]:
# Handle missing values with most-frequent (mode) imputation, without leaking
# information from the test split.
#
# Fitting the imputer on the whole table would (a) let held-out rows influence
# the fill values used for training and (b) fabricate a label for any row whose
# target `y` is missing. Both are avoided below.

# Rows without a label cannot be used for supervised training or scoring.
data_labeled = data.dropna(subset=['y'])

# Separate the raw (still incomplete) features from the target.
X_raw = data_labeled.drop('y', axis=1)
y = data_labeled['y']

# Split BEFORE imputing so the fill values are learned from training rows only.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_raw, y, test_size=0.2, random_state=1
)

# Learn the per-column mode on the training features only.
imputer = SimpleImputer(strategy='most_frequent')
imputer.fit(X_train_raw)

# Apply the train-derived fill values to every labeled row; the result is a
# NumPy array, so wrap it back into a DataFrame with the original labels.
data_imputed = imputer.transform(X_raw)
X = pd.DataFrame(data_imputed, columns=X_raw.columns, index=X_raw.index)

# Re-select the split rows from the imputed features (row order of the split is kept).
X_train = X.loc[X_train_raw.index]
X_test = X.loc[X_test_raw.index]

# Full imputed table (features + untouched target) in the original column order.
columns = data.columns
data_imputed_df = X.assign(y=y)[columns]



In [3]:

# Preprocessing: choose columns by dtype so each group gets a suitable transformer.
numeric_features = list(X_train.select_dtypes(include=['number']).columns)
categorical_features = list(X_train.select_dtypes(include=['object', 'category']).columns)

# Numeric columns are standardized; categorical columns are one-hot encoded,
# with categories unseen during fit ignored at transform time.
scaling_step = ('num', StandardScaler(), numeric_features)
encoding_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
preprocessor = ColumnTransformer(transformers=[scaling_step, encoding_step])

# Model: preprocessing followed by a logistic-regression classifier.
classifier = LogisticRegression(random_state=1)
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', classifier),
])

# Fit the whole pipeline on the training split.
model.fit(X_train, y_train)


F1 Score: 0.85


In [4]:

# Predict labels for the held-out test split with the fitted pipeline.
y_pred = model.predict(X_test)

# Compare predictions against the true test labels using the F1 score and report it.
f1 = f1_score(y_true=y_test, y_pred=y_pred)
print(f'F1 Score: {f1}')

F1 Score: 0.85
