In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# 1. Load the dataset.
file_path = 'fraudulent.csv'
data = pd.read_csv(file_path)

# 2. Inspect the data layout and the per-column missing-value counts.
print("数据集前5行:\n", data.head())
print("\n每列缺失值总数:\n", data.isnull().sum())

# 3. Separate features and label.
# Rows with a missing label are dropped rather than imputed: a mean-filled
# label would be a fractional value, which is not a valid class.
labelled = data.dropna(subset=['y'])
X = labelled.drop('y', axis=1)  # features
y = labelled['y']               # target

# 4. Split into training and test sets BEFORE fitting any preprocessing, so
# that test rows never influence the imputation means or scaling statistics.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1
)

# 5. Fill missing feature values with the training-set column means, then
# standardize the features using training-set statistics only.
imputer = SimpleImputer(strategy='mean')
scaler = StandardScaler()
X_train = scaler.fit_transform(imputer.fit_transform(X_train_raw))
X_test = scaler.transform(imputer.transform(X_test_raw))

# Whole-dataset views produced with the training-fitted transformers. These
# names existed before the split was moved ahead of preprocessing and are kept
# in case other code refers to them; do not use them to train or score models.
X_filled_all = imputer.transform(X)
X_scaled = scaler.transform(X_filled_all)
data_filled = pd.DataFrame(
    X_filled_all, columns=X.columns, index=X.index
).assign(y=y)[data.columns]

# 6. Train and evaluate each model.
def train_and_evaluate_model(model, name):
    """Fit ``model`` on the training split and report its test-set F1 score.

    The data is read from the module-level ``X_train``, ``y_train``,
    ``X_test`` and ``y_test`` variables.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        name: Label identifying the model in the printed report line.

    Returns:
        The F1 score of the model's predictions on the test split, as
        computed by ``f1_score`` with its default arguments.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    f1 = f1_score(y_test, y_pred)
    print(f"{name} F1评分: {f1:.4f}")
    # Hand the score back so callers can compare or rank models instead of
    # having to parse the printed output.
    return f1

# Models to compare, keyed by the display name used in the report.
models = {
    'KNN': KNeighborsClassifier(),
    # Seeded with the same value as the train/test split so the reported
    # score is reproducible from run to run.
    'Decision Tree': DecisionTreeClassifier(random_state=1),
    'Logistic Regression': LogisticRegression(max_iter=500),
    'SVM': SVC(),
}

# Run the experiment: fit and score every model in turn.
for model_name, model in models.items():
    train_and_evaluate_model(model, model_name)


数据集前5行:
    contain_IP  is_long  is_tinyurl  contain_at  contain_double_slash  \
0         1.0      1.0         1.0         1.0                   1.0   
1         0.0      0.0         1.0         1.0                   1.0   
2         0.0      0.0         1.0         1.0                   1.0   
3         1.0      0.0         1.0         1.0                   1.0   
4         1.0      0.0         1.0         1.0                   1.0   

   contain_dash  contain_subdomain  is_SSL  with_long_history  contain_icon  \
0           0.0                1.0     0.0                0.0           1.0   
1           0.0                0.0     0.0                0.0           NaN   
2           0.0                1.0     1.0                NaN           NaN   
3           0.0                0.0     0.0                1.0           1.0   
4           0.0                0.0     1.0                NaN           1.0   

   contain_ext_domain  contain_email_to  allow_right_click  \
0                 1.0