- 实现Logistic算法
- 使用k-fold交叉验证进行实验（可调用工具包）
- 数据集：Autistic Spectrum Disorder Screening Data for Children Data Set（https://archive.ics.uci.edu/ml/datasets/Autistic+Spectrum+Disorder+Screening+Data+for+Children++#）
- 列出实验结果表，包括Accuracy，precision，recall等
- 数据集需要必要的预处理
- 尽可能添加注释说明程序功能

In [4]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.discriminant_analysis import StandardScaler
from sklearn.model_selection import train_test_split

In [5]:
# Load the dataset and preprocess it into a feature matrix X and label vector y.
#
# The ARFF file is read as plain CSV: `comment="@"` makes read_csv drop the
# "@..." header directives, and "?" (the missing-value marker) is parsed
# straight to NaN via `na_values` rather than being patched in afterwards
# with pd.NaT, which is the datetime-specific null sentinel.
df = pd.read_csv(
    "./dataset/Autism-Screening-Child-Data Plus Description/Autism-Child-Data.arff",
    delimiter=",",
    comment="@",
    header=None,
    na_values="?",
)

# Attach attribute names. The misspelt names ('jundice', 'austim',
# 'contry_of_res') are kept unchanged because the code below refers to them.
df.columns = ['A1_Score', 'A2_Score', 'A3_Score', 'A4_Score', 'A5_Score',
              'A6_Score', 'A7_Score', 'A8_Score', 'A9_Score', 'A10_Score',
              'age', 'gender', 'ethnicity', 'jundice', 'austim', 'contry_of_res',
              'used_app_before', 'result', 'age_desc', 'relation', 'Class/ASD']

# Drop the unused attribute columns first, then drop every row that still
# has a missing value in one of the remaining columns.
df = df.drop(['age_desc', 'relation', 'ethnicity', 'contry_of_res'], axis=1).dropna()

# Re-encode the categorical columns as 0/1 integers.
df['gender'] = (df['gender'] == 'm').astype('int64')
for binary_column in ('jundice', 'austim', 'used_app_before'):
    df[binary_column] = (df[binary_column] == 'yes').astype('int64')

# NOTE(review): the label is encoded as 1 for 'NO' and 0 for anything else,
# so the "positive" class (label 1) used by precision/recall/F1 downstream is
# the *non*-ASD class. Confirm this polarity is intended.
df['Class/ASD'] = (df['Class/ASD'] == 'NO').astype('int64')
df['age'] = df['age'].astype(int)  # cast to integer once missing ages are gone

# Split into features and labels.
# NOTE(review): `result` equals the sum of A1..A10 in the previewed rows; if
# the label is derived from it, keeping it as a feature leaks the target.
# Confirm before trusting the reported scores.
X = df.drop('Class/ASD', axis=1).values
y = df['Class/ASD'].values
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 288 entries, 0 to 291
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   A1_Score         288 non-null    int64
 1   A2_Score         288 non-null    int64
 2   A3_Score         288 non-null    int64
 3   A4_Score         288 non-null    int64
 4   A5_Score         288 non-null    int64
 5   A6_Score         288 non-null    int64
 6   A7_Score         288 non-null    int64
 7   A8_Score         288 non-null    int64
 8   A9_Score         288 non-null    int64
 9   A10_Score        288 non-null    int64
 10  age              288 non-null    int32
 11  gender           288 non-null    int64
 12  jundice          288 non-null    int64
 13  austim           288 non-null    int64
 14  used_app_before  288 non-null    int64
 15  result           288 non-null    int64
 16  Class/ASD        288 non-null    int64
dtypes: int32(1), int64(16)
memory usage: 39.4 KB


In [6]:
# Preview the first rows of the preprocessed DataFrame.
df.head()

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,age,gender,jundice,austim,used_app_before,result,Class/ASD
0,1,1,0,0,1,1,0,1,0,0,6,1,0,0,0,5,1
1,1,1,0,0,1,1,0,1,0,0,6,1,0,0,0,5,1
2,1,1,0,0,0,1,1,1,0,0,6,1,0,0,1,5,1
3,0,1,0,0,1,1,0,0,0,1,5,0,1,0,0,4,1
4,1,1,1,1,1,1,1,1,1,1,5,1,1,0,0,10,0


__init__方法接收两个参数，分别是学习率和迭代次数，用于初始化逻辑回归模型的超参数。

sigmoid方法定义sigmoid函数，该函数将任意实数映射到(0,1)之间的区间，用于将预测结果转换为概率值。

fit方法接收两个参数，分别是特征矩阵X和标签向量y，用于拟合模型。首先在特征矩阵X的第一列添加一个全为1的列，用于表示截距。然后初始化权重参数为全零向量，接着使用梯度下降算法迭代更新权重参数，直到达到指定的迭代次数。在每次迭代中，计算预测值h，然后计算梯度gradient，并根据学习率更新权重参数self.theta。

predict方法接收一个参数X，用于对给定的特征矩阵X进行预测。首先在特征矩阵X的第一列添加一个全为1的列，用于表示截距。然后计算预测值y_pred，并将其转换为二分类结果，即将概率值大于0.5的预测为1，否则预测为0。最后返回二分类结果。

In [7]:
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


class LogisticRegression:
    """Binary logistic regression trained with full-batch gradient descent.

    An intercept column of ones is prepended to the feature matrix in both
    ``fit`` and ``predict``, so ``theta[0]`` is the bias term.

    Attributes set by ``fit``:
        X: training features with the intercept column, shape (n, d + 1).
        y: training labels as a column vector, shape (n, 1).
        theta: learned weights as a column vector, shape (d + 1, 1).
    """

    def __init__(self, learning_rate=0.1, num_iterations=10000):
        """Store the hyper-parameters.

        Args:
            learning_rate: step size of each gradient-descent update.
            num_iterations: number of gradient-descent updates run by ``fit``.
        """
        self.learning_rate = learning_rate
        self.num_iterations = num_iterations

    def sigmoid(self, z):
        """Return the logistic function 1 / (1 + exp(-z)) element-wise.

        ``z`` is clipped to [-500, 500] first so that ``np.exp`` cannot
        overflow for large negative inputs; inside that range the result is
        unchanged, outside it the output is already saturated at ~0 or 1.
        """
        z = np.clip(z, -500, 500)
        return 1 / (1 + np.exp(-z))

    def fit(self, X, y):
        """Fit the weights by gradient descent on the mean log-loss.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).
            y: array-like of n_samples binary labels (0 or 1).

        Raises:
            ValueError: if there are no samples, or X and y disagree on the
                number of samples.
        """
        # Accept lists as well as arrays; y becomes a column vector so that
        # (h - y) lines up with the (n, 1) predictions.
        labels = np.asarray(y).reshape(-1, 1)
        design = np.insert(X, 0, 1, axis=1)  # prepend the intercept column

        if labels.size == 0:
            raise ValueError("cannot fit on an empty training set")
        if design.shape[0] != labels.shape[0]:
            raise ValueError(
                f"X has {design.shape[0]} samples but y has {labels.shape[0]}"
            )

        self.X = design
        self.y = labels

        # Start from all-zero weights.
        self.theta = np.zeros((self.X.shape[1], 1))

        for _ in range(self.num_iterations):
            h = self.sigmoid(np.dot(self.X, self.theta))

            # Gradient of the mean log-loss with respect to theta.
            gradient = np.dot(self.X.T, (h - self.y)) / self.y.size
            self.theta -= self.learning_rate * gradient

    def predict(self, X):
        """Predict class labels for X.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).

        Returns:
            Integer array of shape (n_samples, 1): 1 where the predicted
            probability is strictly greater than 0.5, otherwise 0.
        """
        X = np.insert(X, 0, 1, axis=1)  # same intercept column as in fit
        y_pred = self.sigmoid(np.dot(X, self.theta))
        return (y_pred > 0.5).astype(int)

In [8]:
# K-fold cross-validation of the hand-written logistic regression.
k = 5
kf = KFold(n_splits=k, shuffle=True, random_state=42)

# One list per metric; each fold appends one score.
accuracy_scores, precision_scores, recall_scores, f1_scores = [], [], [], []

for train_index, val_index in kf.split(X):
    # Slice out this fold's training and validation subsets.
    X_train_fold, y_train_fold = X[train_index], y[train_index]
    X_val_fold, y_val_fold = X[val_index], y[val_index]

    # Train a fresh model on the training part of the fold.
    model = LogisticRegression(learning_rate=0.1, num_iterations=30000)
    model.fit(X_train_fold, y_train_fold)

    # Predict on the held-out part.
    y_val_pred = model.predict(X_val_fold)

    # Evaluate all four metrics, in the same order as the lists above.
    accuracy, precision, recall, f1 = (
        metric(y_val_fold, y_val_pred)
        for metric in (accuracy_score, precision_score, recall_score, f1_score)
    )

    # Record this fold's scores.
    accuracy_scores.append(accuracy)
    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)

In [9]:
# Print each metric averaged over the folds.
for label, scores in (
    ("Accuracy:", accuracy_scores),
    ("Precision:", precision_scores),
    ("Recall:", recall_scores),
    ("F1-score:", f1_scores),
):
    print(label, np.mean(scores))

Accuracy: 0.9722928009679371
Precision: 0.9885714285714287
Recall: 0.9572128851540616
F1-score: 0.9714926378984062
