In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

In [2]:
base_path = Path("../data")
base_path

WindowsPath('../data')

In [3]:
data_path = base_path / "中风人群预测.csv"
data_path

WindowsPath('../data/中风人群预测.csv')

- id: 唯一标识符
- gender: "Male"（男性）、"Female"（女性）或 "Other"（其他）
- age: 患者年龄
- hypertension: 如果患者没有高血压则为0，如果患者有高血压则为1
- heart_disease: 如果患者没有心脏疾病则为0，如果患者有心脏疾病则为1
- ever_married: "No"（否）或 "Yes"（是）
- work_type: "children"（儿童）、"Govt_jov"（政府职位）、"Never_worked"（从未工作）、"Private"（私人公司）或 "Self-employed"（自雇）
- Residence_type: "Rural"（农村）或 "Urban"（城市）
- avg_glucose_level: 血液中的平均血糖水平
- bmi: 身体质量指数
- smoking_status: "formerly smoked"（以前吸烟）、"never smoked"（从未吸烟）、"smokes"（吸烟）或 "Unknown"（未知）*
- stroke: 如果患者中风则为1，如果未中风则为0

In [4]:
data = pd.read_csv(data_path).set_index("id")
data.head()

Unnamed: 0_level_0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [5]:
data.shape

(5110, 11)

# 预处理

## gender

In [6]:
np.unique(data["gender"])

array(['Female', 'Male', 'Other'], dtype=object)

In [7]:
data.loc[data["gender"] == "Female", "gender"] = 0

In [8]:
data.loc[data["gender"] == "Male", "gender"] = 1

In [9]:
data.loc[data["gender"] == "Other", "gender"] = 2

In [10]:
data["gender"] = data["gender"].astype(np.int32)

In [11]:
np.unique(data["gender"])

array([0, 1, 2])

In [12]:
data.head()

Unnamed: 0_level_0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
9046,1,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
51676,0,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
31112,1,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
60182,0,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
1665,0,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


## ever_married

In [13]:
np.unique(data["ever_married"])

array(['No', 'Yes'], dtype=object)

In [14]:
data.loc[data["ever_married"] == "No", "ever_married"] = 0

In [15]:
data.loc[data["ever_married"] == "Yes", "ever_married"] = 1

In [16]:
data["ever_married"] = data["ever_married"].astype(np.int32)

In [17]:
np.unique(data["ever_married"])

array([0, 1])

In [18]:
data.head()

Unnamed: 0_level_0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
9046,1,67.0,0,1,1,Private,Urban,228.69,36.6,formerly smoked,1
51676,0,61.0,0,0,1,Self-employed,Rural,202.21,,never smoked,1
31112,1,80.0,0,1,1,Private,Rural,105.92,32.5,never smoked,1
60182,0,49.0,0,0,1,Private,Urban,171.23,34.4,smokes,1
1665,0,79.0,1,0,1,Self-employed,Rural,174.12,24.0,never smoked,1


## work_type

In [19]:
np.unique(data["work_type"])

array(['Govt_job', 'Never_worked', 'Private', 'Self-employed', 'children'],
      dtype=object)

In [20]:
data.loc[data["work_type"] == "Govt_job", "work_type"] = 0
data.loc[data["work_type"] == "Never_worked", "work_type"] = 1
data.loc[data["work_type"] == "Private", "work_type"] = 2
data.loc[data["work_type"] == "Self-employed", "work_type"] = 3
data.loc[data["work_type"] == "children", "work_type"] = 4

In [21]:
data["work_type"] = data["work_type"].astype(np.int32)

In [22]:
np.unique(data["work_type"])

array([0, 1, 2, 3, 4])

In [23]:
data.head()

Unnamed: 0_level_0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
9046,1,67.0,0,1,1,2,Urban,228.69,36.6,formerly smoked,1
51676,0,61.0,0,0,1,3,Rural,202.21,,never smoked,1
31112,1,80.0,0,1,1,2,Rural,105.92,32.5,never smoked,1
60182,0,49.0,0,0,1,2,Urban,171.23,34.4,smokes,1
1665,0,79.0,1,0,1,3,Rural,174.12,24.0,never smoked,1


## Residence_type

In [24]:
np.unique(data["Residence_type"])

array(['Rural', 'Urban'], dtype=object)

In [25]:
data.loc[data["Residence_type"] == "Rural", "Residence_type"] = 0
data.loc[data["Residence_type"] == "Urban", "Residence_type"] = 1

In [26]:
data["Residence_type"] = data["Residence_type"].astype(np.int32)

In [27]:
np.unique(data["Residence_type"])

array([0, 1])

In [28]:
data.head()

Unnamed: 0_level_0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
9046,1,67.0,0,1,1,2,1,228.69,36.6,formerly smoked,1
51676,0,61.0,0,0,1,3,0,202.21,,never smoked,1
31112,1,80.0,0,1,1,2,0,105.92,32.5,never smoked,1
60182,0,49.0,0,0,1,2,1,171.23,34.4,smokes,1
1665,0,79.0,1,0,1,3,0,174.12,24.0,never smoked,1


## avg_glucose_level

In [29]:
np.any(np.isnan(data["avg_glucose_level"]))

False

In [30]:
data["avg_glucose_level"].describe()

count    5110.000000
mean      106.147677
std        45.283560
min        55.120000
25%        77.245000
50%        91.885000
75%       114.090000
max       271.740000
Name: avg_glucose_level, dtype: float64

## bmi

In [31]:
np.any(np.isnan(data["bmi"]))

True

In [32]:
data["bmi"].fillna(data["bmi"].mean(), inplace=True)

In [33]:
np.any(np.isnan(data["bmi"]))

False

In [34]:
data["bmi"].describe()

count    5110.000000
mean       28.893237
std         7.698018
min        10.300000
25%        23.800000
50%        28.400000
75%        32.800000
max        97.600000
Name: bmi, dtype: float64

In [35]:
data.head()

Unnamed: 0_level_0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
9046,1,67.0,0,1,1,2,1,228.69,36.6,formerly smoked,1
51676,0,61.0,0,0,1,3,0,202.21,28.893237,never smoked,1
31112,1,80.0,0,1,1,2,0,105.92,32.5,never smoked,1
60182,0,49.0,0,0,1,2,1,171.23,34.4,smokes,1
1665,0,79.0,1,0,1,3,0,174.12,24.0,never smoked,1


## Residence_type

In [36]:
np.unique(data["smoking_status"])

array(['Unknown', 'formerly smoked', 'never smoked', 'smokes'],
      dtype=object)

In [37]:
data.loc[data["smoking_status"] == "Unknown", "smoking_status"] = 0
data.loc[data["smoking_status"] == "formerly smoked", "smoking_status"] = 1
data.loc[data["smoking_status"] == "never smoked", "smoking_status"] = 2
data.loc[data["smoking_status"] == "smokes", "smoking_status"] = 3

In [38]:
data["smoking_status"] = data["smoking_status"].astype(np.int32)

In [39]:
np.unique(data["smoking_status"])

array([0, 1, 2, 3])

In [40]:
data.head()

Unnamed: 0_level_0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
9046,1,67.0,0,1,1,2,1,228.69,36.6,1,1
51676,0,61.0,0,0,1,3,0,202.21,28.893237,2,1
31112,1,80.0,0,1,1,2,0,105.92,32.5,2,1
60182,0,49.0,0,0,1,2,1,171.23,34.4,3,1
1665,0,79.0,1,0,1,3,0,174.12,24.0,2,1


## stroke

In [41]:
np.unique(data["stroke"])

array([0, 1], dtype=int64)

# 数据预处理

## 划分data和label

In [42]:
label = data["stroke"]
label.head()

id
9046     1
51676    1
31112    1
60182    1
1665     1
Name: stroke, dtype: int64

In [43]:
data.drop("stroke", axis=1, inplace=True)
data.head()

Unnamed: 0_level_0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
9046,1,67.0,0,1,1,2,1,228.69,36.6,1
51676,0,61.0,0,0,1,3,0,202.21,28.893237,2
31112,1,80.0,0,1,1,2,0,105.92,32.5,2
60182,0,49.0,0,0,1,2,1,171.23,34.4,3
1665,0,79.0,1,0,1,3,0,174.12,24.0,2


## 标准化数据

In [44]:
from sklearn.preprocessing import StandardScaler

In [45]:
ss = StandardScaler()

In [46]:
# data[["avg_glucose_level", "bmi"]] = ss.fit_transform(data[["avg_glucose_level", "bmi"]])

In [47]:
data.head()

Unnamed: 0_level_0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
9046,1,67.0,0,1,1,2,1,228.69,36.6,1
51676,0,61.0,0,0,1,3,0,202.21,28.893237,2
31112,1,80.0,0,1,1,2,0,105.92,32.5,2
60182,0,49.0,0,0,1,2,1,171.23,34.4,3
1665,0,79.0,1,0,1,3,0,174.12,24.0,2


## train_test_split

In [48]:
from sklearn.model_selection import train_test_split

In [49]:
x_train, x_val, y_train, y_val = train_test_split(data, label, test_size=0.5, random_state=0)
x_train.shape, x_val.shape, y_train.shape, y_val.shape

((2555, 10), (2555, 10), (2555,), (2555,))

# KNeighborsClassifier

In [50]:
from sklearn.neighbors import KNeighborsClassifier

In [51]:
knnc = KNeighborsClassifier()
knnc

In [52]:
knnc.fit(x_train, y_train)

In [53]:
knnc.score(x_val, y_val)

0.9467710371819961

In [54]:
knnc.predict(x_val)

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

# SVC

In [55]:
from sklearn.svm import SVC

In [56]:
svc = SVC()
svc

In [57]:
svc.fit(x_train, y_train)

In [58]:
svc.score(x_val, y_val)

0.9522504892367906

In [59]:
svc.predict(x_val)

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

# DecisionTreeClassifier

In [60]:
from sklearn.tree import DecisionTreeClassifier

In [61]:
dtc = DecisionTreeClassifier()
dtc

In [62]:
dtc.fit(x_train, y_train)

In [63]:
dtc.score(x_val, y_val)

0.9178082191780822

In [64]:
dtc.predict(x_val)

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

# RandomForestClassifier

In [65]:
from sklearn.ensemble import RandomForestClassifier

In [66]:
rfc = RandomForestClassifier()
rfc

In [67]:
rfc.fit(x_train, y_train)

In [68]:
rfc.score(x_val, y_val)

0.9530332681017613

In [69]:
rfc.predict(x_val)

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

# XGBClassifier

In [70]:
from xgboost import XGBClassifier

In [71]:
xbgc = XGBClassifier()
xbgc

In [72]:
xbgc.fit(x_train, y_train)

In [73]:
xbgc.score(x_val, y_val)

0.9467710371819961

In [74]:
xbgc.predict(x_val)

array([0, 0, 0, ..., 0, 0, 0])

# LGBMClassifier

In [75]:
from lightgbm import LGBMClassifier

In [76]:
lgbmc = LGBMClassifier(force_row_wise=True)
lgbmc

In [77]:
lgbmc.fit(x_train, y_train)

[LightGBM] [Info] Number of positive: 127, number of negative: 2428
[LightGBM] [Info] Total Bins 624
[LightGBM] [Info] Number of data points in the train set: 2555, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.049706 -> initscore=-2.950636
[LightGBM] [Info] Start training from score -2.950636


In [78]:
lgbmc.score(x_val, y_val)

0.947945205479452

In [79]:
lgbmc.predict(x_val)

array([1, 0, 0, ..., 0, 0, 0], dtype=int64)

# CatBoostClassifier

In [80]:
from catboost import CatBoostClassifier

In [81]:
catbc = CatBoostClassifier()
catbc

<catboost.core.CatBoostClassifier at 0x200c0d57ad0>

In [82]:
catbc.fit(x_train, y_train)

Learning rate set to 0.015378
0:	learn: 0.6664733	total: 140ms	remaining: 2m 20s
1:	learn: 0.6426365	total: 143ms	remaining: 1m 11s
2:	learn: 0.6214903	total: 145ms	remaining: 48.2s
3:	learn: 0.5971651	total: 147ms	remaining: 36.6s
4:	learn: 0.5792250	total: 149ms	remaining: 29.7s
5:	learn: 0.5595910	total: 151ms	remaining: 25.1s
6:	learn: 0.5438253	total: 153ms	remaining: 21.8s
7:	learn: 0.5295094	total: 155ms	remaining: 19.2s
8:	learn: 0.5140220	total: 157ms	remaining: 17.3s
9:	learn: 0.4998144	total: 159ms	remaining: 15.7s
10:	learn: 0.4842559	total: 161ms	remaining: 14.5s
11:	learn: 0.4720225	total: 163ms	remaining: 13.4s
12:	learn: 0.4591082	total: 165ms	remaining: 12.5s
13:	learn: 0.4447254	total: 167ms	remaining: 11.8s
14:	learn: 0.4309226	total: 169ms	remaining: 11.1s
15:	learn: 0.4191006	total: 171ms	remaining: 10.5s
16:	learn: 0.4074384	total: 173ms	remaining: 10s
17:	learn: 0.3965501	total: 176ms	remaining: 9.57s
18:	learn: 0.3867573	total: 177ms	remaining: 9.16s
19:	learn: 

676:	learn: 0.0745058	total: 1.58s	remaining: 756ms
677:	learn: 0.0744388	total: 1.59s	remaining: 754ms
678:	learn: 0.0743573	total: 1.59s	remaining: 751ms
679:	learn: 0.0742795	total: 1.59s	remaining: 749ms
680:	learn: 0.0742157	total: 1.59s	remaining: 746ms
681:	learn: 0.0741538	total: 1.59s	remaining: 744ms
682:	learn: 0.0741179	total: 1.6s	remaining: 741ms
683:	learn: 0.0740717	total: 1.6s	remaining: 739ms
684:	learn: 0.0740439	total: 1.6s	remaining: 736ms
685:	learn: 0.0739606	total: 1.6s	remaining: 734ms
686:	learn: 0.0739242	total: 1.6s	remaining: 731ms
687:	learn: 0.0738801	total: 1.61s	remaining: 729ms
688:	learn: 0.0738344	total: 1.61s	remaining: 727ms
689:	learn: 0.0737543	total: 1.61s	remaining: 724ms
690:	learn: 0.0736685	total: 1.61s	remaining: 722ms
691:	learn: 0.0735884	total: 1.61s	remaining: 719ms
692:	learn: 0.0735434	total: 1.62s	remaining: 717ms
693:	learn: 0.0734746	total: 1.62s	remaining: 714ms
694:	learn: 0.0734181	total: 1.62s	remaining: 712ms
695:	learn: 0.073

<catboost.core.CatBoostClassifier at 0x200c0d57ad0>

In [83]:
catbc.score(x_val, y_val)

0.9518590998043053

In [84]:
catbc.predict(x_val)

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)