In [1]:
import pandas as pd
import os
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, classification_report, \
    accuracy_score
import joblib

In [2]:
# 读取数据
dataset_path="heart.csv"
df=pd.read_csv(dataset_path)

# 显示所有列
pd.set_option('display.max_columns', None)

# 显示前5条示例数据
print("示例数据：")
print(df.head())
print("-" * 40)

示例数据：
   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
0   63    1   3       145   233    1        0      150      0      2.3      0   
1   37    1   2       130   250    0        1      187      0      3.5      0   
2   41    0   1       130   204    0        0      172      0      1.4      2   
3   56    1   1       120   236    0        1      178      0      0.8      2   
4   57    0   0       120   354    0        1      163      1      0.6      2   

   ca  thal  target  
0   0     1       1  
1   0     2       1  
2   0     2       1  
3   0     2       1  
4   0     2       1  
----------------------------------------


In [3]:
# 数据基本信息
print("数据基本信息：")
print(df.info())
print("-" * 40)

数据基本信息：
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        303 non-null    int64  
 12  thal      303 non-null    int64  
 13  target    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB
None
----------------------------------------


In [4]:
# 数据分析
print("数据简要分析：")
print(df.describe(include='all'))
print("-" * 40)

数据简要分析：
              age         sex          cp    trestbps        chol         fbs  \
count  303.000000  303.000000  303.000000  303.000000  303.000000  303.000000   
mean    54.366337    0.683168    0.966997  131.623762  246.264026    0.148515   
std      9.082101    0.466011    1.032052   17.538143   51.830751    0.356198   
min     29.000000    0.000000    0.000000   94.000000  126.000000    0.000000   
25%     47.500000    0.000000    0.000000  120.000000  211.000000    0.000000   
50%     55.000000    1.000000    1.000000  130.000000  240.000000    0.000000   
75%     61.000000    1.000000    2.000000  140.000000  274.500000    0.000000   
max     77.000000    1.000000    3.000000  200.000000  564.000000    1.000000   

          restecg     thalach       exang     oldpeak       slope          ca  \
count  303.000000  303.000000  303.000000  303.000000  303.000000  303.000000   
mean     0.528053  149.646865    0.326733    1.039604    1.399340    0.729373   
std      0.525860  

In [5]:
# 标准化
# 对cp，restecg，slope, ca, thal 进行标准化
ss = StandardScaler()
df['cp'] = ss.fit_transform(df[['cp']])
df['restecg'] = ss.fit_transform(df[['restecg']])
df['slope'] = ss.fit_transform(df[['slope']])
df['ca'] = ss.fit_transform(df[['ca']])
df['thal'] = ss.fit_transform(df[['thal']])

In [6]:
# 输出预处理结果
path = 'preprocess_data.csv'
df.to_csv(path, index=False)

In [7]:
df=pd.read_csv(path)

# 显示所有列
pd.set_option('display.max_columns', None)

# 前十条示例数据
print(df.head(10))

   age  sex        cp  trestbps  chol  fbs   restecg  thalach  exang  oldpeak  \
0   63    1  1.973123       145   233    1 -1.005832      150      0      2.3   
1   37    1  1.002577       130   250    0  0.898962      187      0      3.5   
2   41    0  0.032031       130   204    0 -1.005832      172      0      1.4   
3   56    1  0.032031       120   236    0  0.898962      178      0      0.8   
4   57    0 -0.938515       120   354    0  0.898962      163      1      0.6   
5   57    1 -0.938515       140   192    0  0.898962      148      0      0.4   
6   56    0  0.032031       140   294    0 -1.005832      153      0      1.3   
7   44    1  0.032031       120   263    0  0.898962      173      0      0.0   
8   52    1  1.002577       172   199    1  0.898962      162      0      0.5   
9   57    1  1.002577       150   168    0  0.898962      174      0      1.6   

      slope        ca      thal  target  
0 -2.274579 -0.714429 -2.148873       1  
1 -2.274579 -0.714429 -0

In [8]:
import numpy as np

df_num = df.select_dtypes(include=[np.number])
df_cat = df.select_dtypes(exclude=[np.number])

# 独热编码
enc = OneHotEncoder(handle_unknown='ignore')
cat_enc_data = enc.fit_transform(df_cat).toarray()
df_cat_enc = pd.DataFrame(data=cat_enc_data, columns=enc.get_feature_names(df_cat.columns))
print(df_cat_enc)

# 合并数值类型和分类类型
df = pd.merge(df_num, df_cat_enc, left_index=True, right_index=True)
print(df.info())

Empty DataFrame
Columns: []
Index: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, ...]

[303 rows x 0 columns]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    float64
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    float64
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpea

In [9]:
y = df.get('target')
X = df.drop('target', axis=1)


# 特征重要性
rf = RandomForestClassifier()
rf.fit(X, y)

importance = dict(zip(X.columns, rf.feature_importances_))
importance = sorted(importance.items(), key=lambda kv: (kv[1], kv[0]), reverse=True)
print(importance)

# 数据划分
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
train_df = pd.merge(X_train, y_train, left_index=True, right_index=True)
test_df = pd.merge(X_test, y_test, left_index=True, right_index=True)

train_df.to_csv("train_data.csv", index=False)
test_df.to_csv("test_data.csv", index=False)

[('cp', 0.13868640423731615), ('thalach', 0.11913566876826913), ('oldpeak', 0.10983854226354828), ('ca', 0.10805447832932717), ('thal', 0.105781879177623), ('age', 0.08394447141263667), ('trestbps', 0.07872527510850652), ('chol', 0.07274602832597533), ('exang', 0.06887610536988822), ('slope', 0.05138204331563184), ('sex', 0.033662509312909907), ('restecg', 0.019604030201399038), ('fbs', 0.0095625641769688)]


In [10]:
path='train_data.csv'

# 读取数据
df = pd.read_csv(path)

y = df.get('target')

X = df.drop('target', axis=1)

# 逻辑回归
lr = LogisticRegression(max_iter=5000)

# 拟合
model = lr.fit(X, y)

model_path = 'lr.pkl'

joblib.dump(model, model_path, compress=3)

['lr.pkl']

In [11]:
train_data_path = 'train_data.csv'
test_data_path = 'test_data.csv'

# 读取数据
train_df = pd.read_csv(train_data_path)
y_train = train_df.get('target')
X_train = train_df.drop('target', axis=1)
    

test_df = pd.read_csv(test_data_path)
y_test = test_df.get('target')
X_test = test_df.drop('target', axis=1)
   
# id
id_test = test_df[['target']]

model_path = 'lr.pkl'
model = joblib.load(model_path)
y_pred = model.predict(X_test)
id_test['prediction'] = y_pred

# 查看预测结果
print(f"预测结果:{id_test}")

# 评估
# 准确率
accuracy_score_value = accuracy_score(y_test, y_pred)
print(f"准确率:{accuracy_score_value}")

precision_score_value = precision_score(y_test, y_pred)
print(f"精确率:{precision_score_value}")

recall_score_value = recall_score(y_test, y_pred)
print(f"召回率:{recall_score_value}")

f1_score_value = f1_score(y_test, y_pred)
print(f"f1值:{f1_score_value}")

confusion_matrix_value = confusion_matrix(y_test, y_pred)
print(f"混淆矩阵:{confusion_matrix_value}")

report = classification_report(y_test, y_pred)
print(f"分类报告:{report}")

预测结果:    target  prediction
0        1           1
1        0           0
2        1           1
3        0           0
4        1           1
..     ...         ...
86       1           1
87       0           1
88       0           0
89       0           0
90       0           0

[91 rows x 2 columns]
准确率:0.8241758241758241
精确率:0.803921568627451
召回率:0.8723404255319149
f1值:0.8367346938775511
混淆矩阵:[[34 10]
 [ 6 41]]
分类报告:              precision    recall  f1-score   support

           0       0.85      0.77      0.81        44
           1       0.80      0.87      0.84        47

    accuracy                           0.82        91
   macro avg       0.83      0.82      0.82        91
weighted avg       0.83      0.82      0.82        91



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  id_test['prediction'] = y_pred


In [12]:
pd.DataFrame(y_pred).to_csv("Prediction_Result.csv")

array([1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1,
       1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1,
       0, 0, 0], dtype=int64)

In [13]:
import boto3

access_key='9CC493F7E25194E921E7'
secret_key='WzRENEQ0RUJGRkY5NzQyMDhCQjZFMDM0QkY0NTEx'
host='http://scut.depts.bingosoft.net:29997'

s3 = boto3.client('s3',
    aws_access_key_id=access_key,
    aws_secret_access_key=secret_key,
    endpoint_url=host)

s3.upload_file('lr.pkl', 'xiongbin', 'lr.pkl')