In [16]:
import polars as pl
import pandas as pd
from sklearn import linear_model
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

In [2]:
data_path = '/home/gyli/bishe/data/hypertension.csv'
raw_data = pl.read_csv(data_path, dtypes={'年龄' : pl.Float64})

In [3]:
# 年龄有些奇怪的值
raw_data.filter(pl.col('年龄') < 1)

年龄,中性粒细胞数,淋巴细胞数,嗜酸性粒细胞数,嗜碱性粒细胞数,总蛋白,白蛋白,球蛋白,总胆红素,直接胆红素,钾,钠,钙,尿素氮,谷草谷丙,甘油三酯,高密度脂蛋白,低密度脂蛋白,红细胞平均体积,凝血酶原时间,国际标准化比值,活化部分凝血活酶时间,纤维蛋白原,是否高血压
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64
0.0,4.96,2.51,0.06,0.0,62.2,37.9,24.3,8.7,2.7,3.961723,140.880901,2.226504,4.564519,1.5,0.98,1.68,2.9,98.0,12.7,0.98,37.7,4.64,0
0.0,4.885651,2.247269,0.1767,0.014171,75.8,44.9,30.9,10.5,2.9,3.8,142.0,2.49,5.3,1.2,1.18,1.41,3.3,92.087414,12.805008,1.001135,33.431035,3.918851,0
0.001,18.57,4.33,0.2,0.12,50.3,31.7,18.6,96.2,6.2,5.1,141.7,1.96,8.5,6.4,1.779246,1.87046,2.771729,107.2,12.949614,0.990166,34.685492,4.949974,0
0.0,5.02,2.42,0.15,0.0,44.6,18.4,26.2,5.3,0.8,3.8,140.0,2.85,9.8,1.5,1.315927,1.416631,2.86093,96.8,12.720712,0.998678,32.345378,3.394177,0
0.0,5.78,2.33,0.05,0.0,41.7,12.5,29.2,3.1,0.7,3.6,141.0,2.02,5.5,1.9,1.325564,1.461032,2.854548,93.5,12.531642,0.987379,31.730008,3.509742,0
0.005,6.48,2.74,0.25,0.02,54.771741,30.349769,24.42826,7.89242,2.291928,4.4,141.0,1.68,1.0,1.177081,1.888296,1.376351,2.909613,107.2,13.187249,1.017804,34.744938,3.725706,0
0.002,20.01,5.32,0.28,0.08,60.929035,33.388982,27.543561,9.9005,2.65883,3.751221,138.314364,2.17827,1.4,1.302109,1.366557,1.534254,2.844574,104.6,12.2,0.89,41.9,3.53,0
0.002,8.3,2.06,0.55,0.01,60.033584,33.299721,26.736938,7.098926,2.073033,4.0,140.0,2.0,1.0,1.170647,1.90308,1.363987,2.915877,104.7,13.111966,1.014731,34.448263,3.713614,0
0.001,5.14,8.15,0.24,0.03,76.447044,50.979759,25.468512,14.165689,3.214224,4.013859,143.869074,2.449987,4.514982,0.927715,1.028259,1.18794,2.274966,123.8,19.5,1.71,63.7,1.51,0
0.001,7.16,5.25,0.07,0.02,72.001096,44.703034,27.297813,10.757845,2.689839,4.2,139.0,2.44,1.9,1.109148,2.044409,1.245789,2.975762,80.8,12.392265,0.985347,31.612052,3.598008,0


In [4]:
# 筛下奇怪的值
data = raw_data.filter(pl.col('年龄') >= 1)
data.null_count()

年龄,中性粒细胞数,淋巴细胞数,嗜酸性粒细胞数,嗜碱性粒细胞数,总蛋白,白蛋白,球蛋白,总胆红素,直接胆红素,钾,钠,钙,尿素氮,谷草谷丙,甘油三酯,高密度脂蛋白,低密度脂蛋白,红细胞平均体积,凝血酶原时间,国际标准化比值,活化部分凝血活酶时间,纤维蛋白原,是否高血压
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [5]:
# 算下每个特征和是否高血压的相关度
cols = data.columns[:-1]
corr = pl.DataFrame()
for col in cols:
    corr = corr.with_columns(pl.corr(data[col], data['是否高血压']))
corr

年龄,中性粒细胞数,淋巴细胞数,嗜酸性粒细胞数,嗜碱性粒细胞数,总蛋白,白蛋白,球蛋白,总胆红素,直接胆红素,钾,钠,钙,尿素氮,谷草谷丙,甘油三酯,高密度脂蛋白,低密度脂蛋白,红细胞平均体积,凝血酶原时间,国际标准化比值,活化部分凝血活酶时间,纤维蛋白原
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
0.173126,-0.23544,0.155703,0.104577,-0.029597,0.389186,0.435219,0.036935,0.074015,-0.000422,0.144729,0.226332,0.333133,0.050992,-0.205804,0.045469,-0.025464,0.077063,-0.000579,0.067236,0.032862,0.107435,-0.043088


In [6]:
# 选一下相关度值在0.1以上的
rel_cols = []
discard_cols = []
for col in cols:
    if abs(corr[col][0]) >= 0.1:
        rel_cols.append(col)
    else:
        discard_cols.append(col)
print(rel_cols)
print(discard_cols)

['年龄', '中性粒细胞数', '淋巴细胞数', '嗜酸性粒细胞数', '总蛋白', '白蛋白', '钾', '钠', '钙', '谷草谷丙', '活化部分凝血活酶时间']
['嗜碱性粒细胞数', '球蛋白', '总胆红素', '直接胆红素', '尿素氮', '甘油三酯', '高密度脂蛋白', '低密度脂蛋白', '红细胞平均体积', '凝血酶原时间', '国际标准化比值', '纤维蛋白原']


In [12]:
# 获取x,y
raw_x = raw_data.select(rel_cols).to_pandas()
raw_y = raw_data.select('是否高血压').to_pandas()

In [10]:
raw_x

Unnamed: 0,年龄,中性粒细胞数,淋巴细胞数,嗜酸性粒细胞数,总蛋白,白蛋白,钾,钠,钙,谷草谷丙,活化部分凝血活酶时间
0,79.0,9.260000,2.140000,0.180000,67.948927,39.546489,4.100000,137.200000,2.340000,1.20000,33.946848
1,73.0,2.890000,1.330000,0.130000,61.000000,40.100000,3.900000,135.800000,2.340000,1.40000,26.000000
2,65.0,4.680000,2.130000,0.200000,61.500000,39.100000,3.700000,139.400000,2.330000,0.60000,27.100000
3,44.0,4.603406,2.157543,0.172844,73.600000,40.500000,4.000000,146.300000,2.350000,1.00000,32.554017
4,53.0,4.847728,1.971421,0.163157,70.900000,41.200000,4.400000,140.200000,2.400000,1.00000,33.047475
...,...,...,...,...,...,...,...,...,...,...,...
33282,68.0,4.857917,2.117560,0.194136,70.353715,43.506534,4.100000,143.000000,2.360000,1.09598,33.649698
33283,53.0,3.770986,2.197268,0.180861,76.000000,48.300000,4.127902,141.998793,2.419122,0.50000,33.556394
33284,51.0,4.710000,2.970000,0.780000,85.100000,46.200000,4.600000,140.000000,2.300000,0.80000,35.800000
33285,69.0,5.990000,2.310000,0.320000,63.700000,36.100000,4.200000,142.000000,2.050000,0.90000,41.500000


In [11]:
raw_y

Unnamed: 0,是否高血压
0,0
1,0
2,0
3,0
4,0
...,...
33282,0
33283,0
33284,1
33285,0


In [21]:
lm = linear_model.LinearRegression()
x_train, x_test, y_train, y_test = tra
model = lm.fit(raw_x, raw_y)
# cross_val_score(lm, raw_x, raw_y, cv=5)
model.score()

TypeError: score() missing 2 required positional arguments: 'X' and 'y'