In [1]:
# 1,Logistic Regression:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegressionCV,LinearRegression
from sklearn.linear_model.coordinate_descent import ConvergenceWarning
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings("ignore")  # 拦截异常

# 1.1,加载数据：
names = ['id','Clump Thickness','Uniformity of Cell Size','Uniformity of Cell Shape',
         'Marginal Adhesion','Single Epithelial Cell Size','Bare Nuclei',
        'Bland Chromatin','Normal Nucleoli','Mitoses','Class']

df = pd.read_csv('../DataSets/breast-cancer-wisconsin.data', header=None,names=names)

data = df.replace('?', np.nan).dropna(how = 'any') # 删除缺省的数据

X = data[names[1:10]]
Y = data[names[10]]

X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.1,random_state=0)

ss = StandardScaler()                # 数据归一化
X_train = ss.fit_transform(X_train)  # 训练模型及归一化数据

In [2]:
# 1.2，训练模型：
lr = LogisticRegressionCV(multi_class='ovr',fit_intercept=True, Cs=np.logspace(-2, 2, 20), cv=2, penalty='l2', solver='lbfgs', tol=0.01)
lr.fit(X_train, Y_train)

score = lr.score(X_train, Y_train)
print ('R_square：', score, '\n', lr.predict_proba(X_test))

# 预测：
X_test = ss.transform(X_test) # 使用模型进行归一化操作
Y_predict = lr.predict(X_test)
print(Y_predict)

R_square： 0.9706840390879479 
 [[6.61838068e-06 9.99993382e-01]
 [3.78575185e-05 9.99962142e-01]
 [2.44249065e-15 1.00000000e+00]
 [0.00000000e+00 1.00000000e+00]
 [1.52850624e-03 9.98471494e-01]
 [6.67061684e-05 9.99933294e-01]
 [6.75536843e-07 9.99999324e-01]
 [0.00000000e+00 1.00000000e+00]
 [2.43117004e-05 9.99975688e-01]
 [6.13092842e-04 9.99386907e-01]
 [0.00000000e+00 1.00000000e+00]
 [2.00330728e-06 9.99997997e-01]
 [0.00000000e+00 1.00000000e+00]
 [3.78575185e-05 9.99962142e-01]
 [4.65824155e-08 9.99999953e-01]
 [5.47788703e-10 9.99999999e-01]
 [0.00000000e+00 1.00000000e+00]
 [0.00000000e+00 1.00000000e+00]
 [0.00000000e+00 1.00000000e+00]
 [6.27260778e-07 9.99999373e-01]
 [3.78575185e-05 9.99962142e-01]
 [3.85098865e-06 9.99996149e-01]
 [1.80189197e-12 1.00000000e+00]
 [9.44640398e-05 9.99905536e-01]
 [0.00000000e+00 1.00000000e+00]
 [0.00000000e+00 1.00000000e+00]
 [4.11688915e-06 9.99995883e-01]
 [1.85886872e-05 9.99981411e-01]
 [5.83016713e-06 9.99994170e-01]
 [0.00000000

In [3]:
# 1.3,模型持久化：
from sklearn.externals import joblib

joblib.dump(ss, "../OutPut/StandardScaler.model")              # 将标准化模型保存
joblib.dump(lr, "../OutPut/LogisticRegression.model")          # 将模型保存


re_ss = joblib.load("../OutPut/StandardScaler.model")            # 加载保存的模型
re_lr = joblib.load("../OutPut/LogisticRegression.model")
re_Y_predict = re_lr.predict(X_test)
print(re_Y_predict)

[2 2 4 4 2 2 2 4 2 2 4 2 4 2 2 2 4 4 4 2 2 2 4 2 4 4 2 2 2 4 2 4 4 2 2 2 4
 4 2 4 2 2 2 2 2 2 2 4 2 2 4 2 4 2 2 2 4 4 2 4 2 2 2 2 2 2 2 2 4]


In [4]:
# 2,Softmax回归：多分类
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model.coordinate_descent import ConvergenceWarning
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import label_binarize
from sklearn import metrics

# 2.1，加载数据:
df1 = pd.read_csv('../DataSets/winequality-red.csv', sep=";")
df1['type'] = 1 # 设置数据类型为红葡萄酒

df2 = pd.read_csv('../DataSets/winequality-white.csv', sep=";")
df2['type'] = 2 # 设置数据类型为白葡萄酒

df = pd.concat([df1,df2], axis=0)  # 合并上面两个数据

names = ["fixed acidity","volatile acidity","citric acid",    # 自变量
         "residual sugar","chlorides","free sulfur dioxide",
         "total sulfur dioxide","density","pH","sulphates",
         "alcohol", "type"]
quality = "quality"   # 因变量

new_df = df.replace('?', np.nan)
datas = new_df.dropna(how = 'any')  # 删除缺省值数据

X = datas[names]
Y = datas[quality]

X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.25,random_state=0)

ss = MinMaxScaler()
X_train = ss.fit_transform(X_train) ## 训练模型及归一化数据

In [5]:
# 2.2，训练模型：
lr = LogisticRegressionCV(fit_intercept=True, Cs=np.logspace(-5, 1, 100),multi_class='multinomial',
                                                                          penalty='l2', solver='lbfgs')
lr.fit(X_train, Y_train)

score = lr.score(X_train, Y_train)
print("R值：", score, '\n',"特征稀疏化比率：%.2f%%" % (np.mean(lr.coef_.ravel() == 0) * 100),'\n',"参数：",lr.coef_,'\n',"截距：",lr.intercept_)

# 2.3,预测：
X_test = ss.transform(X_test)    # 使用模型进行归一化操作
Y_predict = lr.predict(X_test)
print(Y_predict)

R值： 0.5500821018062397 
 特征稀疏化比率：0.00% 
 参数： [[ 0.87719251  1.96480308 -0.37832656 -0.42670921  0.80200971  1.27286857
   0.67818804  0.23525776  0.00975145 -0.61077361 -0.66140713 -0.30954531]
 [ 0.6309557   4.97581156 -0.40181374 -2.05731766  1.12202499 -3.42214394
  -1.41038174  1.22978147  0.32419906 -0.82718658 -2.66624581  1.94244679]
 [-1.6305739   1.95498632  0.46851473 -1.84374856  0.64689031 -1.59012206
   2.16409024  1.41354498 -1.3088555  -2.19079446 -4.99713251 -0.73587632]
 [-1.11782192 -2.55986567 -0.3401666   0.19983594 -0.03159367  0.80905268
  -0.27762203  0.47773847 -0.63682238  0.17660177  0.00994473 -0.6871474 ]
 [ 1.10774948 -4.53639686 -0.26402906  2.09866167 -1.88039805  1.59802337
  -0.99916104 -1.99545673  0.78119684  2.59933529  3.37969172 -0.68629186]
 [-0.08978949 -1.6723386   0.67815689  1.93924532 -0.57960831  1.35271288
  -0.13247165 -1.30317882  0.692054    0.97610152  4.57634029  0.09278391]
 [ 0.22228761 -0.12699982  0.23766433  0.09003251 -0.07932498