In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier


In [2]:
# Load the dataset, detach the label column and hold out 20% of the rows for testing.
df = pd.read_csv("耳机.csv")
# Alternative Excel source left disabled: df = pd.read_excel('耳机2.xlsx')
target = df.pop("Helpfulness")  # pop() also removes the label column from df
X = data2 = df.values           # feature matrix: every remaining column
Y = target
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=3
)

# 描述性分析

In [3]:
# Summary statistics for the Relevancy feature.
df.loc[:, 'Relevancy'].describe()

count    1032.000000
mean        5.916667
std         6.362996
min         0.000000
25%         2.000000
50%         4.000000
75%         7.000000
max        61.000000
Name: Relevancy, dtype: float64

# 用于验证Relevancy是否有用

In [4]:
# Ablation feature set: every column of df except Relevancy.
data3 = df.drop(columns='Relevancy')
X_r = data2 = data3.values
# Only the features are split here; the labels come from the main split.
# Reusing the same test_size and random_state on the same number of rows is
# what keeps these rows aligned with Y_train / Y_test.
X_train_r, X_test_r = train_test_split(X_r, test_size=0.2, random_state=3)

In [5]:
# Z-score standardisation. Each scaler is fitted on its training split only and
# then applied to both the training and the test split.
sc = StandardScaler()
sc.fit(X_train)
X_train_std, X_test_std = sc.transform(X_train), sc.transform(X_test)
# Same procedure for the feature set without Relevancy. `sc` is rebound here,
# so after this cell it refers to the Relevancy-free scaler.
sc = StandardScaler()
sc.fit(X_train_r)
X_train_std_r, X_test_std_r = sc.transform(X_train_r), sc.transform(X_test_r)

# LR

In [6]:
# Logistic-regression baseline on the standardised full feature set.
lr = LogisticRegression().fit(X_train_std, Y_train)  # fit() returns the estimator
lr_pres = lr.predict(X_test_std)                     # hard labels for the test split
# Report accuracy, precision, recall and F1 on the held-out split.
for label, metric in (
    ('准确率：', accuracy_score),
    ('精确率：', precision_score),
    ('召回率：', recall_score),
    ('F1：', f1_score),
):
    print(label, metric(Y_test, lr_pres))

准确率： 0.893719806763285
精确率： 0.8939393939393939
召回率： 0.7972972972972973
F1： 0.8428571428571429


## 改变阈值

In [7]:
# Map the LR decision scores through the logistic sigmoid and keep the
# resulting probabilities as a one-column DataFrame (column 0).
ss = pd.DataFrame(1 / (1 + np.exp(-lr.decision_function(X_test_std))))

In [8]:
# Sweep nine evenly spaced thresholds in [0.4, 0.5] (a grid, not a bisection)
# and print the accuracy obtained when a sample is labelled 1 whenever its
# probability exceeds the threshold.
# NOTE(review): accuracy is measured against Y_test, so the threshold is being
# tuned on the test split and the reported gain is optimistic.
for threshold in np.linspace(0.4, 0.5, num=9):
    mm = ss[0].gt(threshold).astype('int64').to_numpy()
    print('阈值为%s时准确率为：%s'%(threshold, accuracy_score(Y_test, mm)))
# Author's conclusion: a threshold of about 0.46 gives a good result. In the
# recorded output the best grid point is 0.45, which the following cell uses.

阈值为0.4时准确率为：0.9033816425120773
阈值为0.41250000000000003时准确率为：0.9033816425120773
阈值为0.42500000000000004时准确率为：0.9082125603864735
阈值为0.4375时准确率为：0.9082125603864735
阈值为0.45时准确率为：0.9130434782608695
阈值为0.4625时准确率为：0.8985507246376812
阈值为0.475时准确率为：0.8985507246376812
阈值为0.4875时准确率为：0.9033816425120773
阈值为0.5时准确率为：0.893719806763285


In [9]:
# Re-label the test split with the chosen 0.45 probability threshold and
# report the four metrics for those labels.
mm = ss[0].gt(0.45).astype('int64').to_numpy()
print('准确率为：%s'%accuracy_score(Y_test, mm))
for label, metric in (
    ('精确率：', precision_score),
    ('召回率：', recall_score),
    ('F1：', f1_score),
):
    print(label, metric(Y_test, mm))

准确率为：0.9130434782608695
精确率： 0.8888888888888888
召回率： 0.8648648648648649
F1： 0.8767123287671232


# 对比

In [10]:
# Comparison run: the same LR pipeline trained without the Relevancy feature.
# `lr` and `lr_pres` are rebound to this ablated model.
lr = LogisticRegression().fit(X_train_std_r, Y_train)
lr_pres = lr.predict(X_test_std_r)
for label, metric in (
    ('准确率：', accuracy_score),
    ('精确率：', precision_score),
    ('召回率：', recall_score),
    ('F1：', f1_score),
):
    print(label, metric(Y_test, lr_pres))

准确率： 0.8743961352657005
精确率： 0.8529411764705882
召回率： 0.7837837837837838
F1： 0.8169014084507041


In [11]:
# 10-fold cross-validated accuracy of LR on the full, unstandardised X;
# the cell displays the mean over the folds.
lr = LogisticRegression(max_iter=3000)
scores = cross_val_score(lr, X, Y, scoring='accuracy', cv=10)
np.mean(scores)

0.8603808812546678

# AdaBoost-DT

In [12]:
# Depth-2 decision tree, set up as a candidate weak learner for boosting.
# NOTE(review): `clf_tree` is not referenced by any other visible cell.
clf_tree = DecisionTreeClassifier(random_state=1, max_depth=2)

def my_adaboost_clf(Y_train, X_train, Y_test, X_test, M=20, weak_clf=None):
    """Train a hand-rolled discrete AdaBoost ensemble and predict the test set.

    Every round refits ``weak_clf`` on the training data with the current
    sample weights, raises the weight of the samples it misclassified and adds
    its +1/-1 vote, scaled by the round's alpha, to a running score. The sign
    of the accumulated score gives the final 0/1 prediction. Per-round training
    accuracy and the final train/test accuracy are printed.

    Parameters
    ----------
    Y_train, Y_test : array-like
        Labels of the two splits. Predictions equal to 1 vote +1 and every
        other value votes -1; the returned labels are 0/1.
    X_train, X_test : array-like
        Feature matrices of the two splits.
    M : int, default 20
        Number of boosting rounds.
    weak_clf : estimator or None
        Object exposing ``fit(X, y, sample_weight=...)`` and ``predict(X)``;
        it is refitted in place every round. ``None`` builds a fresh
        ``DecisionTreeClassifier(max_depth=2)`` for this call.

    Returns
    -------
    numpy.ndarray
        0/1 predictions for ``X_test``.
    """
    if weak_clf is None:
        # Created per call: a default built in the signature would be one
        # estimator instance shared (and left fitted) across every call.
        weak_clf = DecisionTreeClassifier(max_depth=2)

    # Plain arrays make every comparison below positional, whatever index a
    # pandas label Series carries.
    y_train = np.asarray(Y_train)
    y_test = np.asarray(Y_test)
    n_train, n_test = len(X_train), len(X_test)

    # Start from uniform sample weights and zero ensemble scores.
    w = np.ones(n_train) / n_train
    pred_train = np.zeros(n_train)
    pred_test = np.zeros(n_test)
    eps = np.finfo(float).eps

    for i in range(M):
        # Fit the weak learner under the current weighting.
        weak_clf.fit(X_train, y_train, sample_weight=w)
        pred_train_i = np.asarray(weak_clf.predict(X_train))
        pred_test_i = np.asarray(weak_clf.predict(X_test))

        # Boolean mask of misclassified training samples.
        missed = pred_train_i != y_train
        print("weak_clf_%02d train acc: %.4f"
              % (i + 1, 1 - missed.sum() / n_train))

        # Weighted error, kept strictly inside (0, 1). An error of exactly 0
        # would make alpha infinite and the renormalised weights NaN, which
        # corrupts every later round and the final vote.
        err_m = np.clip(np.dot(w, missed), eps, 1 - eps)
        alpha_m = 0.5 * np.log((1 - err_m) / err_m)

        # Raise the weight of misses, lower the weight of hits, renormalise.
        w = w * np.exp(np.where(missed, alpha_m, -alpha_m))
        w = w / w.sum()

        # Accumulate this round's alpha-weighted +1/-1 votes.
        pred_train = pred_train + alpha_m * np.where(pred_train_i == 1, 1, -1)
        pred_test = pred_test + alpha_m * np.where(pred_test_i == 1, 1, -1)

    # A positive score means class 1, anything else class 0.
    pred_train = (pred_train > 0) * 1
    pred_test = (pred_test > 0) * 1

    print("My AdaBoost clf train accuracy: %.4f" % ((pred_train == y_train).sum() / n_train))
    print("My AdaBoost clf test accuracy: %.4f" % ((pred_test == y_test).sum() / n_test))

    return pred_test
# Boost on the standardised full feature set, then score the test predictions.
ab = my_adaboost_clf(Y_train=Y_train, X_train=X_train_std, Y_test=Y_test, X_test=X_test_std)
for label, metric in (
    ('准确率：', accuracy_score),
    ('精确率：', precision_score),
    ('召回率：', recall_score),
    ('F1：', f1_score),
):
    print(label, metric(Y_test, ab))

weak_clf_01 train acc: 0.8788
weak_clf_02 train acc: 0.8024
weak_clf_03 train acc: 0.5200
weak_clf_04 train acc: 0.8024
weak_clf_05 train acc: 0.6642
weak_clf_06 train acc: 0.7333
weak_clf_07 train acc: 0.6642
weak_clf_08 train acc: 0.7152
weak_clf_09 train acc: 0.6642
weak_clf_10 train acc: 0.7152
weak_clf_11 train acc: 0.6473
weak_clf_12 train acc: 0.8776
weak_clf_13 train acc: 0.3964
weak_clf_14 train acc: 0.2897
weak_clf_15 train acc: 0.7309
weak_clf_16 train acc: 0.6642
weak_clf_17 train acc: 0.7164
weak_clf_18 train acc: 0.6642
weak_clf_19 train acc: 0.7333
weak_clf_20 train acc: 0.6291
My AdaBoost clf train accuracy: 0.8945
My AdaBoost clf test accuracy: 0.9034
准确率： 0.9033816425120773
精确率： 0.8648648648648649
召回率： 0.8648648648648649
F1： 0.8648648648648649


# 对比

In [13]:
# Comparison run: boosting without the Relevancy feature. `ab` is rebound to
# the predictions of this ablated run.
ab = my_adaboost_clf(Y_train=Y_train, X_train=X_train_std_r, Y_test=Y_test, X_test=X_test_std_r)
for label, metric in (
    ('准确率：', accuracy_score),
    ('精确率：', precision_score),
    ('召回率：', recall_score),
    ('F1：', f1_score),
):
    print(label, metric(Y_test, ab))

weak_clf_01 train acc: 0.8788
weak_clf_02 train acc: 0.8024
weak_clf_03 train acc: 0.4036
weak_clf_04 train acc: 0.8012
weak_clf_05 train acc: 0.6642
weak_clf_06 train acc: 0.7152
weak_clf_07 train acc: 0.6715
weak_clf_08 train acc: 0.7152
weak_clf_09 train acc: 0.6473
weak_clf_10 train acc: 0.6642
weak_clf_11 train acc: 0.7333
weak_clf_12 train acc: 0.6376
weak_clf_13 train acc: 0.7333
weak_clf_14 train acc: 0.5564
weak_clf_15 train acc: 0.6630
weak_clf_16 train acc: 0.7491
weak_clf_17 train acc: 0.7224
weak_clf_18 train acc: 0.2400
weak_clf_19 train acc: 0.6121
weak_clf_20 train acc: 0.5358
My AdaBoost clf train accuracy: 0.8861
My AdaBoost clf test accuracy: 0.8744
准确率： 0.8743961352657005
精确率： 0.8428571428571429
召回率： 0.7972972972972973
F1： 0.8194444444444444


# SVM(支持向量机) 

## Linear model

In [14]:
# Linear-kernel SVM (C=2) on the standardised full feature set.
svm_l = svm.SVC(C=2, kernel='linear').fit(X_train_std, Y_train)
y_test_pred_l = svm_l.predict(X_test_std)
for label, metric in (
    ('准确率：', accuracy_score),
    ('精确率：', precision_score),
    ('召回率：', recall_score),
    ('F1：', f1_score),
):
    print(label, metric(Y_test, y_test_pred_l))

准确率： 0.8888888888888888
精确率： 0.8695652173913043
召回率： 0.8108108108108109
F1： 0.8391608391608392


In [15]:
# Comparison run: refit the same SVC object on the features without Relevancy
# and score its test predictions.
y_test_pred_l = svm_l.fit(X_train_std_r, Y_train).predict(X_test_std_r)
for label, metric in (
    ('准确率：', accuracy_score),
    ('精确率：', precision_score),
    ('召回率：', recall_score),
    ('F1：', f1_score),
):
    print(label, metric(Y_test, y_test_pred_l))

准确率： 0.855072463768116
精确率： 0.84375
召回率： 0.7297297297297297
F1： 0.7826086956521738


# Random Forest(随机森林)

In [16]:
# Random forest: 25 trees, at least 15 samples per leaf, fixed seed.
RF_class = RandomForestClassifier(
    n_estimators=25,
    min_samples_leaf=15,
    random_state=1234,
)
# Train on the standardised full feature set and predict the held-out split.
RFclass_pred = RF_class.fit(X_train_std, Y_train).predict(X_test_std)
for label, metric in (
    ('准确率：', accuracy_score),
    ('精确率：', precision_score),
    ('召回率：', recall_score),
    ('F1：', f1_score),
):
    print(label, metric(Y_test, RFclass_pred))

准确率： 0.8985507246376812
精确率： 0.8441558441558441
召回率： 0.8783783783783784
F1： 0.8609271523178808


In [17]:
# Comparison run: refit the same forest on the features without Relevancy and
# score its predictions on the matching test split.
RFclass_pred = RF_class.fit(X_train_std_r, Y_train).predict(X_test_std_r)
for label, metric in (
    ('准确率：', accuracy_score),
    ('精确率：', precision_score),
    ('召回率：', recall_score),
    ('F1：', f1_score),
):
    print(label, metric(Y_test, RFclass_pred))

准确率： 0.8695652173913043
精确率： 0.8309859154929577
召回率： 0.7972972972972973
F1： 0.8137931034482759


In [18]:
# Mean 10-fold cross-validation score of a 25-tree forest on the full,
# unstandardised X. No `scoring` is passed, so the estimator's own score
# method is used.
rfc = RandomForestClassifier(random_state=15, n_estimators=25)
score_pre = np.mean(cross_val_score(rfc, X, Y, cv=10))
score_pre

0.8575149365197909

# 朴素贝叶斯

In [19]:
from sklearn.naive_bayes import BernoulliNB
# Bernoulli naive Bayes on the raw (unstandardised) features; `binarize=5` is
# the cut-off used to turn each feature into a boolean.
nb = BernoulliNB(fit_prior=True, binarize=5, alpha=0.1).fit(X_train, Y_train)
nb_pred = nb.predict(X_test)
for label, metric in (
    ('准确率：', accuracy_score),
    ('精确率：', precision_score),
    ('召回率：', recall_score),
    ('F1：', f1_score),
):
    print(label, metric(Y_test, nb_pred))

准确率： 0.893719806763285
精确率： 0.8714285714285714
召回率： 0.8243243243243243
F1： 0.8472222222222222


In [24]:
from sklearn.naive_bayes import GaussianNB
# Gaussian naive Bayes on the standardised full feature set; `nb` is rebound
# from the Bernoulli model to this one.
nb = GaussianNB().fit(X_train_std, Y_train)
nb_pred = nb.predict(X_test_std)
for label, metric in (
    ('准确率：', accuracy_score),
    ('精确率：', precision_score),
    ('召回率：', recall_score),
    ('F1：', f1_score),
):
    print(label, metric(Y_test, nb_pred))

准确率： 0.8454106280193237
精确率： 0.9038461538461539
召回率： 0.6351351351351351
F1： 0.746031746031746


In [41]:
# Comparison run: Gaussian naive Bayes without the Relevancy feature.
# The full-feature model was trained on standardised inputs, so this run uses
# the standardised Relevancy-free split as well; otherwise the two results
# would differ in preprocessing and not only in the removed column.
nb.fit(X_train_std_r, Y_train)
nb_pred = nb.predict(X_test_std_r)
print('准确率：',accuracy_score(Y_test, nb_pred))
print('精确率：',precision_score(Y_test, nb_pred))
print('召回率：',recall_score(Y_test, nb_pred))
print('F1：',f1_score(Y_test,nb_pred))

准确率： 0.8599033816425121
精确率： 0.9411764705882353
召回率： 0.6486486486486487
F1： 0.7680000000000001


# DNN

In [47]:
# Sweep the width of a single-hidden-layer MLP and print the test accuracy for
# each width. The upper bound of range() is exclusive, so widths 20..49 are
# tried, matching the recorded output; range(20, 20) is empty and would run no
# iteration at all.
# Activation choices: relu, logistic, tanh.
# Solver choices: lbfgs, sgd, adam (adam suits larger data sets, lbfgs smaller ones).
for unit in range(20, 50):
    # Build the model for this width.
    ann_model = MLPClassifier(hidden_layer_sizes=[unit], activation='logistic', solver='adam',max_iter=1000, random_state=0)
    # Train on the standardised full feature set.
    ann_model.fit(X_train_std, Y_train)
    print('神经元个数={}，准确率：{:.3f}'.format(unit, ann_model.score(X_test_std, Y_test)))

神经元个数=20，准确率：0.899
神经元个数=21，准确率：0.899
神经元个数=22，准确率：0.899
神经元个数=23，准确率：0.899
神经元个数=24，准确率：0.894
神经元个数=25，准确率：0.899
神经元个数=26，准确率：0.894
神经元个数=27，准确率：0.894
神经元个数=28，准确率：0.899
神经元个数=29，准确率：0.894
神经元个数=30，准确率：0.894
神经元个数=31，准确率：0.889
神经元个数=32，准确率：0.894
神经元个数=33，准确率：0.894
神经元个数=34，准确率：0.889
神经元个数=35，准确率：0.899
神经元个数=36，准确率：0.894
神经元个数=37，准确率：0.889
神经元个数=38，准确率：0.894
神经元个数=39，准确率：0.894
神经元个数=40，准确率：0.894
神经元个数=41，准确率：0.889
神经元个数=42，准确率：0.894
神经元个数=43，准确率：0.894
神经元个数=44，准确率：0.884
神经元个数=45，准确率：0.889
神经元个数=46，准确率：0.894
神经元个数=47，准确率：0.889
神经元个数=48，准确率：0.889
神经元个数=49，准确率：0.889


In [49]:
# Final MLP with a single hidden layer of `aaaa` units.
# NOTE(review): the width sweep trained on X_train_std, while this cell trains
# and predicts on the raw X_train / X_test - confirm that is intended.
aaaa = 31
ann_model = MLPClassifier(
    hidden_layer_sizes=[aaaa],
    activation='logistic',
    solver='adam',
    max_iter=1000,
    random_state=0,
)
# Train, then predict the held-out split.
ann = ann_model.fit(X_train, Y_train).predict(X_test)
for label, metric in (
    ('准确率：', accuracy_score),
    ('精确率：', precision_score),
    ('召回率：', recall_score),
    ('F1：', f1_score),
):
    print(label, metric(Y_test, ann))

准确率： 0.9033816425120773
精确率： 0.8461538461538461
召回率： 0.8918918918918919
F1： 0.868421052631579


In [50]:
# Comparison run: refit the same MLP object on the raw features without
# Relevancy. `ann` is rebound to the predictions of this ablated run.
ann = ann_model.fit(X_train_r, Y_train).predict(X_test_r)
for label, metric in (
    ('准确率：', accuracy_score),
    ('精确率：', precision_score),
    ('召回率：', recall_score),
    ('F1：', f1_score),
):
    print(label, metric(Y_test, ann))

准确率： 0.8792270531400966
精确率： 0.855072463768116
召回率： 0.7972972972972973
F1： 0.8251748251748252


# 奇怪的集成学习

## LR+AdaBoost+ANN

In [18]:
# Collect the three full-feature test-set predictions that the majority vote
# in the next cell adds up.
# LR vote: the labels produced with the 0.45 probability threshold. This
# rebinds `lr` from the fitted estimator to a 0/1 array.
lr = mm
# The comparison cells left `ab` and `ann` holding predictions made without
# the Relevancy feature, so a top-to-bottom run would vote with the ablated
# models. Regenerate both from the full feature set; the AdaBoost call is the
# one whose log this cell's recorded output shows.
ab = my_adaboost_clf(Y_train, X_train_std, Y_test, X_test_std)
# Same data and fixed random_state as the final MLP cell.
ann_model.fit(X_train, Y_train)
ann = ann_model.predict(X_test)


weak_clf_01 train acc: 0.8788
weak_clf_02 train acc: 0.8024
weak_clf_03 train acc: 0.5200
weak_clf_04 train acc: 0.8024
weak_clf_05 train acc: 0.6642
weak_clf_06 train acc: 0.7333
weak_clf_07 train acc: 0.6642
weak_clf_08 train acc: 0.7152
weak_clf_09 train acc: 0.6642
weak_clf_10 train acc: 0.7152
weak_clf_11 train acc: 0.6473
weak_clf_12 train acc: 0.8776
weak_clf_13 train acc: 0.3964
weak_clf_14 train acc: 0.2897
weak_clf_15 train acc: 0.7309
weak_clf_16 train acc: 0.6642
weak_clf_17 train acc: 0.7164
weak_clf_18 train acc: 0.6642
weak_clf_19 train acc: 0.7333
weak_clf_20 train acc: 0.6291
My AdaBoost clf train accuracy: 0.8945
My AdaBoost clf test accuracy: 0.9034


In [19]:
# Majority vote over the three prediction arrays: the summed votes must exceed
# 1.5 for a sample to be labelled 1 (i.e. at least two of three, assuming each
# input is a 0/1 array).
el = pd.DataFrame(lr + ann + ab)[0].gt(1.5).astype('int64').to_numpy()
print('准确率为：%s'%accuracy_score(Y_test, el))
for label, metric in (
    ('精确率：', precision_score),
    ('召回率：', recall_score),
    ('F1：', f1_score),
):
    print(label, metric(Y_test, el))

准确率为：0.9178743961352657
精确率： 0.88
召回率： 0.8918918918918919
F1： 0.8859060402684563
