## 半监督支持向量机
- 形式化地说$$min_{w,b,\tilde{y},\xi} = \frac{1}{2}||w||^2 + C_{l}\sum_{i=1}^{l}\xi_{i} + C_{u}\sum_{l+1}^{m}\xi_{i} \tag{1.1}\\
s.t. y_{i}(w^Tx_{i}+b)>=1-\xi_{i},i = 1,2,...,l,\\
 \tilde{y}_{i}(w^Tx_{i}+b)>=1-\xi_{i},i = l+1,l+2,...,m \\
 \xi_{i} >=0 , i=1,2,...,m \\ 其中u属于未标记样本$$
- TSVM 采用局部搜索来迭代地寻找式子1.1的值，它先用有标记样本学的一个SVM，然后利用这个SVM对为标记数据进行标记指派，即将SVM预测的结果作为”伪标记“赋予为标记样本，此时$\tilde{y}$成为已知，于是将其代入式1.1即得到一个标准的SVM问题，于是可求解出新的划分超平面和松弛变量，注意到此时未标记样本的未标记很可能是不准确的，因此$C_{u}$要设置为$C_{l}$小的值，使得有标记的样本所起的作用更大，接下来TSVM 找出两个标记指派为异类且很有可能标记错误的未标记样本，交换他们的标记，再重新基于式1.1求解更新后的划分超平面和松弛变量，依次循环。直到$C_{u}=C_{l}$ 

![tsvm](img/tsvm.png)

In [136]:
import numpy as np
from utils import *
import pandas as pd
from sklearn.svm import LinearSVC
from collections import Counter

In [153]:
frame = pd.read_csv('data/mushrooms.csv')
frame['class'].value_counts()
columns = frame.columns
# label encode 对于每一列
from sklearn.preprocessing import LabelEncoder
labelencoder=LabelEncoder()
for col in frame.columns:
    frame[col] = labelencoder.fit_transform(frame[col])
X = frame.values
X_same = X[X[:,10] == 0]
for i in range(len(X_same)):
    if X_same[i,0] == 0:
        X_same[i,0] = -1
Counter(X_same[:,0])

Counter({-1: 1616, 1: 1900})

In [156]:
class TSVM(object):
    def __init__(self):
        self.Cu = 0.001 # 分别对应有label的样本权重和无label的样本权重
        self.Cl = 1
    def fit(self,train_data):
        # 找到一个正例，一个反例
#         np.random.shuffle(train_data)
        pos_one = train_data[train_data[:,0] == 1][0]
        pos_other = train_data[train_data[:,0] == 1][1:]
        neg_one = train_data[train_data[:,0] == -1][0]
        neg_other = train_data[train_data[:,0] == -1][1:]
        train = np.vstack((pos_one,neg_one))
        print('train.shape: ',train.shape)
        self.other = np.vstack((pos_other,neg_other)) # S 还用于对数据进行测试
        print('self.other.shape : ',self.other.shape)
        # 训练一个初始的分类器
        self.clf = LinearSVC(class_weight = 'balanced')
        self.clf.fit(train[:,1:],train[:,0])
        pred_y = self.clf.predict(self.other[:,1:])
        
        X = np.vstack((train,self.other))
        y = np.vstack((train[:,0].reshape(-1,1),pred_y.reshape(-1,1)))[:,0] # 将预测结果放到SVM中进行训练
        
        self.w = np.ones(train_data.shape[0])
        
        self.w[len(train):] = self.Cu
        
        while self.Cu < self.Cl:
            print(X.shape,y.shape,self.w.shape)
            self.clf.fit(X[:,1:],y,sample_weight = self.w)
            while True:
                dist = self.clf.decision_function(X[:,1:]) # 返回的是带符号的距离
                xi = 1 - y * dist # 算出每一个xi
                # 取出预判为正例的id,取出预判为反例的id
                xi_pos_index, xi_neg_index = np.where(y[2:] > 0), np.where( y[2:]<0 )
                xi_pos , xi_neg = xi[xi_pos_index],xi[xi_neg_index]
                xi_pos_max_index = np.argmax(xi_pos)
                xi_neg_max_index = np.argmax(xi_neg)
                xi_pos_max_value = xi_pos[xi_pos_max_index]
                xi_neg_max_value = xi_neg[xi_neg_max_index]
                # 不断地拿最大的两个距离的，并且其中有一个误分类，或者两个都误分类了
                if xi_pos_max_value > 0 and xi_neg_max_value > 0 and (xi_pos_max_value + xi_pos_max_value) > 2:
                    # 交换类别
                    y[xi_pos_max_index],y[xi_neg_max_index] = y[xi_neg_max_index],y[xi_pos_max_index]
                    self.clf.fit(X[:,1:],y,sample_weight = self.w)
                else:
                    break
            self.Cu = min(2 * self.Cu ,self.Cl)
            # 交换权重
            self.w[len(train):] = self.Cu
    def predict(self):
        pred_y = self.clf.predict(self.other[:,1:])
        return 1 - np.mean(pred_y == self.other[:,0])
            

In [157]:
tsvm = TSVM()
tsvm.fit(X_same)
tsvm.predict()

train.shape:  (2, 23)
self.other.shape :  (3514, 23)
(3516, 23) (3516,) (3516,)
(3516, 23) (3516,) (3516,)
(3516, 23) (3516,) (3516,)
(3516, 23) (3516,) (3516,)
(3516, 23) (3516,) (3516,)
(3516, 23) (3516,) (3516,)
(3516, 23) (3516,) (3516,)
(3516, 23) (3516,) (3516,)
(3516, 23) (3516,) (3516,)
(3516, 23) (3516,) (3516,)


0.4832100170745589

- 论文中是0.439