此项目将用SVM来预测一栋大楼是否在举行活动    

观察building_event_binary.txt文件中的数据（注意：下面的代码实际读取的是building_event_multiclass.txt，并把每行最后一列当作活动类别标签），对于每一行数据从左到右分别代表着
- 星期
- 日期
- 时间
- 离开大楼的人数
- 进入大楼的人数
- 是否有活动    

前5个数据组成输入数据（下面的代码在读取文件时舍弃了日期列，实际只使用星期、时间、离开人数、进入人数这4个特征），我们的任务是预测大楼是否举行活动

In [25]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
from sklearn.svm import SVC
from sklearn import preprocessing

def txt_read(filename,delim):
    """Read a delimited text file into an array of strings.

    Every non-blank line is split on ``delim``. The first field and all
    fields from the third onward are kept; the second field is discarded.

    Args:
        filename: Path of the text file to read.
        delim: Separator string passed to ``str.split``.

    Returns:
        ``numpy.ndarray`` of strings with one row per non-blank line
        (2-D when every line has the same number of fields).
    """
    x_data = []
    with open(filename,'r') as f:
        for line in f:
            stripped = line.strip()
            # A blank line would produce the one-element row [''] and make the
            # row list ragged, so np.array could no longer build a 2-D array.
            if not stripped:
                continue
            data = stripped.split(delim)
            # Keep field 0 and fields 2..end; field 1 is dropped.
            x_data.append([data[0]] + data[2:])

    return np.array(x_data)

def encode(x_data):
    """Convert a 2-D array of strings into integer features and labels.

    The first row decides how each column is handled: if its entry consists
    only of digits the column is copied as numbers, otherwise the column is
    fitted with a fresh ``LabelEncoder``. The last column becomes the label
    vector and the remaining columns the feature matrix.

    Args:
        x_data: 2-D ``numpy.ndarray`` of strings.

    Returns:
        Tuple ``(x_values, y_values, label_encoder)`` where ``label_encoder``
        is the list of fitted encoders, in column order, for the
        non-numeric columns only.
    """
    encoded = np.empty(x_data.shape)
    label_encoder = []

    for col, sample in enumerate(x_data[0]):
        column = x_data[:, col]
        if not sample.isdigit():
            # Non-numeric column: map each distinct string to an integer.
            encoder = preprocessing.LabelEncoder()
            label_encoder.append(encoder)
            encoded[:, col] = encoder.fit_transform(column)
            continue
        # Numeric column: the digit strings are stored as numbers directly.
        encoded[:, col] = column

    features = encoded[:, :-1].astype(int)
    targets = encoded[:, -1].astype(int)
    print(targets)
    return features, targets, label_encoder

# Load the raw rows from disk, then encode them into integer features/labels.
# label_encoder is kept at module level because testone() reads it later.
x_data = txt_read(filename='building_event_multiclass.txt', delim=',')
x_values, y_values, label_encoder = encode(x_data=x_data)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 2 2 2 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 2 2
 2 2 2 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 2 2 2 2 2 2 2 2 2 2]


In [26]:
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

def _report_split(classifier,x_split,y_split,split_name,class_labels,target_names,num_validation):
    """Print the classification report and cross-validated accuracy for one split."""
    from sklearn.model_selection import cross_val_score

    print('\n','#'*30)
    print('\nClassifier performance on ' + split_name + ' dataset\n')
    y_pred = classifier.predict(x_split)
    # labels= pins the row order to target_names, so the report stays correct
    # (and does not raise) when this split is missing one of the classes.
    print('\n',classification_report(y_split,y_pred,labels=class_labels,target_names=target_names),'\n')
    print('\n','#'*30)

    # NOTE(review): cross_val_score refits copies of the estimator on folds of
    # the data passed here, so for the testing split this figure is not the
    # accuracy of the classifier fitted in svm() — confirm this is intended.
    accuracy = cross_val_score(classifier,x_split,y_split,scoring='accuracy',cv=num_validation)
    print('Accuracy:',round(100*accuracy.mean(),2),'%')


def svm(params,x_data,y_data):
    """Fit an ``SVC`` on a 75/25 train/test split and print its performance.

    For both the training and the testing split a classification report and
    a 5-fold cross-validated accuracy are printed.

    Args:
        params: Dict of keyword arguments forwarded to ``SVC``.
        x_data: Feature matrix.
        y_data: Class labels, one per row of ``x_data`` (convertible to int).

    Returns:
        The ``SVC`` instance fitted on the training split.
    """
    x_train,x_test,y_train,y_test = train_test_split(x_data,y_data,test_size=0.25,random_state=5)
    classifier = SVC(**params)
    classifier.fit(x_train,y_train)

    # Sort the distinct labels: set iteration order is not guaranteed, and the
    # display names must line up with the labels given to the report.
    class_labels = sorted(set(y_data))
    target_names = ['Class-' + str(int(i)) for i in class_labels]
    num_validation = 5

    _report_split(classifier,x_train,y_train,'training',class_labels,target_names,num_validation)
    _report_split(classifier,x_test,y_test,'testing',class_labels,target_names,num_validation)

    return classifier
    
# Hyper-parameters forwarded verbatim to SVC inside svm().
params = dict(kernel='rbf', probability=True, class_weight='balanced')
classifier = svm(params, x_data=x_values, y_data=y_values)


 ##############################

Classifier performance on training dataset


               precision    recall  f1-score   support

     Class-0       1.00      0.99      0.99        81
     Class-1       0.96      1.00      0.98        25
     Class-2       1.00      1.00      1.00        26

    accuracy                           0.99       132
   macro avg       0.99      1.00      0.99       132
weighted avg       0.99      0.99      0.99       132
 


 ##############################
Accuracy: 64.4 %

 ##############################

Classifier performance on testing dataset


               precision    recall  f1-score   support

     Class-0       0.79      0.97      0.87        34
     Class-1       0.50      0.17      0.25         6
     Class-2       0.00      0.00      0.00         4

    accuracy                           0.77        44
   macro avg       0.43      0.38      0.37        44
weighted avg       0.68      0.77      0.71        44
 


 #######################

In [27]:
def testone(classifier,input_data,encoders=None):
    """Encode one raw sample, classify it and print the decoded class label.

    Items made only of digits are converted with ``int``; every other item
    is transformed with the next unused encoder, so encoders are consumed in
    the order the non-numeric items appear.

    Args:
        classifier: Fitted model exposing ``predict``.
        input_data: Sequence of feature strings for a single sample.
        encoders: List of fitted label encoders; the last one is used to
            decode the predicted class. Defaults to the module-level
            ``label_encoder`` list.
    """
    if encoders is None:
        encoders = label_encoder

    encoded = []
    next_encoder = 0
    for item in input_data:
        if item.isdigit():
            # The value must be an int, not a digit string, before predict().
            encoded.append(int(item))
        else:
            # transform() returns a 1-element array; take element 0 instead of
            # calling int() on the array, which NumPy deprecates for ndim > 0.
            encoded.append(int(encoders[next_encoder].transform([item])[0]))
            next_encoder += 1

    # predict() expects a 2-D array: one row holding this single sample.
    sample = np.array(encoded).reshape(1,-1)

    output_class = classifier.predict(sample)
    print('Output class:',encoders[-1].inverse_transform(output_class))
    
# One unlabeled sample with four feature strings (the date column is dropped
# when the training file is loaded, so it is not supplied here either).
input_data = [
    'Tuesday',
    '12:30:00',
    '21',
    '23',
]
testone(classifier=classifier, input_data=input_data)

Output class: ['eventA']
