## SVM 分类

In [46]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn import datasets
from sklearn.model_selection import train_test_split
import pandas as pd
import joblib
from sklearn import metrics
import math


In [8]:
# 读取数据
iris = datasets.load_iris()
X = iris.data
y = iris.target

# 标准化
std = StandardScaler()
X_std = std.fit_transform(X)

# 拆分训练集
X_train, X_test, y_train, y_test = train_test_split(X_std, y, test_size=0.3)

# SVM建模
svm_classification = SVC()
svm_classification.fit(X_train, y_train)

# 模型效果
svm_classification.score(X_test, y_test)

0.9777777777777777

## SVM 回归

In [None]:
from sklearn.svm import SVR

# 读取数据
boston = datasets.load_boston()
X = boston.data
y = boston.target

# 标准化
std = StandardScaler()
X_std = std.fit_transform(X)

# 拆分训练集
X_train, X_test, y_train, y_test = train_test_split(X_std, y, test_size=0.3)

# SVM建模
svm_regression = SVR(C=2, kernel='rbf')
svm_regression.fit(X_train, y_train)

# 模型效果
svm_regression.score(X_test, y_test)

## 模型调参：网格搜索

In [23]:
from sklearn.model_selection import GridSearchCV

# 定义参数的组合
params = {
    "kernel":['linear', 'rbf', 'poly', 'sigmoid'],
    'C':[0.01, 0.1, 0.5, 1, 2,]
}

# 用网格搜索拟合模型
model = GridSearchCV(svm_regression, param_grid=params, cv=10)
model.fit(X,y)

# 查看结果
print("最好的参数组和：", model.best_params_)
print("最好的得分：", model.best_score_)


最好的参数组和： {'C': 2, 'kernel': 'linear'}
最好的得分： 0.3151273617911972


## part1 SVM applied in ads effectiveness prediction

In [42]:
df = pd.read_csv("./data/ads_3.csv")

X = df[df.columns[:62]]
Y = df[df.columns[62:]]
std = StandardScaler()     #标准化特征
X_std = std.fit_transform(X)

X_std

array([[ 0.2901905 ,  0.55117825, -1.97604704, ..., -0.84656167,
         0.82146013,  0.33691767],
       [ 0.2901905 ,  0.55117825,  0.50606083, ..., -0.84656167,
        -1.21734453,  0.33691767],
       [ 0.2901905 , -1.81429509,  0.50606083, ..., -0.84656167,
         0.82146013,  0.33691767],
       ...,
       [-3.44601219, -1.81429509, -1.97604704, ..., -0.84656167,
         0.82146013,  0.33691767],
       [ 0.2901905 , -1.81429509,  0.50606083, ...,  1.18124885,
         0.82146013,  0.33691767],
       [ 0.2901905 , -1.81429509,  0.50606083, ..., -0.84656167,
        -1.21734453,  0.33691767]])

In [39]:
# 定义参数的组合
params = {
    "kernel":['linear', 'rbf', 'poly', 'sigmoid'],
    'C':[0.01, 0.1, 0.5, 1, 2, 10, 100]
}

svm_regression = SVR()
model = GridSearchCV(svm_regression, param_grid=params, cv=10)
model.fit(X, Y[Y.columns[1]])

# 查看结果
print("最好的参数组和：", model.best_params_)
print("最好的得分：", model.best_score_)

最好的参数组和： {'C': 0.5, 'kernel': 'rbf'}
最好的得分： -0.13549245031976037


In [44]:
MSE = []
RMSE = []
R_squared = []
for i in range(12):
    y = Y[Y.columns[i]]
    X_train, X_test, y_train, y_test = train_test_split(X_std, y, random_state=0)
    
    params = {
        "kernel":['linear', 'rbf', 'poly', 'sigmoid'],
        'C':[0.01, 0.1, 0.5, 1, 2, 10, 100]
    }
    
    svm_regression = SVR()
    model = GridSearchCV(svm_regression, param_grid=params, cv=10)
    model.fit(X_std, Y[Y.columns[1]])
    C = model.best_params_["C"]
    kernel = model.best_params_["kernel"]
    
    svm_regression = SVR(C=C, kernel=kernel)
    svm_regression.fit(X_train, y_train)

    joblib.dump(svm_regression, "model/SVM_regression/model{}.pkl".format(i+1))
    y_pred = svm_regression.predict(X_test)
    MSE.append(metrics.mean_squared_error(y_test, y_pred))
    RMSE.append(math.sqrt(metrics.mean_squared_error(y_test, y_pred)))
    R_squared.append(metrics.r2_score(y_test, y_pred))

In [45]:
result_dic = {"MSE":MSE, "RMSE":RMSE, "R_squared":R_squared}
result_df = pd.DataFrame(result_dic, index=Y.columns)
result_df.to_csv("result/SVM_regression.csv")