In [1]:
import csv
import os
import numpy as np
from sklearn.preprocessing import normalize

In [8]:
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

In [2]:
# Working directory of the running kernel; the data file paths are built from it.
cwd = os.getcwd()

In [3]:
# Training and validation CSV files both live in the 'data' folder under cwd.
new_train_csv, new_val_csv = (
    os.path.join(cwd, 'data', csv_name)
    for csv_name in ('train.csv', 'val.csv')
)

In [4]:
# Read the data with csv.reader
def ReadData(file):
    """Load a semicolon-delimited CSV file into a NumPy array of strings.

    Parameters
    ----------
    file : str or path-like
        Path of the CSV file to read.

    Returns
    -------
    numpy.ndarray
        ``np.array`` built from the list of parsed rows, header row included.
        Every field is kept as text; when all rows have the same number of
        fields the result is a 2-D string array.

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    # newline='' hands raw line endings to the csv module, as its documentation
    # requires; otherwise universal-newline translation rewrites '\r\n' that
    # appears inside quoted fields before the parser sees it.
    with open(file, 'r', newline='') as f:
        rows = list(csv.reader(f, delimiter=';'))
    return np.array(rows)

In [5]:
def getXY(dataList, label):
    """Split a header-plus-rows table into scaled features and one label column.

    Parameters
    ----------
    dataList : numpy.ndarray
        2-D table whose first row holds the column names and whose remaining
        rows hold the values (the layout returned by ``ReadData``).
    label : str
        Name of the column to use as the target.

    Returns
    -------
    X : array
        Every column except ``label``, passed through
        ``normalize(..., axis=0, norm='max')`` so each column is scaled
        independently. The scaling uses only the rows of this table, so two
        tables processed separately are scaled by their own statistics.
    Y : list
        Values of the ``label`` column, in row order, left unconverted.

    Raises
    ------
    ValueError
        If ``label`` is not among the column names.
    """
    header, rows = dataList[0], dataList[1:]
    # Position of the requested label among the column names.
    label_index = list(header).index(label)
    # Target column as a plain list.
    Y = list(rows[:, label_index])
    # Drop the target column, then scale each remaining feature column.
    features = np.delete(rows, label_index, axis=1)
    X = normalize(features, axis=0, norm='max')
    return X, Y

In [6]:
# Load the training and validation tables (header row included) from disk.
trainData, valData = map(ReadData, (new_train_csv, new_val_csv))

In [7]:
# Classification task: the target is the 'Reason for absence' column.
(X_train_class, Y_train_class), (X_val_class, Y_val_class) = (
    getXY(table, 'Reason for absence') for table in (trainData, valData)
)
# Regression task: the target is the 'Absenteeism time in hours' column.
(X_train_reg, Y_train_reg), (X_val_reg, Y_val_reg) = (
    getXY(table, 'Absenteeism time in hours') for table in (trainData, valData)
)

### 使用SVM进行分类和回归

In [9]:
# SVM classification. Author's kernel ranking: linear > rbf > sigmoid > poly
# fit() returns the estimator itself, so construction and training are chained.
clf = SVC(kernel='linear').fit(X_train_class, Y_train_class)
clf_predict = clf.predict(X_val_class)

In [10]:
from sklearn.metrics import f1_score
# Micro-averaged F1 of the SVC predictions against the validation labels.
micro_f1 = f1_score(
    y_true=Y_val_class,
    y_pred=clf_predict,
    average='micro',
)
print(micro_f1)

0.24


In [11]:
# Convert the validation regression targets to Python floats.
Y_val_reg = [float(target) for target in Y_val_reg]

In [12]:
# Regression with SVR (default hyper-parameters)
from sklearn.svm import SVR
# fit() returns the estimator itself, so construction and training are chained.
clr = SVR().fit(X_train_reg, Y_train_reg)
clr_predict = clr.predict(X_val_reg)

In [13]:
from sklearn.metrics import mean_squared_error
# Mean squared error of the SVR predictions on the validation targets.
mse_reg = mean_squared_error(
    y_true=Y_val_reg,
    y_pred=clr_predict,
)
print(mse_reg)

190.663576374


### 使用MLP进行分类和回归

In [14]:
# MLP classifier with two hidden layers (100 and 50 units), trained with L-BFGS.
# random_state is fixed so the weight initialisation is repeatable.
clf = MLPClassifier(
    solver='lbfgs',
    alpha=1e-5,
    hidden_layer_sizes=(100, 50),
    random_state=1,
).fit(X_train_class, Y_train_class)
clf_predict = clf.predict(X_val_class)

In [15]:
# Micro-averaged F1 of the MLP predictions against the validation labels.
micro_f1 = f1_score(
    y_true=Y_val_class,
    y_pred=clf_predict,
    average='micro',
)
print(micro_f1)

0.15


In [16]:
# Plain Python float copies of the regression training features and targets,
# plus the validation targets, for the MLP regressor.
X_train_reg0 = [[float(value) for value in row] for row in X_train_reg]
Y_train_reg0 = [float(target) for target in Y_train_reg]
Y_val_reg = [float(target) for target in Y_val_reg]

In [17]:
# Regression analysis with MLPRegressor
from sklearn.neural_network import MLPRegressor
# Same architecture and solver settings as the MLP classifier; fit() returns
# the estimator itself, so construction and training are chained.
clr = MLPRegressor(
    solver='lbfgs',
    alpha=1e-5,
    hidden_layer_sizes=(100, 50),
    random_state=1,
).fit(X_train_reg0, Y_train_reg0)
clr_predict = clr.predict(X_val_reg)

In [18]:
from sklearn.metrics import mean_squared_error
# Mean squared error of the MLP regressor predictions on the validation targets.
mse_reg = mean_squared_error(
    y_true=Y_val_reg,
    y_pred=clr_predict,
)
print(mse_reg)

521.929142871
