In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import ensemble
import numpy as np
import matplotlib.pyplot as plt

# constant parameters
gt_path = './data/gt_file_path'
feature_path = './data/output/'


# get file name
def get_name(t):
    return 'hdcctv1_' + t[0].split('/')[-1]

# concat dataframes on the basis of 'bg' timestamp
def data_concat(gt_file_path_list,file_name_list):
    label_value = pd.read_csv(gt_file_path_list.iloc[0][0], header=None, skiprows=1, sep=' ',
                              names=['bg', 'label']).set_index('bg')
    feature_vector = pd.read_csv(feature_path + file_name_list.iloc[0], header=None).set_index(0)
    X = feature_vector.join(label_value)
    for i in range(1, file_name_list.shape[0]):
        label_value = pd.read_csv(gt_file_path_list.iloc[0][0], header=None, skiprows=1, sep=' ',
                                  names=['bg', 'label']).set_index('bg')
        feature_vector = pd.read_csv(feature_path + file_name_list.iloc[0], header=None).set_index(0)
        df = feature_vector.join(label_value)
        X = pd.concat([X, df], axis=0)
    X = X.dropna()  # ignore NaN vector
    X.index.names = ['bg']
    return X


# get training-set & validation-set
def get_train_vali(X,rate,seed):
    tt, vv = train_test_split(X, test_size=rate, random_state=seed)
    x_train, x_vali = tt.drop(columns='label'), vv.drop(columns='label')
    y_train,y_vali = tt['label'], vv['label']
    return x_train,y_train, x_vali,y_vali


# try machine learning methods
def try_different_model(model_, x_train,y_train, x_vali,y_vali, model_name):
    model_.fit(x_train, y_train)     # training process
    score = model_.score(x_vali, y_vali) # evaluation
    self_score = model_.score(x_train, y_train)  # self evaluation
    return [score, self_score]


gt_file_path_list = pd.read_csv(gt_path,header=None)
file_name_list = gt_file_path_list.apply(get_name,axis=1)

# get the whole matrix
data = data_concat(gt_file_path_list, file_name_list)
# # hold-out method: 30%
# x_train, y_train, x_vali, y_vali = get_train_vali(data,rate=0.3,seed=4)


# # random forest
# model_= ensemble.RandomForestClassifier(oob_score=True,random_state=10,n_estimators=20)
# # validation
# scores = try_different_model(model_,x_train, y_train, x_vali, y_vali,'RandomForest')

In [2]:
data.shape

(4994, 5)

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4994 entries, 15 to 1787
Data columns (total 5 columns):
1        4994 non-null int64
2        4994 non-null int64
3        4994 non-null float64
4        4994 non-null float64
label    4994 non-null float64
dtypes: float64(3), int64(2)
memory usage: 234.1 KB


In [22]:
data[data['label']==1].count()/data[data['label']==0].count()

1        0.050926
2        0.050926
3        0.050926
4        0.050926
label    0.050926
dtype: float64

In [19]:
model = ensemble.RandomForestClassifier(oob_score=True,random_state=10,n_estimators=20)
x_train, x_vali, y_train, y_vali = train_test_split(data[[1,2,3,4]],data['label'], test_size=0.3, random_state=4)
model.fit(x_train,y_train.astype(int))
from sklearn.metrics import accuracy_score
print(accuracy_score(y_vali, model.predict(x_vali)))
model.score(x_vali, y_vali.astype(int))

1.0


1.0

In [None]:
model = ensemble.DecisionTree(oob_score=True,random_state=10,n_estimators=20)
x_train, x_vali, y_train, y_vali = train_test_split(data[[1,2,3,4]],data['label'], test_size=0.3, random_state=4)
model.fit(x_train,y_train.astype(int))
from sklearn.metrics import accuracy_score
print(accuracy_score(y_vali, model.predict(x_vali)))
model.score(x_vali, y_vali.astype(int))