-
Notifications
You must be signed in to change notification settings - Fork 0
/
vector.py
58 lines (44 loc) · 2.02 KB
/
vector.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
# coding:utf-8
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import *
from evaluate import print_res
def Removing_features_with_low_variance(X, threshold):
    """Remove low-variance features from ``X`` and print diagnostics.

    Fits a ``VarianceThreshold`` selector on ``X`` and prints, in order:
    the per-feature variances, the reduced matrix, the integer indices of
    the retained columns, and the inverse-transformed matrix.

    Args:
        X: Sample matrix accepted by ``VarianceThreshold.fit``.
        threshold: Variance cut-off handed to ``VarianceThreshold``.

    Returns:
        The reduced matrix, i.e. ``selector.transform(X)``.
    """
    selector = VarianceThreshold(threshold)
    selector.fit(X)
    print("Variances is %s" % selector.variances_)
    # Transform once and reuse the result: it was previously recomputed for
    # each print and again for the return value.
    reduced = selector.transform(X)
    print("After transform is %s" % reduced)
    print("The surport is %s" % selector.get_support(True))
    print("After reverse transform is %s" % selector.inverse_transform(reduced))
    return reduced
def GetVector(
    label_path="./dataset/trainLabels2.csv",
    feature_path="./dataset/imgfeature_extent_10000.csv",
    test_size=0.15,
    random_state=None,
):
    """Load labelled feature vectors and split them into train/test arrays.

    Reads the label CSV and the feature CSV, joins them on the ``Id``
    column, shifts ``Class`` down by one, randomly splits the rows, prints
    a summary of both label sets via ``print_res`` and divides the feature
    values by 255.

    Args:
        label_path: CSV with at least the columns ``Id`` and ``Class``.
        feature_path: CSV with an ``Id`` column plus the feature columns.
        test_size: Passed to ``train_test_split``; the default keeps the
            previous hard-coded 0.15.
        random_state: Passed to ``train_test_split``. ``None`` (default)
            keeps the previous non-reproducible shuffle; pass an int for a
            repeatable split.

    Returns:
        Tuple ``(train_data, train_label, test_data, test_label)`` of
        NumPy arrays; the data arrays are 2-D (rows x features).
    """
    label_frame = pd.read_csv(label_path)
    # Shift labels down by one (original note: classes run from 0 to 8).
    label_frame["Class"] = label_frame["Class"] - 1
    feature_frame = pd.read_csv(feature_path)
    merged = pd.merge(label_frame, feature_frame, on="Id")

    labels = merged["Class"]
    features = merged.drop(columns=["Class", "Id"]).values

    # Optional step, currently disabled: keep only the features whose
    # variance exceeds .9 * (1 - .9) by passing `features` through
    # Removing_features_with_low_variance(X=features, threshold=.9 * (1 - .9)).

    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=test_size, random_state=random_state
    )
    train_label = np.array(y_train)
    test_label = np.array(y_test)

    print(" =========== train data ==========")
    print_res(train_label)
    print(" =========== test data ==========")
    print_res(test_label)

    # The split already yields 2-D (rows x features) arrays, so the former
    # reshape to their own shape was a no-op and has been dropped.
    # NOTE(review): dividing by 255 presumably rescales 8-bit pixel
    # intensities to [0, 1] -- confirm against the feature extractor.
    train_data = np.asarray(X_train) / 255
    test_data = np.asarray(X_test) / 255
    return train_data, train_label, test_data, test_label
# Load the dataset as soon as this module is executed or imported and bind
# the result, in GetVector's return order, to: a = train data,
# b = train labels, c = test data, d = test labels.
# NOTE(review): this runs on import; consider an `if __name__ == "__main__":`
# guard if no other module relies on a, b, c, d being defined here.
a, b, c, d = GetVector()