/
load_data.py
124 lines (94 loc) · 4.89 KB
/
load_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
"""
This module loads the genome sequence corresponding to SARS-COV-2 and other
coronaviruses. The genome sequence is mapped to the following numeric values:
C = 0.25
T = 0.50
G = 0.75
A = 1.0
The absolute value of the fast Fourier transform (FFT) coefficients of each
data instance is computed next and min-max normalized row-wise to [0, 1].
The module returns the normalized absolute FFT coefficients of the input data
and the labels.
Author: Harikrishnan N B
Email: harikrishnannb07@gmail.com
Date: 29 July 2020
"""
import logging
import numpy as np
import pandas as pd
from numpy.fft import fft
def _load_raw_data(classification_type):
    """Read the raw numeric genome data and labels for the requested task."""
    if classification_type in ('multi_class', 'binary_class'):
        # Both tasks share the same on-disk layout; only the top-level
        # directory (named after the task) differs.
        data_dir = classification_type + '/data/'
        data_covid = np.array(pd.read_csv(data_dir + 'data.csv', header=None))
        labels = np.array(pd.read_csv(data_dir + 'labels.csv', header=None))
        return data_covid, labels

    # Any other value falls back to the preprocessed SARS-COV-1 / SARS-COV-2
    # arrays, stacked row-wise (SARS-COV-1 rows first).
    data_path = "PREPROCESSED_DATA/"
    cov_1_data = np.load(data_path + "COV_1_DATA.npy")  # SARS-COV-1 data
    cov_1_label = np.load(data_path + "COV_1_LABEL.npy")
    cov_2_data = np.load(data_path + "COV_2_DATA.npy")  # SARS-COV-2 data
    cov_2_label = np.load(data_path + "COV_2_LABEL.npy")
    data_covid = np.vstack((cov_1_data, cov_2_data))
    labels = np.vstack((cov_1_label, cov_2_label))
    return data_covid, labels


def _normalized_fft_magnitude(data):
    """Return the row-wise min-max normalized absolute FFT of a 2D array."""
    # One vectorized call transforms every row (axis=1) instead of looping
    # over the data instances in Python.
    fourier_features = np.abs(fft(data, axis=1)).astype(np.float64, copy=False)
    row_min = fourier_features.min(axis=1, keepdims=True)
    row_range = fourier_features.max(axis=1, keepdims=True) - row_min
    # A row whose FFT magnitudes are all equal (e.g. an all-zero row) has a
    # zero range; dividing by 1 instead maps it to zeros rather than NaN.
    safe_range = np.where(row_range == 0.0, 1.0, row_range)
    return (fourier_features - row_min) / safe_range


def get_data(classification_type):
    """
    Load the genome data and return its normalized FFT magnitudes and labels.

    Parameters
    ----------
    classification_type : string
        DESCRIPTION : classification_type == "binary_class"
        binary classification data is loaded from
        "binary_class/data/data.csv" and "binary_class/data/labels.csv".
        classification_type == "multi_class"
        multiclass classification data is loaded from
        "multi_class/data/data.csv" and "multi_class/data/labels.csv".
        Any other value loads the SARS-COV-1 and SARS-COV-2 arrays from
        "PREPROCESSED_DATA/" and stacks them row-wise.

    Returns
    -------
    fourier_data_normalized : array, 2D
        Absolute value of the FFT of every row of the data, min-max
        normalized per row to the range [0, 1].
    labels : array, 2D
        Labels corresponding to the rows of fourier_data_normalized.
    """
    data_covid, labels = _load_raw_data(classification_type)
    fourier_data_normalized = _normalized_fft_magnitude(data_covid)

    # Checking whether the data is normalized. An explicit check is used
    # instead of `assert` so it is not stripped when Python runs with -O.
    # NaN values fail both comparisons and are therefore reported as well.
    in_unit_range = (np.min(fourier_data_normalized) >= 0.0
                     and np.max(fourier_data_normalized) <= 1.0)
    if not in_unit_range:
        logging.error("Error-Data should be in the range [0, 1]")
    return fourier_data_normalized, labels