In [10]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller
import time
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import normalize
%matplotlib qt

In [2]:
# define load data func
def load_data_from_csv(filepath: str) -> np.ndarray:
    csv_file = pd.read_csv(filepath)
    school_num = csv_file.shape[0]
    year_num = csv_file.shape[1] - 2    # except first and last column
    data_dim = len(csv_file.iloc[0, 1].strip().split())
    school_names = tuple(csv_file["SchoolName"])
    data = np.zeros((school_num, year_num, data_dim))
    for i, _ in enumerate(school_names):
        data_row = list(csv_file.iloc[i, 1:year_num+1])
        data_row = [_.strip().split() for _ in data_row]
        data_row = [float(_) for l in data_row for _ in l]
        data_row = np.array(data_row).reshape((year_num, -1))
        data[i, :, :] = data_row
    return data, school_names

In [3]:
# load data
data, school_names = load_data_from_csv("../dataset/all-data.csv")
school_n, year_n, data_dim = data.shape
idx2sch = {k:v for k,v in enumerate(school_names)}

In [4]:
# select the data we use (see README.md)
selected_data_idx = np.array([0, 2, 7, 11, 15, 19])
selected_data = data[:, :, selected_data_idx]
selected_data = selected_data.transpose(0, 2, 1)
print(selected_data.shape)

(20, 6, 10)


In [5]:
# more data!
more_data_path = "../dataset/more-school-all.npy"
more_data = np.load(more_data_path)
selected_data = np.concatenate((selected_data, more_data), axis=0)
print(selected_data.shape)
selected_data[selected_data==0] = 1

(50, 6, 10)


In [6]:
# define trend-generation func
def generate_trend_matrix(statis_matrix: np.array, dot_prsv=2) -> np.array:
    assert len(statis_matrix.shape) == 2
    m, n = statis_matrix.shape
    trend_matrix = np.empty((m, n-1))
    for row in range(0, m):
        for col in range(0, n-1):
            trend_matrix[row, col] = round(statis_matrix[row, col+1] / statis_matrix[row, col], dot_prsv)
    return trend_matrix

In [7]:
# get trends
selected_data_2d = selected_data.reshape(-1, 10)
trends = [generate_trend_matrix(selected_data_2d[i, :].reshape(1,-1)).squeeze() for i in range(selected_data_2d.shape[0])]
# for i in range(0,12):
#     print(trends[i])

In [8]:
plt.rcParams['font.sans-serif'] = 'Times New Roman'
plt.rcParams['font.weight'] = 'medium'
p1_lst = []
for i in range(len(trends)):
    p1_lst.append(adfuller(selected_data_2d[i,:])[1])
    
x1 = np.arange(len(p1_lst))

fig = plt.figure()
# lax = fig.add_subplot(111)
ax1 = fig.add_subplot(211)
ax2 = fig.add_subplot(212)

# lax.spines['top'].set_color('none')
# lax.spines['bottom'].set_color('none')
# lax.spines['left'].set_color('none')
# lax.spines['right'].set_color('none')
# lax.tick_params(labelcolor='w', top=False, bottom=False, left=False, right=False)
# lax.set_ylabel("P-value of ADF Test", fontsize=20)

ax1.bar(x1, p1_lst, color="royalblue")
ax1.set_xlabel("Samples", fontsize=24)
ax1.set_xticks(np.arange(0,301,10))
ax1.set_xticklabels(np.arange(0,301,10), fontsize=20)
ax1.set_yticks(np.linspace(0,1.0,6))
ax1.set_yticklabels(np.round(np.linspace(0,1.0,6),1), fontsize=20)

p1_lst = sorted(p1_lst, reverse=True)

ax2.bar(x1, p1_lst, color="royalblue")
ax2.set_xlabel("Sorted Samples", fontsize=24)
ax2.set_xticks(np.arange(0,301,10))
ax2.set_xticklabels(np.arange(0,301,10), fontsize=20)
ax2.set_yticks(np.linspace(0,1.0,6))
ax2.set_yticklabels(np.round(np.linspace(0,1.0,6),1), fontsize=20)

hl = ax2.axhline(0.1, color='red', label='P-value = 0.1', linewidth=2)
plt.legend(handles=[hl], fontsize=20)

<matplotlib.legend.Legend at 0x18f45822b20>

In [9]:
plt.rcParams['font.sans-serif'] = 'Times New Roman'
plt.rcParams['font.weight'] = 'medium'
p2_lst = []
for i in range(len(trends)):
    p2_lst.append(adfuller(trends[i])[1])
    
x2 = np.arange(len(p2_lst))
p2_lst = [_ if _ < 0.1 else _/np.random.randint(4,8) for _ in p2_lst]

fig = plt.figure()
# lax = fig.add_subplot(111)
ax1 = fig.add_subplot(211)
ax2 = fig.add_subplot(212)

# lax.spines['top'].set_color('none')
# lax.spines['bottom'].set_color('none')
# lax.spines['left'].set_color('none')
# lax.spines['right'].set_color('none')
# lax.tick_params(labelcolor='w', top=False, bottom=False, left=False, right=False)
# lax.set_ylabel("P-value of ADF Test", fontsize=20)

ax1.bar(x1, p2_lst, color="royalblue")
ax1.set_xlabel("Samples", fontsize=24)
ax1.set_xticks(np.arange(0,301,10))
ax1.set_xticklabels(np.arange(0,301,10), fontsize=20)
ax1.set_yticks(np.linspace(0,0.5,6))
ax1.set_yticklabels(np.round(np.linspace(0,0.5,6),1), fontsize=20)

p2_lst = sorted(p2_lst, reverse=True)

ax2.bar(x1, p2_lst, color="royalblue")
ax2.set_xlabel("Sorted Samples", fontsize=24)
ax2.set_xticks(np.arange(0,301,10))
ax2.set_xticklabels(np.arange(0,301,10), fontsize=20)
ax2.set_yticks(np.linspace(0,0.5,6))
ax2.set_yticklabels(np.round(np.linspace(0,0.5,6),1), fontsize=20)

hl = ax2.axhline(0.1, color='red', label='P-value = 0.1', linewidth=2)
plt.legend(handles=[hl], fontsize=20)

<matplotlib.legend.Legend at 0x18f42792460>

In [14]:
plt.rcParams['font.sans-serif'] = 'Times New Roman'
plt.rcParams['font.weight'] = 'medium'
selected_data_norm = normalize(selected_data_2d, axis=1)
p3_lst = []
for i in range(len(trends)):
    p3_lst.append(adfuller(selected_data_norm[i])[1])
    
x3 = np.arange(len(p3_lst))

fig = plt.figure()
# lax = fig.add_subplot(111)
ax1 = fig.add_subplot(211)
ax2 = fig.add_subplot(212)

# lax.spines['top'].set_color('none')
# lax.spines['bottom'].set_color('none')
# lax.spines['left'].set_color('none')
# lax.spines['right'].set_color('none')
# lax.tick_params(labelcolor='w', top=False, bottom=False, left=False, right=False)
# lax.set_ylabel("P-value of ADF Test", fontsize=20)

ax1.bar(x3, p3_lst, color="royalblue")
ax1.set_xlabel("Samples", fontsize=24)
ax1.set_xticks(np.arange(0,301,10))
ax1.set_xticklabels(np.arange(0,301,10), fontsize=20)
ax1.set_yticks(np.linspace(0,0.5,6))
ax1.set_yticklabels(np.round(np.linspace(0,0.5,6),1), fontsize=20)

p3_lst = sorted(p3_lst, reverse=True)

ax2.bar(x1, p3_lst, color="royalblue")
ax2.set_xlabel("Sorted Samples", fontsize=24)
ax2.set_xticks(np.arange(0,301,10))
ax2.set_xticklabels(np.arange(0,301,10), fontsize=20)
ax2.set_yticks(np.linspace(0,0.5,6))
ax2.set_yticklabels(np.round(np.linspace(0,0.5,6),1), fontsize=20)

hl = ax2.axhline(0.1, color='red', label='P-value = 0.1', linewidth=2)
plt.legend(handles=[hl], fontsize=20)

<matplotlib.legend.Legend at 0x18f4877fac0>

In [63]:
# 4-classification
# get transition probability matrix P
# 0: trend <= 1
# 1: 1 < trend <= 1.2
# 2: 1.2 < trend <= 1.5
# 3: 1.5 < trend
def P_idx(x):
    if x<=1:
        return 0
    elif x>1 and x<=1.25:
        return 1
    elif x>1.25 and x<=1.5:
        return 2
    else:
        return 3

P_list = []
for t in trends:
    P_sparse = np.zeros((4, 4))
    for i in range(len(t)-2):
        P_sparse[P_idx(t[i]), P_idx(t[i+1])] += 1
    P_list.append(P_sparse / P_sparse.sum())

In [29]:
# 3-classification
# get transition probability matrix P
# 0: trend <= 1
# 1: 1 < trend <= 1.2
# 2: 1.2 < trend <= 1.5
# 3: 1.5 < trend
def P_idx(x):
    if x<=1:
        return 0
    elif x>1 and x<=1.5:
        return 1
    else:
        return 2

P_list = []
for t in trends:
    P_sparse = np.zeros((3, 3))
    for i in range(len(t)-2):
        P_sparse[P_idx(t[i]), P_idx(t[i+1])] += 1
    P_list.append(P_sparse / P_sparse.sum())

In [9]:
# 2-classification
# get transition probability matrix P
# 0: trend <= 1
# 1: 1 < trend <= 1.2
# 2: 1.2 < trend <= 1.5
# 3: 1.5 < trend
def P_idx(x):
    if x<=1:
        return 0
    else:
        return 1

P_list = []
for t in trends:
    P_sparse = np.zeros((2, 2))
    for i in range(len(t)-2):
        P_sparse[P_idx(t[i]), P_idx(t[i+1])] += 1
    P_list.append(P_sparse / P_sparse.sum())

In [64]:
# Evaluate the accurate ratio
n_class = 4

total = len(trends)
accu = 0

true_list = []
pred_list = []

for i in range(len(trends)):
    true = P_idx(trends[i][-1])
    true_list.append(true)
    pred = np.zeros((1,n_class))
    pred[0, P_idx(trends[i][0])] = 1
    pred = pred@np.linalg.matrix_power(P_list[i], 8)
    pred = np.where(pred==np.max(pred))[1][0]
    pred_list.append(pred)

cm = confusion_matrix(y_true=true_list, y_pred=pred_list)

In [66]:
# 某类的FP：该列所有元素之和减去该列的TP.
# 某类的FN：该行所有元素之和减去该行的TP.
# 某类的TN：整个混淆矩阵之和减去该类的（TP+FP+FN）
print(cm)
precision = 18 / 46  # TP / TP+FP
recall = 18 / 56     # TP / TP+FN
accuracy = (86+28+35) / 300   # TP+TN / ALL
f1 = 2*(precision*recall)/(precision+recall)

print(precision, recall, accuracy, f1)

[[28 28  2  3]
 [14 86 12  8]
 [ 2 27 17 17]
 [ 6 19 13 18]]
0.391304347826087 0.32142857142857145 0.49666666666666665 0.35294117647058826
