In [None]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
from ripser import ripser
from persim import plot_diagrams
from scipy.spatial.distance import pdist, squareform
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.svm import LinearSVC, SVC, SVR
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix
from gtda.time_series import TakensEmbedding
from PyEMD import EMD
from statsmodels.tsa.stattools import adfuller
from pylab import mpl
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from statsmodels.tsa.arima.model import ARIMA
import statsmodels.tsa.stattools as st    

%matplotlib qt

In [None]:
# define load data func
def load_data_from_csv(filepath: str) -> "tuple[np.ndarray, tuple]":
    """Load the per-school statistics table from a CSV file.

    The file must contain a ``SchoolName`` column. Every column except the
    first and the last is read as one "year" column; each cell in those
    columns is a whitespace-separated list of numbers.

    Args:
        filepath: Path of the CSV file, handed to ``pandas.read_csv``.

    Returns:
        A pair ``(data, school_names)``:

        * ``data`` -- float array of shape ``(school_num, year_num, data_dim)``
          where ``school_num`` is the number of CSV rows, ``year_num`` is the
          number of columns minus two, and ``data_dim`` is the number of
          values found in the first row's first year cell.
        * ``school_names`` -- tuple of the ``SchoolName`` values in row order.

    Raises:
        ValueError: If a row does not hold ``year_num * data_dim`` numbers
            (raised by the reshape / assignment) or a token is not a number.
    """
    csv_file = pd.read_csv(filepath)
    school_num, column_num = csv_file.shape
    year_num = column_num - 2    # except first and last column
    school_names = tuple(csv_file["SchoolName"])

    # The per-year vector length is inferred from the very first data cell.
    data_dim = len(csv_file.iloc[0, 1].strip().split())

    data = np.zeros((school_num, year_num, data_dim))
    for row_idx in range(school_num):
        year_cells = csv_file.iloc[row_idx, 1:year_num + 1]
        # Flatten all year cells of this school into one list of floats ...
        values = [float(token) for cell in year_cells for token in cell.strip().split()]
        # ... then fold it back into one row per year.
        data[row_idx, :, :] = np.array(values).reshape((year_num, -1))
    return data, school_names

In [None]:
# Read the full table and keep a row-index -> school-name lookup.
data, school_names = load_data_from_csv("../dataset/all-data.csv")
idx2sch = dict(enumerate(school_names))
(school_n, year_n, data_dim) = data.shape

In [None]:
# select the data we use (see README.md)
selected_data_idx = np.array([0, 2, 7, 11, 15, 19])
# Keep only the chosen entries of the last axis, then swap the last two axes
# so the result is laid out as (school, selected entry, year).
selected_data = np.transpose(data[..., selected_data_idx], axes=(0, 2, 1))
print(selected_data.shape)

In [None]:
# more data!
# NOTE(review): this cell rebinds `selected_data`, so running it twice without
# re-running the previous cell appends `more_data` a second time.
more_data_path = "../dataset/more-school-all.npy"
more_data = np.load(more_data_path)
# Stack the extra schools below the CSV ones (axis 0 is the default).
selected_data = np.concatenate([selected_data, more_data])
print(selected_data.shape)

In [None]:
# define trend-generation func
def generate_trend_matrix(statis_matrix: np.ndarray, dot_prsv=2) -> np.ndarray:
    """Compute the ratio between each pair of consecutive columns.

    For an ``(m, n)`` input the result has shape ``(m, n-1)`` and
    ``result[r, c] == round(statis_matrix[r, c+1] / statis_matrix[r, c], dot_prsv)``.

    Args:
        statis_matrix: 2-D array; each row is one series.
        dot_prsv: Number of decimal places kept when rounding the ratios.

    Returns:
        A float64 array of shape ``(m, n-1)`` holding the rounded ratios.
        A zero in the denominator yields ``inf``/``nan`` (with a NumPy runtime
        warning); callers are expected to replace zeros beforehand.

    Raises:
        AssertionError: If ``statis_matrix`` is not 2-dimensional.
    """
    assert len(statis_matrix.shape) == 2
    # Divide every column by its left neighbour in one vectorised step
    # instead of looping over each (row, col) pair in Python.
    ratios = statis_matrix[:, 1:] / statis_matrix[:, :-1]
    # Always hand back float64, independent of the input dtype.
    return np.round(ratios, dot_prsv).astype(np.float64, copy=False)

In [None]:
# One row per (school, selected entry) series. The row length is read from the
# array's last axis instead of being hard-coded (it was the literal 10).
selected_data_2d = selected_data.reshape(-1, selected_data.shape[-1])
# Exact zeros would make the consecutive-column ratios divide by zero, so they
# are replaced by float32 machine epsilon. np.where builds a new array: the
# earlier in-place assignment wrote through the reshape view and therefore
# silently modified `selected_data` whenever that array was contiguous.
selected_data_2d = np.where(selected_data_2d == 0, np.finfo('float32').eps, selected_data_2d)
# generate_trend_matrix already works on a whole matrix; a single call yields
# the same per-row trend vectors as calling it once per row.
trends = list(generate_trend_matrix(selected_data_2d))

In [None]:
# Class labels assigned to a trend ratio:
# 0: trend <= 1
# 1: 1 < trend <= 1.25
# 2: 1.25 < trend <= 1.5
# 3: 1.5 < trend
def P_idx(x):
    """Return the class label (0-3) of the trend ratio ``x``.

    The label is the position of the first upper bound in ``(1, 1.25, 1.5)``
    that ``x`` does not exceed; values above every bound -- and NaN, for which
    all comparisons are false -- get label 3.
    """
    for label, upper_bound in enumerate((1, 1.25, 1.5)):
        if x <= upper_bound:
            return label
    return 3

# def P_idx(x):
#     if x<=1:
#         return 0
#     elif x>1 and x<=1.5:
#         return 1
#     else:
#         return 2

# def P_idx(x):
#     if x<=1:
#         return 0
#     else:
#         return 1

In [None]:
# Stack the per-series trend vectors once and split them into features/target.
trend_array = np.array(trends)

# Features: every trend ratio except the last one of each series.
X = trend_array[:, :-1]
print(X.shape)
# Target: the last trend ratio of each series, kept as a column vector.
Y = trend_array[:, -1].reshape(-1, 1)
print(Y.shape)
# print(selected_data_2d)
# print(Y)

# Turn the continuous target ratio into the class label defined by P_idx.
P_vec = np.vectorize(P_idx)
Y = P_vec(Y)

In [None]:
# Hold out a test split. A fixed random_state makes the shuffle -- and hence
# every score printed below -- reproducible across kernel restarts.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=42)

clf = SVC(kernel='poly')
clf.fit(X_train, np.ravel(Y_train))

# Predict once and reuse the result for all four metrics instead of calling
# clf.predict(X_test) separately for each of them.
Y_pred = clf.predict(X_test)
print(precision_score(y_true=Y_test, y_pred=Y_pred, average='macro'))
print(recall_score(y_true=Y_test, y_pred=Y_pred, average='macro'))
print(accuracy_score(y_true=Y_test, y_pred=Y_pred))
print(f1_score(y_true=Y_test, y_pred=Y_pred, average='macro'))