This notebook shows the training process for the LSTM and CNN models and evaluates the trained CNN on the test data.

In [None]:
! pip install tensorflow

Collecting tf-estimator-nightly==2.8.0.dev2021122109
  Downloading tf_estimator_nightly-2.8.0.dev2021122109-py2.py3-none-any.whl (462 kB)
[K     |████████████████████████████████| 462 kB 8.7 MB/s 
Installing collected packages: tf-estimator-nightly
Successfully installed tf-estimator-nightly-2.8.0.dev2021122109


In [None]:
import tensorflow as tf
from tensorflow.keras import Input
from tensorflow.keras.layers import Bidirectional,Dropout, Dense, Concatenate, LSTM, BatchNormalization 
from tensorflow.keras.layers import MaxPooling1D, Flatten, Conv1D
from tensorflow.keras.regularizers import l2
from tensorflow.keras.models import Model
from tensorflow.keras.losses import binary_crossentropy
from tensorflow.keras.utils import Sequence,to_categorical
from tensorflow.keras.activations import softmax
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import Accuracy, Precision, Recall
from tensorflow.keras.callbacks import ModelCheckpoint
from sklearn.preprocessing import StandardScaler

In [None]:
import numpy as np
import pandas as pd
import random
from training_utils.generator import DataGenerator
from training_utils.utils import train_test_split_timeseries, create_checkpoint, show_stats, train_model

from models.arch import rnn_architecture, cnn_architecture

In [None]:
# Colab only: attach Google Drive under /content/drive, asking for a remount.
from google.colab import drive

drive.mount(
    '/content/drive',
    force_remount=True,
)

Mounted at /content/drive


In [None]:
# Load the full training table from CSV; it is split into train/valid/test later.
path_to_file = './data/data_for_training.csv'
df_all = pd.read_csv(filepath_or_buffer=path_to_file)

In [None]:
# Two-stage split: first separate the training part from the rest, then divide
# the remainder into validation and test parts.
# NOTE(review): 0.75 and 0.6 are presumably the share kept in the first returned
# frame -- confirm against train_test_split_timeseries.
df_train, df_holdout = train_test_split_timeseries(df_all, 0.75)
df_valid, df_test = train_test_split_timeseries(df_holdout, 0.6)

In [None]:
# Settings shared by the data generators and by both training runs.
least_months_repo_is_active = 24  # used as the time dimension and as months_cnt
batch_size = 4
num_classes = 2
num_epochs = 40

# Number of time series per data point: one for every column whose name
# contains "_count".
num_timeseries = sum(1 for col in df_train.columns if "_count" in col)

In [None]:
def _build_generator(frame, shuffle):
    """Create a DataGenerator over every project_id in `frame` with the shared settings."""
    return DataGenerator(
        list_IDs=frame.project_id.unique(),
        data=frame,
        batch_size=batch_size,
        dim=(least_months_repo_is_active, num_timeseries),
        n_classes=num_classes,
        shuffle=shuffle,
        months_cnt=least_months_repo_is_active,
    )


# Only the training generator shuffles; validation and test use shuffle=False.
train_generator = _build_generator(df_train, shuffle=True)
valid_generator = _build_generator(df_valid, shuffle=False)
test_generator = _build_generator(df_test, shuffle=False)

# Look at the first training batch to record the shapes of inputs and targets.
first_batch = train_generator[0]
x_shape, y_shape = first_batch[0].shape, first_batch[1].shape
print(f'x shape: {x_shape}\ny shape: {y_shape}')

x shape: (4, 24, 3)
y shape: (4, 2)


In [None]:
# Define timesteps and the number of features
n_timesteps = x_shape[1]
n_features = x_shape[2]

In [None]:
# Train the LSTM network.

model = rnn_architecture(num_classes=num_classes, n_timesteps=n_timesteps, n_features=n_features)
opt = Adam(learning_rate=0.001)
model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=["accuracy"])

# Pass the checkpoint on to train_model. Previously it was created and then
# discarded (model_checkpoint=None), so no weights were written for the
# evaluation cell to load.
model_checkpoint = create_checkpoint("LSTM")
history = train_model(
    model,
    train_generator,
    num_epochs,
    batch_size,
    valid_generator,
    model_checkpoint=model_checkpoint,
)


In [None]:
# Train the CNN network.
# NOTE: this rebinds `model` and `history`, replacing the LSTM objects from the
# previous cell.

model = cnn_architecture(num_classes=num_classes, n_timesteps=n_timesteps, n_features=n_features)
opt = Adam(learning_rate=0.0001)
# TODO(review): an earlier note on this line mentioned focal loss, but the loss
# actually used is categorical cross-entropy -- confirm which one is intended.
model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=["accuracy"])

# Pass the checkpoint on to train_model. Previously it was created and then
# discarded (model_checkpoint=None), so no weights were written for the
# evaluation cell to load.
model_checkpoint = create_checkpoint("CNN")
history = train_model(
    model,
    train_generator,
    num_epochs,
    batch_size,
    valid_generator,
    model_checkpoint=model_checkpoint,
)

Evaluate the trained CNN on the held-out test data and report its accuracy.



In [None]:
from sklearn.metrics import confusion_matrix

# Weight files for each architecture.
# NOTE(review): the LSTM path points into ./model_weights/ while the CNN path is
# relative to the working directory -- confirm both locations are intended.
path_to_weights = {'LSTM':'./model_weights/weights_LSTM.15-0.9049-0.7679.hdf5', 'CNN':'weights_CNN.13-0.5174-0.7857.hdf5'}

# When the cells are run in order, `model` is the CNN from the last training
# cell; restore its saved weights before evaluating.
model.load_weights(path_to_weights['CNN'])

# Gather every batch produced by the test generator.
x_batches = []
y_batches = []
for x_batch, y_batch in test_generator:
    x_batches.append(x_batch)
    y_batches.append(y_batch)

# Join along the batch axis. Unlike np.stack, np.concatenate also works when the
# last batch is smaller than the others. The reshape keeps the original shape
# guarantee: (samples, months, series) and (samples, classes).
x_test = np.concatenate(x_batches, axis=0).reshape((-1, least_months_repo_is_active, num_timeseries))
y_test = np.concatenate(y_batches, axis=0).reshape((-1, num_classes))

# Turn one-hot targets and predicted class scores into class indices.
y_test = np.argmax(y_test, axis=1)
y_pred = np.argmax(model.predict(x_test), axis=1)
print(f'y_pred: {y_pred}')
print(f'y_test: {y_test}')

# Fix the label set so the matrix is always num_classes x num_classes, even when
# a class is absent from both y_test and y_pred.
conf_mat = confusion_matrix(y_test, y_pred, labels=list(range(num_classes)))
print("----------conf matrix----------")
print(conf_mat)
# Accuracy = correctly classified samples (the diagonal) over all samples; this
# holds for any number of classes.
print(f'test acc: {np.trace(conf_mat) / conf_mat.sum()}')

y_pred: [1 1 0 0 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 1 0 0 0 0 1 1 0 1 1 0 0 1 0 0 1 0]
y_test: [1 1 0 0 1 0 1 1 1 1 1 0 1 0 1 1 1 0 1 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0 1 0]
----------conf matrix----------
[[14  5]
 [ 0 17]]
test acc: 0.8611111111111112
