In [None]:
#@title Dataset Download and Preprocessing

import requests, os, zipfile

def download_file_from_google_drive(id, destination):
    """Download a shared Google Drive file and store it at ``destination``.

    A first request is sent with the file id. If the response carries a
    confirmation token (see ``get_confirm_token``), the request is repeated
    with that token before the body is written to disk.

    Args:
        id: Google Drive file identifier, sent as the ``id`` query parameter.
        destination: Local path the downloaded bytes are written to.

    Raises:
        requests.HTTPError: If either request returns an HTTP error status,
            so that an error page is never saved in place of the file.
    """
    URL = "https://docs.google.com/uc?export=download"

    # The session carries cookies from the first request over to the second
    # one; the context manager releases its connections when done.
    with requests.Session() as session:
        response = session.get(URL, params={'id': id}, stream=True)
        response.raise_for_status()

        token = get_confirm_token(response)
        if token:
            params = {'id': id, 'confirm': token}
            response = session.get(URL, params=params, stream=True)
            response.raise_for_status()

        # Must run while the session is still open: the body is streamed.
        save_response_content(response, destination)

def get_confirm_token(response):
    """Return the value of the first ``download_warning*`` cookie, if any.

    Args:
        response: Object exposing a ``cookies`` mapping with ``items()``.

    Returns:
        The value of the first cookie whose name starts with
        ``download_warning``, or ``None`` when no such cookie exists.
    """
    matching_values = (
        cookie_value
        for cookie_name, cookie_value in response.cookies.items()
        if cookie_name.startswith('download_warning')
    )
    return next(matching_values, None)

def save_response_content(response, destination):
    """Write the streamed body of ``response`` to the file ``destination``.

    Args:
        response: Object exposing ``iter_content(chunk_size)`` that yields
            byte chunks.
        destination: Path of the file to create or overwrite (binary mode).
    """
    chunk_size = 32768
    with open(destination, "wb") as out_file:
        # Empty chunks (keep-alive) carry no payload and are skipped.
        out_file.writelines(
            chunk for chunk in response.iter_content(chunk_size) if chunk
        )

# Locations used by this and the following cells
dataset_zip = '/tmp/events.zip'    # downloaded archive
dataset_dir = '/tmp/events/'       # directory the archive is extracted into
dataset_file = '/tmp/events.pkl'   # path of the merged dataset file

# Fetch the archive from Google Drive
download_file_from_google_drive('1ZPgbhToHLXfLg2IPl0QkxgP6I1TI-KdU', dataset_zip)

# Extract every member of the archive into the dataset directory
with zipfile.ZipFile(dataset_zip) as archive:
    archive.extractall(path=dataset_dir)

In [None]:
import pickle
import glob
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split


In [None]:

# Merge the event files.
# Sorted so the merge order (and therefore the seeded split below) is
# reproducible; glob returns paths in arbitrary order.
pkl_data = sorted(glob.glob(dataset_dir+'*.pkl'))
if not pkl_data:
    raise FileNotFoundError('No .pkl event files found in ' + dataset_dir)

encoder = LabelEncoder()
data_chunks, target_chunks = [], []

for pkl in pkl_data:
    # Open the event file (closed again as soon as it has been read).
    # NOTE: pickle.load can execute arbitrary code - only load trusted files.
    with open(pkl, 'rb') as pkl_file:
        event = pickle.load(pkl_file)

    # Convert to NumPy arrays
    data, target = np.stack(event[0], axis=0), np.array(event[1])

    # Collect the pieces; concatenating once after the loop avoids the
    # repeated full-array copies of calling np.append per file.
    data_chunks.append(np.expand_dims(data, axis=3))
    target_chunks.append(target)

# Define the input and the output.
# The empty (0, 10, 10, 1) float array pins the expected sample shape and the
# float dtype of the result.
X = np.concatenate([np.zeros((0, 10, 10, 1), dtype=float)] + data_chunks, axis=0)

# Fit the encoder once on the labels of all files. Refitting it per file would
# assign different integers to the same label whenever a file lacks a class.
y = encoder.fit_transform(np.concatenate(target_chunks)).astype(int)

# Normalize the input (skipped for an all-zero input to avoid dividing by 0)
X_max = np.amax(X)
if X_max != 0:
    X /= X_max

# Split into the training, validation and testing sets
X_train, X_rest, y_train, y_rest = train_test_split(X, y, test_size=0.3, random_state=0, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_rest, y_rest, test_size=0.33, random_state=0, stratify=y_rest)

# Save the dataset file; the context manager flushes and closes it so the
# file is complete before it is read back.
dataset = [(X_train, y_train), (X_val, y_val), (X_test, y_test)]
with open(dataset_file, 'wb') as pkl_file:
    pickle.dump(dataset, pkl_file)

In [None]:
# Load the dataset (the file is closed again right after reading)
with open(dataset_file, 'rb') as dataset_fh:
  dataset = pickle.load(dataset_fh)
(X_train, y_train), (X_val, y_val), (X_test, y_test) = dataset

# Print the distribution of classes in each set.
# Dedicated loop names are used so the notebook-level `X` and `y` arrays are
# not silently overwritten by the last split.
for _, y_split in dataset:
  t_labels, t_counts = np.unique(y_split, return_counts=True)
  distribution = t_counts / np.sum(t_counts)
  print(t_labels, distribution)


[0 1 2 3 4] [0.00267506 0.04707649 0.73186834 0.12904538 0.08933472]
[0 1 2 3 4] [0.00267263 0.04707977 0.73186987 0.12904331 0.08933442]
[0 1 2 3 4] [0.00267876 0.04707569 0.73186689 0.12904389 0.08933478]


In [None]:
# Per-class weights from the training labels:
#   weight = total samples / (number of classes * samples in the class)
train_unique = np.unique(y_train, return_counts=True)
train_labels, train_counts = train_unique
train_total, num_classes = y_train.size, train_labels.size

class_weight = {
  label: train_total/(num_classes*count)
  for label, count in zip(train_labels, train_counts)
}

class_weight

{0: 74.7645937358148,
 1: 4.248404653202301,
 2: 0.27327319455067633,
 3: 1.5498423870597424,
 4: 2.2387712382764713}

In [None]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# One generator per split. No arguments are passed, so both use
# ImageDataGenerator's default settings.
train_datagen = ImageDataGenerator()
val_datagen = ImageDataGenerator()

In [None]:
batch_size = 1000

# Batch iterators over the training and validation arrays
train_generator = train_datagen.flow(x=X_train, y=y_train, batch_size=batch_size)
validation_generator = val_datagen.flow(x=X_val, y=y_val, batch_size=batch_size)
batch_size

NameError: ignored

In [None]:
from tensorflow.keras import layers, Model

def build_model(num_classes, input_shape=(10, 10, 1)):
  """Build the convolutional classifier (returned uncompiled).

  The network has three stages. Each stage applies a (k, 1) convolution, a
  (1, k) convolution and batch normalization, with 16/32/64 filters and
  k = 7/5/3. The first two stages end in 2x2 max pooling, the last one in
  global max pooling. Dropout (0.5) and a softmax dense layer follow.

  Args:
    num_classes: Number of units of the softmax output layer.
    input_shape: Shape of a single input sample.

  Returns:
    A Keras ``Model`` mapping the input tensor to the class probabilities.
  """
  # (number of filters, kernel length) of each convolutional stage
  stage_specs = [(16, 7), (32, 5), (64, 3)]
  final_stage = len(stage_specs) - 1

  input_img = layers.Input(shape=input_shape)
  features = input_img

  for stage, (filters, size) in enumerate(stage_specs):
    # A (size, 1) kernel followed by a (1, size) kernel
    features = layers.Conv2D(filters, (size, 1), activation='relu', padding='same')(features)
    features = layers.Conv2D(filters, (1, size), activation='relu', padding='same')(features)
    features = layers.BatchNormalization()(features)

    if stage < final_stage:
      features = layers.MaxPool2D((2, 2))(features)
    else:
      # The last stage collapses the spatial dimensions entirely
      features = layers.GlobalMaxPool2D()(features)

  features = layers.Dropout(.5)(features)
  output = layers.Dense(num_classes, 'softmax')(features)

  return Model(input_img, output)

In [None]:
model = build_model(num_classes)
# summary() prints the layer table itself and returns None, so it must not be
# wrapped in print() (that would emit a stray "None" after the table).
model.summary()

Model: "model_23"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_34 (InputLayer)        [(None, 10, 10, 1)]       0         
_________________________________________________________________
conv2d_86 (Conv2D)           (None, 10, 10, 16)        128       
_________________________________________________________________
conv2d_87 (Conv2D)           (None, 10, 10, 16)        1808      
_________________________________________________________________
batch_normalization_52 (Batc (None, 10, 10, 16)        64        
_________________________________________________________________
max_pooling2d_53 (MaxPooling (None, 5, 5, 16)          0         
_________________________________________________________________
conv2d_88 (Conv2D)           (None, 5, 5, 32)          2592      
_________________________________________________________________
conv2d_89 (Conv2D)           (None, 5, 5, 32)          515

In [None]:
from tensorflow.keras.optimizers import Adam

learning_rate = 1e-3

# Labels are integer class ids, hence the sparse variant of the loss.
# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and rejected by recent Keras releases.
model.compile(loss='sparse_categorical_crossentropy',
              optimizer=Adam(learning_rate=learning_rate),
              metrics=['accuracy'])

In [None]:
# Number of batches needed to see every sample once per epoch.
# np.ceil returns a float, but step counts must be integers.
train_steps = int(np.ceil(y_train.size / train_generator.batch_size))
val_steps = int(np.ceil(y_val.size / validation_generator.batch_size))

# class_weight re-weights the loss per class using the weights computed from
# the training labels.
history = model.fit(train_generator,
                    steps_per_epoch=train_steps,
                    epochs=100,
                    validation_data=validation_generator,
                    validation_steps=val_steps,
                    class_weight=class_weight,
                    verbose=1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
 25/824 [..............................] - ETA: 10s - loss: 1.5319 - accuracy: 0.1514

KeyboardInterrupt: ignored

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [None]:
# Flatten every training sample into a row of 100 features
X_train_tree = X_train.reshape(-1, 100)

In [None]:
# Random forest on the flattened samples; class_weight="balanced" compensates
# for the class imbalance. The seed makes the fitted forest reproducible,
# consistent with the seeded splits used elsewhere in this notebook.
clf = RandomForestClassifier(
    n_estimators=300,
    n_jobs=-1,
    class_weight="balanced",
    max_depth=100,
    random_state=0,
)

clf.fit(X_train_tree, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                       criterion='gini', max_depth=100, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=300,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [None]:
# Build the class-balanced validation subset here so that this cell also works
# on a fresh kernel run from top to bottom (it used to rely on variables that
# are only created further down). Same sampler and seed as used later on.
from imblearn.under_sampling import RandomUnderSampler

X_val_ros, y_val_res = RandomUnderSampler(random_state=0).fit_resample(
    np.reshape(X_val, (-1, 100)), y_val)

# Mean accuracy on the balanced validation subset
clf.score(X_val_ros, y_val_res)

0.23860759493670886

In [None]:
from sklearn.naive_bayes import GaussianNB

import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn import tree
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

In [None]:
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter



In [None]:
# Step 1: under-sample only the majority class of the training set.
ros_train = RandomUnderSampler(sampling_strategy='majority', random_state=0)
# This first line shows the distribution *before* any resampling.
print('Original dataset shape (train) {}'.format(Counter(y_train)))
X_train_ros, y_train_res = ros_train.fit_resample(X_train_tree, y_train)
print('Resampled dataset shape (train) {}'.format(Counter(y_train_res)))

# Step 2: over-sample the under-sampled training set with SMOTE.
# Seeded so the synthetic samples are reproducible.
ros_train = SMOTE(random_state=0)
X_train_ros, y_train_res = ros_train.fit_resample(X_train_ros, y_train_res)

# The validation set is only under-sampled, so it contains no synthetic samples.
ros_val = RandomUnderSampler(random_state=0)
X_val_ros, y_val_res = ros_val.fit_resample(np.reshape(X_val, (-1, 100)), y_val)
print('Resampled dataset shape (train) {}'.format(Counter(y_train_res)))
print('Resampled dataset shape (val) {}'.format(Counter(y_val_res)))

Resampled dataset shape (train) Counter({2: 602717, 3: 106273, 4: 73570, 1: 38769, 0: 2203})




Resampled dataset shape (train) Counter({3: 106273, 4: 73570, 1: 38769, 0: 2203, 2: 2203})




Resampled dataset shape (train) Counter({0: 106273, 1: 106273, 2: 106273, 3: 106273, 4: 106273})
Resampled dataset shape (val) Counter({0: 632, 1: 632, 2: 632, 3: 632, 4: 632})




In [None]:
# Random forest trained on the resampled training set. No class weighting is
# passed here. The seed makes the fitted forest reproducible.
clf_simple = RandomForestClassifier(
    n_jobs=-1,
    max_depth=30,
    n_estimators=300,
    criterion='entropy',
    random_state=0,
)
clf_simple.fit(X_train_ros, y_train_res)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=30, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=300,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [None]:
# Mean accuracy of the resampling-trained forest on the balanced validation subset
clf_simple.score(X=X_val_ros, y=y_val_res)

0.23386075949367088

In [None]:
# Class probabilities for the full validation set, flattened to 100 features per sample
X_val_flat = X_val.reshape(-1, 100)
y_pred = clf_simple.predict_proba(X_val_flat)

In [None]:
# Show the predicted probabilities (one row per validation sample, one column
# per class).
print(y_pred)

[[0.11139537 0.20552205 0.19331526 0.19146458 0.29830274]
 [0.19234054 0.19936957 0.20634678 0.2049496  0.19699351]
 [0.21013658 0.19751726 0.19869799 0.20000882 0.19363935]
 ...
 [0.12830318 0.1916715  0.18954533 0.18533796 0.30514202]
 [0.17936808 0.20927525 0.20566462 0.18877106 0.21692099]
 [0.19909311 0.1986726  0.20712519 0.20427117 0.19083793]]


In [None]:
# Gaussian naive Bayes baseline fitted on the resampled training set
clf_nb = GaussianNB()
clf_nb.fit(X=X_train_ros, y=y_train_res)

GaussianNB(priors=None, var_smoothing=1e-09)

In [None]:
# Mean accuracy of the naive Bayes baseline on the balanced validation subset
clf_nb.score(X=X_val_ros, y=y_val_res)

0.20537974683544305