In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

import tensorflow as tf
import tensorflow.keras.layers as tfl
from tensorflow.keras.models import Sequential, Model
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount recursively and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation / chained-assignment warnings from
# the libraries used below; consider narrowing the filter.
warnings.filterwarnings('ignore')

In [None]:
# Load the competition files: `test` is the frame to predict on,
# `data` is the labelled training frame.
test = pd.read_csv("/kaggle/input/dataset/test_final.csv")
data = pd.read_csv("/kaggle/input/dataset/train_final.csv")

In [None]:
# Preview the first rows of the training frame.
data.head()

In [None]:
# Number of training rows as loaded.
len(data)

In [None]:
# Partition the training rows by whether the label is the literal string 'normal',
# then show the size of each partition.
is_normal = data['target'] == 'normal'
normal = data[is_normal]
anomaly = data[~is_normal]
len(normal), len(anomaly)

In [None]:
# Keep one 'normal' row per unique combination of the listed feature columns.
# The result is rebound instead of dropped in place: `normal` is a filtered
# selection of `data`, and in-place edits on such selections are the pattern
# pandas flags as chained assignment.
normal = normal.drop_duplicates(
    subset=['month', 'day', 'hour', 'minute', 'nums', 'alpha', 'small']
)
len(normal)

In [None]:
# Rebuild `data` by stacking the `normal` rows on top of the `anomaly` rows.
# The original index labels are kept (no ignore_index).
data = pd.concat([ normal, anomaly], axis=0)

In [None]:
# Number of rows in the rebuilt training frame.
len(data)

In [None]:
# Fraction of rows per target label.
data["target"].value_counts()/len(data)

In [None]:
# Per-frame multipliers that normalize_col applies before max-scaling.
m_train = 1
# NOTE(review): the constants look like ratios of class percentages
# (98.1/1.91 versus 53.3/46.7) — TODO confirm where they come from.
m_test = (98.1/1.91)/(53.3/46.7)
m_test


In [None]:
def normalize_col(col, train_df=None, test_df=None, train_scale=None, test_scale=None):
    """Rescale one numeric column of the train and test frames in place.

    Each frame's column is multiplied by its own factor, then both are divided
    by the maximum of the *scaled test* column, so the largest test value maps
    to 1 and the training column is expressed on that same scale.

    Args:
        col: Name of the column to rescale; must exist in both frames.
        train_df: Training frame to modify. Defaults to the notebook-level `data`.
        test_df: Test frame to modify. Defaults to the notebook-level `test`.
        train_scale: Multiplier for the training column. Defaults to `m_train`.
        test_scale: Multiplier for the test column. Defaults to `m_test`.

    Returns:
        None. Both frames are updated in place.

    Raises:
        ValueError: If the scaled test column has no usable maximum (empty,
            all-NaN or zero). Dividing by it would fill both frames with
            NaN/inf, so nothing is modified in that case.

    Note:
        Not idempotent: calling it twice on the same column rescales values
        that were already scaled.
    """
    # Fall back to the notebook globals only for arguments that were not given.
    train_df = data if train_df is None else train_df
    test_df = test if test_df is None else test_df
    train_scale = m_train if train_scale is None else train_scale
    test_scale = m_test if test_scale is None else test_scale

    scaled_train = train_df[col] * train_scale
    scaled_test = test_df[col] * test_scale

    # Series.max() skips NaN; the builtin max() returns NaN whenever the first
    # element is NaN, which would silently wipe out the whole column.
    max_test = scaled_test.max()
    if pd.isna(max_test) or max_test == 0:
        raise ValueError(
            f"cannot normalize column {col!r}: scaled test maximum is {max_test!r}"
        )

    # Assign only after validation so a failure leaves both frames untouched.
    train_df[col] = scaled_train / max_test
    test_df[col] = scaled_test / max_test

In [None]:
# List the column names of the training frame.
data.columns

In [None]:
# Rescale each listed numeric column in both `data` and `test`.
# 'small' and 'target' are not in the list and are left untouched.
# NOTE: normalize_col multiplies and divides the stored values, so re-running
# this cell rescales columns that were already scaled.
cols = ['month', 'day', 'hour', 'minute', 'nums', 'alpha']
for c in cols:
    print(c)
    normalize_col(c)

In [None]:
# Show five random rows of the training frame (unseeded, so the rows differ per run).
data.sample(5)

In [None]:
# Encode the label column as integers; `lb` stays available for decoding later.
lb = LabelEncoder()
target = lb.fit_transform(data["target"])

In [None]:
# Show which integer 'normal' was encoded to, and which label integer 1 decodes to.
lb.transform(['normal']), lb.inverse_transform([1])

In [None]:
# Hold out 10% of the rows, stratified on the encoded label, with a fixed seed.
# `data` is passed whole, so X_train / X_test still contain the 'target' and
# 'small' columns at this point.
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.1, random_state=42, stratify=target)

In [None]:
# Set aside the 'small' column of each frame and the test IDs as separate
# Series before removing them from the numeric feature frames.
train_small = X_train["small"]
test_small = X_test["small"]
test_data_small = test["small"]
test_ID = test["ID"]

# Rebind to new frames instead of dropping in place: X_train / X_test are
# selections produced by train_test_split, and in-place edits on derived frames
# are the chained-assignment pattern pandas warns about.
X_train = X_train.drop(columns=["target", "small"])
X_test = X_test.drop(columns=["target", "small"])
test = test.drop(columns=["ID", "small"])

In [None]:
# The frames are later converted with `.values`, so features are matched by
# position. Equal column counts are not enough: names and order must agree too.
expected_columns = list(X_train.columns)
assert list(X_test.columns) == expected_columns, (
    f"validation columns differ from training columns: {list(X_test.columns)}"
)
assert list(test.columns) == expected_columns, (
    f"test columns differ from training columns: {list(test.columns)}"
)
X_train.columns == test.columns

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def calculate_results(y_true, y_pred):
    """
    Score a binary classifier: accuracy, precision, recall and f1.

    Args:
    -----
    y_true = true labels in the form of a 1D array
    y_pred = predicted labels or scores; they are rounded to the nearest
             integer before scoring

    Returns a dictionary with keys "accuracy" (as a percentage), "precision",
    "recall" and "f1"; the last three use the "weighted" average.
    """
    # Turn scores into hard integer labels.
    hard_labels = np.round(y_pred, 0).astype(int)

    accuracy_pct = accuracy_score(y_true, hard_labels) * 100
    precision, recall, f1, _support = precision_recall_fscore_support(
        y_true, hard_labels, average="weighted"
    )

    return {
        "accuracy": accuracy_pct,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

In [None]:
import tensorflow_hub as hub
# Wrap the Universal Sentence Encoder (v4) from TF Hub as a Keras layer.
# We can use this encoding layer in place of our text_vectorizer and embedding layer
sentence_encoder_layer = hub.KerasLayer("https://tfhub.dev/google/universal-sentence-encoder/4",
                                        input_shape=[], # shape of inputs coming to our model 
                                        dtype=tf.string, # data type of inputs coming to the USE layer
                                        trainable=False, # keep the pretrained weights (we'll create a feature extractor)
                                        name="USE")

In [None]:
# Discard a model left over from an earlier run of the cells below.
# On a fresh kernel `model` does not exist yet, so a bare `del` would raise
# NameError and stop "Restart & Run All".
try:
    del model
except NameError:
    pass

In [None]:
tf.random.set_seed(42)

# Text branch: scalar string input -> sentence-encoder embedding.
text_input = tfl.Input(shape=[], dtype=tf.string, name="text_input")
text_embedding = sentence_encoder_layer(text_input)
# The embedding is reshaped to (32, 16), which gives the commented-out Conv1D
# experiments a sequence axis. With those layers disabled, Reshape followed by
# Flatten hands the same 512 values through unchanged.
text_x = tfl.Reshape((32, 16), input_shape=(512,), name="text_reshape")(text_embedding)
# text_x = tfl.Conv1D(32, 5, activation="relu", name="text_conv1d_1")(text_x)
# text_x = tfl.Conv1D(32, 5, activation="relu", name="text_conv1d_2")(text_x)
text_x = tfl.Flatten()(text_x)
# text_x = tfl.Dense(16, activation="relu", name="text_dense_1")(text_x)
# text_x = tfl.Dense(32, activation="relu", name="text_dense_2")(text_x)
# text_x = tfl.GlobalMaxPool1D(name="text_global_max_pooling1d")(text_x)

In [None]:
tf.random.set_seed(42)

# Numeric branch: one input unit per column of X_train, projected to 32 units.
number_input = tfl.Input(shape=(len(X_train.columns),), name="number_input")
number_x = tfl.Dense(32, activation="relu")(number_input)

# Merge both branches, squeeze through a 4-unit layer, and emit a single
# sigmoid probability.
x = tfl.Concatenate()([text_x, number_x])
x = tfl.Dense(4, activation="relu")(x)
# x = tfl.Dense(32, activation="relu")(x)
output = tfl.Dense(1, activation="sigmoid")(x)

# Input order matters: datasets must yield (text, numbers) in this order.
model  = Model(inputs=[text_input, number_input], outputs=output)

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

In [None]:
from tensorflow.keras.utils import plot_model
# Render the model graph with tensor shapes on each layer.
plot_model(model, show_shapes=True)

In [None]:
# Binary cross-entropy pairs with the single sigmoid output unit.
# Adam is used with its default settings; accuracy is tracked during training.
model.compile(loss="binary_crossentropy",
                optimizer="adam",
                metrics=["accuracy"])

In [None]:
BATCH = 128


def _make_dataset(text_values, number_values, targets=None, batch_size=BATCH):
    """Build a batched, prefetched tf.data pipeline for the two-input model.

    Args:
        text_values: Array of strings for the text input.
        number_values: 2D array of numeric features, row-aligned with `text_values`.
        targets: Optional array of labels, row-aligned with the features.
            When omitted the dataset yields inputs only.
        batch_size: Number of rows per batch.

    Returns:
        A dataset yielding ((text, numbers), targets) batches when `targets`
        is given, otherwise ((text, numbers),) batches. Rows keep their
        original order because nothing is shuffled.
    """
    features = tf.data.Dataset.zip((
        tf.data.Dataset.from_tensor_slices(text_values),
        tf.data.Dataset.from_tensor_slices(number_values),
    ))
    if targets is None:
        # Wrap the inputs in a 1-tuple so each element is "inputs only".
        dataset = tf.data.Dataset.zip((features,))
    else:
        labels = tf.data.Dataset.from_tensor_slices(targets)
        dataset = tf.data.Dataset.zip((features, labels))
    return dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)


# Training and validation sets carry labels; the competition test set does not.
train_dataset = _make_dataset(train_small.values, X_train.values, y_train)
test_dataset = _make_dataset(test_small.values, X_test.values, y_test)
final_test_dataset = _make_dataset(test_data_small.values, test.values)


In [None]:
# Train for a single epoch over every training batch. validation_steps limits
# validation to a tenth of the validation batches (integer division).
# NOTE(review): len(test_dataset)//10 is 0 when there are fewer than 10
# validation batches — confirm that case cannot occur or is handled as intended.
history_1 = model.fit(train_dataset,
                        epochs=1,
                        steps_per_epoch=len(train_dataset),
                        validation_data=test_dataset,
                        validation_steps=len(test_dataset)//10)

In [None]:
# Score the competition test set and round the sigmoid outputs to 0/1.
pred = model.predict(final_test_dataset)
pred = np.round(pred).astype(int)

# Build the prediction table column by column so IDs and labels keep their own
# dtypes (stacking them in one np.array forces a single shared dtype).
# reshape(-1) always yields a 1D vector, including for a single-row prediction.
pred_df = pd.DataFrame({
    "ID": test_ID.astype(str).to_numpy(),
    "target": pred.reshape(-1),
})
sample = pd.read_csv("/kaggle/input/dataset/sample_submission.csv")
sample["ID"] = sample["ID"].astype(str)

# Left-join onto the sample file so the submission keeps its row set and order.
final_sub = pd.merge(sample, pred_df, on="ID", how="left")
# Assign the filled column back: a chained `fillna(..., inplace=True)` on a
# column selection does not update the frame under pandas Copy-on-Write.
# NOTE(review): IDs without a prediction get encoded class 0 — confirm that is
# the intended default label.
final_sub["target"] = final_sub["target"].fillna(0)
# Replace the placeholder " Label" column from the sample file with the predictions.
final_sub = final_sub.drop(" Label", axis=1)
final_sub.columns = ["ID", " Label"]

# Decode the integer classes back to the original label strings and show the
# share of each predicted label.
final_sub[" Label"] = lb.inverse_transform(final_sub[" Label"].astype(int))
final_sub[" Label"].value_counts()/final_sub.shape[0]

In [None]:
# Score the held-out split. The dataset is batched without shuffling, so the
# prediction order lines up with y_test.
res_test = calculate_results(y_test, model.predict(test_dataset))
print("Test results: ", res_test)

# Same metrics on the training split, for comparison with the held-out scores.
res_train = calculate_results(y_train, model.predict(train_dataset))
print("Train results: ", res_train)

In [None]:
# Share of each predicted label in the submission.
final_sub[" Label"].value_counts()/final_sub.shape[0]  #1/2

In [None]:
# Write the submission to the working directory without the index column.
final_sub.to_csv("sub.csv", index=False)

In [None]:
!rm sub.zip

In [None]:
# Package the submission CSV into sub.zip.
!zip sub.zip sub.csv