In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import Model, Sequential
import tensorflow.keras.layers as tfl
from sklearn.metrics import accuracy_score

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score

2022-12-20 18:45:28.461527: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Preparing Data

In [2]:
train = pd.read_csv("data/train_final.csv")
test = pd.read_csv("data/test_final.csv")
sample = pd.read_csv("data/sample_submission.csv")

In [3]:
normal = train[train["target"] == "normal"]
anomaly = train[train["target"] != "normal"]

In [4]:
len(normal), len(anomaly)

(3931871, 69692)

In [5]:
normal.drop_duplicates(keep="first", inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  normal.drop_duplicates(keep="first", inplace=True)


In [6]:
len(normal)

1998979

In [7]:
data = pd.concat([normal, anomaly], axis=0)
data.head()

Unnamed: 0,target,month,day,hour,minute,nums,alpha,small,NULL DISCOVERY ERROR,NULL DISCOVERY SEVERE,NULL DISCOVERY WARNING,NULL HARDWARE WARNING,NULL MONITOR FAILURE,RAS APP FATAL,RAS KERNEL FATAL,RAS KERNEL INFO,RAS LINKCARD INFO
0,normal,0.068621,0.0,0.0,0.016326,0.000479,0.029706,instruction cache parity error corrected,0,0,0,0,0,0,0,1,0
64,normal,0.068621,0.0,0.0,0.019392,0.000479,0.029706,instruction cache parity error corrected,0,0,0,0,0,0,0,1,0
249,normal,0.068621,0.0,0.0,0.018225,0.000479,0.029706,instruction cache parity error corrected,0,0,0,0,0,0,0,1,0
645,normal,0.068621,0.0,0.0,0.023912,0.000479,0.029706,instruction cache parity error corrected,0,0,0,0,0,0,0,1,0
1044,normal,0.068621,0.0,0.0,0.017102,0.000479,0.029706,instruction cache parity error corrected,0,0,0,0,0,0,0,1,0


In [8]:
data["target"].value_counts()/len(data)

normal      0.966311
abnormal    0.033689
Name: target, dtype: float64

In [9]:
m_train = 1
m_test = (0.966311/0.033689)/(53/47)
m_test

25.436115702062768

In [10]:
def normalize_column(col):
    column_train = data[col]*m_train
    column_test = test[col]*m_test
    sc = StandardScaler()
    column_train = sc.fit_transform(column_train.values.reshape(-1, 1))
    column_test = sc.transform(column_test.values.reshape(-1, 1))
    data[col] = column_train
    test[col] = column_test

In [11]:
normalize_column("hour")

In [12]:
data.columns

Index(['target', 'month', 'day', 'hour', 'minute', 'nums', 'alpha', 'small',
       'NULL DISCOVERY ERROR', 'NULL DISCOVERY SEVERE',
       'NULL MONITOR FAILURE', 'RAS APP FATAL', 'RAS KERNEL FATAL',
       'RAS KERNEL INFO', 'RAS LINKCARD INFO'],
      dtype='object')

In [13]:
cols = ['month', 'day', 'minute', 'nums', 'alpha']
for c in cols:
    print(c)
    normalize_column(c)

month
day
minute
nums
alpha


In [14]:
data.head()

Unnamed: 0,target,month,day,hour,minute,nums,alpha,small,NULL DISCOVERY ERROR,NULL DISCOVERY SEVERE,NULL DISCOVERY WARNING,NULL HARDWARE WARNING,NULL MONITOR FAILURE,RAS APP FATAL,RAS KERNEL FATAL,RAS KERNEL INFO,RAS LINKCARD INFO
0,normal,1.723459,-0.194276,-0.493431,-0.239343,-0.274876,0.815973,instruction cache parity error corrected,0,0,0,0,0,0,0,1,0
64,normal,1.723459,-0.194276,-0.493431,0.440413,-0.274876,0.815973,instruction cache parity error corrected,0,0,0,0,0,0,0,1,0
249,normal,1.723459,-0.194276,-0.493431,0.181777,-0.274876,0.815973,instruction cache parity error corrected,0,0,0,0,0,0,0,1,0
645,normal,1.723459,-0.194276,-0.493431,1.442423,-0.274876,0.815973,instruction cache parity error corrected,0,0,0,0,0,0,0,1,0
1044,normal,1.723459,-0.194276,-0.493431,-0.06731,-0.274876,0.815973,instruction cache parity error corrected,0,0,0,0,0,0,0,1,0


In [15]:
test.head()

Unnamed: 0,ID,month,day,hour,minute,nums,alpha,small,NULL DISCOVERY ERROR,NULL DISCOVERY SEVERE,NULL DISCOVERY WARNING,NULL HARDWARE WARNING,NULL MONITOR FAILURE,RAS APP FATAL,RAS KERNEL FATAL,RAS KERNEL INFO,RAS LINKCARD INFO
0,0,-0.580228,-0.194276,8.092905,85.952783,-0.126736,51.461902,rts: kernel terminated for reason 1001,0,0,0,0,0,0,1,0,0
1,1,58.016633,136.247422,-0.493431,61.763226,100.112788,51.461902,data TLB error interrupt,0,0,0,0,0,0,1,0,0
2,2,58.016633,136.247422,36.363334,88.436575,100.112788,51.461902,data TLB error interrupt,0,0,0,0,0,0,1,0,0
3,3,58.016633,-0.027628,-0.465753,81.336313,100.112788,51.461902,generating core.6463,0,0,0,0,0,0,0,1,0
4,4,58.016633,136.247422,66.28254,62.593698,100.112788,51.461902,data TLB error interrupt,0,0,0,0,0,0,1,0,0


In [16]:
target = data["target"]
data.drop(["target"], axis=1, inplace=True)
test_ID = test["ID"]
test.drop(["ID"], axis=1, inplace=True)

In [17]:
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=42, stratify=target)

In [18]:
len(X_train), len(X_test)

(1654936, 413735)

In [19]:
train_small = X_train["small"]
X_train.drop(["small"], axis=1, inplace=True)

test_small = X_test["small"]
X_test.drop(["small"], axis=1, inplace=True)

test_final_small = test["small"]
test.drop(["small"], axis=1, inplace=True)

In [20]:
X_train.columns == test.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True])

## Dealing With `small` Column

In [None]:
train_small_tf = tf.convert_to_tensor(train_small.values.reshape(-1, 1))
test_small_tf = tf.convert_to_tensor(test_small.values.reshape(-1, 1))
test_final_small_tf = tf.convert_to_tensor(test_final_small.values.reshape(-1, 1))



In [25]:
import tensorflow_hub as hub
# We can use this encoding layer in place of our text_vectorizer and embedding layer
sentence_encoder_layer = hub.KerasLayer("https://tfhub.dev/google/universal-sentence-encoder/4",
                                        input_shape=[], # shape of inputs coming to our model 
                                        dtype=tf.string, # data type of inputs coming to the USE layer
                                        trainable=False, # keep the pretrained weights (we'll create a feature extractor)
                                        name="USE") 



2022-12-20 18:47:14.517139: W tensorflow/tsl/framework/cpu_allocator_impl.cc:82] Allocation of 34133760 exceeds 10% of free system memory.
2022-12-20 18:47:14.577895: W tensorflow/tsl/framework/cpu_allocator_impl.cc:82] Allocation of 34133760 exceeds 10% of free system memory.
2022-12-20 18:47:14.664863: W tensorflow/tsl/framework/cpu_allocator_impl.cc:82] Allocation of 34133760 exceeds 10% of free system memory.
2022-12-20 18:47:14.759346: W tensorflow/tsl/framework/cpu_allocator_impl.cc:82] Allocation of 34133760 exceeds 10% of free system memory.
2022-12-20 18:47:14.854790: W tensorflow/tsl/framework/cpu_allocator_impl.cc:82] Allocation of 34133760 exceeds 10% of free system memory.


In [26]:
normalize = Sequential([
    sentence_encoder_layer, # take in sentences and then encode them into an embedding
  tfl.Reshape((64, 8), input_shape=(512,)),
  tfl.Resizing(1,8),
    tfl.Flatten(),
])

Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


In [27]:
normalize.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 USE (KerasLayer)            (None, 512)               256797824 
                                                                 
 reshape (Reshape)           (None, 64, 8)             0         
                                                                 
 resizing (Resizing)         (1, 8, 8)                 0         
                                                                 
 flatten (Flatten)           (1, 64)                   0         
                                                                 
Total params: 256,797,824
Trainable params: 0
Non-trainable params: 256,797,824
_________________________________________________________________


In [29]:
train_small_tf_f = normalize(train_small_tf)

: 

: 