In [None]:
import numpy as np
import pandas as pd
import torch
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from transformers import BertTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, BertForSequenceClassification, DistilBertTokenizerFast, TFDistilBertForSequenceClassification
from torch.utils.data import Dataset, DataLoader
import tensorflow as tf

In [None]:
# Colab-only step: mount Google Drive at /content/drive so files stored
# there can be read through ordinary filesystem paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read the NATOPS CSV from the mounted Drive and report (rows, columns).
csv_path = '/content/drive/MyDrive/NATOPS_sid20-C_TRAIN_su.csv'
data = pd.read_csv(csv_path)
print("Shape of csv file :", data.shape)

Shape of csv file : (18360, 23)


In [None]:
# Preview the first rows to sanity-check the column names and values.
data.head()

Unnamed: 0,isTest,fea2,fea3,fea4,fea5,fea6,fea7,fea8,fea9,fea10,...,fea14,fea15,fea16,fea17,fea18,fea19,fea20,fea21,sid,class
0,0,0.117647,0.039216,0.0,0.137255,0.019608,0.098039,0.294118,0.0,0.0,...,0.0,0.0,0.0,0.0,0.137255,0.0,0.0,0.0,1,3
1,0,0.117647,0.039216,0.0,0.137255,0.019608,0.098039,0.294118,0.0,0.0,...,0.0,0.0,0.0,0.0,0.137255,0.0,0.0,0.0,1,3
2,0,0.117647,0.039216,0.0,0.137255,0.019608,0.098039,0.294118,0.0,0.0,...,0.0,0.0,0.0,0.0,0.137255,0.0,0.0,0.0,1,3
3,0,0.117647,0.039216,0.0,0.137255,0.019608,0.098039,0.294118,0.0,0.0,...,0.0,0.0,0.0,0.0,0.137255,0.0,0.0,0.0,1,3
4,0,0.117647,0.039216,0.0,0.137255,0.019608,0.098039,0.294118,0.0,0.0,...,0.0,0.0,0.0,0.0,0.137255,0.0,0.0,0.0,1,3


In [None]:
# Report whether any cell of the frame is null.
has_missing = data.isnull().values.any()
print("There are missing values" if has_missing else "There are no missing values")

There are no missing values


In [None]:
# The isTest flag carries the predefined partition:
# 0 -> training rows, 1 -> test rows.
train_data = data.loc[data['isTest'].eq(0)]
test_data = data.loc[data['isTest'].eq(1)]

In [None]:
# Confirm how many rows ended up in each partition.
train_shape, test_shape = train_data.shape, test_data.shape
print("Shape of train_data :", train_shape)
print("Shape of test_data :", test_shape)

Shape of train_data : (9180, 23)
Shape of test_data : (9180, 23)


In [None]:
# Everything except the partition flag, the sequence id and the label is
# treated as a model input; 'class' is the prediction target.
non_feature_cols = ['isTest', 'sid', 'class']

X_train_unscaled = train_data.drop(columns=non_feature_cols)
X_test_unscaled = test_data.drop(columns=non_feature_cols)
y_train = train_data['class']
y_test = test_data['class']

In [None]:
# Both feature matrices should have the same number of columns.
for caption, frame in (
    ("Shape of X_train_unscaled :", X_train_unscaled),
    ("Shape of X_test_unscaled :", X_test_unscaled),
):
    print(caption, frame.shape)

Shape of X_train_unscaled : (9180, 20)
Shape of X_test_unscaled : (9180, 20)


In [None]:
# The CSV values already share a common scale, but standardize anyway.
# The scaler's statistics are learned from the training split only and
# then applied unchanged to the test split.
scaler = StandardScaler().fit(X_train_unscaled)
X_train = scaler.transform(X_train_unscaled)
X_test = scaler.transform(X_test_unscaled)

In [None]:
# Shapes after scaling: features and labels must agree on row count.
for caption, arr in (
    ("Shape of X_train :", X_train),
    ("Shape of X_test :", X_test),
    ("Shape of y_train :", y_train),
    ("Shape of y_test :", y_test),
):
    print(caption, arr.shape)

Shape of X_train : (9180, 20)
Shape of X_test : (9180, 20)
Shape of y_train : (9180,)
Shape of y_test : (9180,)


In [None]:
# Shared reporting helper used for every fitted classifier below.
def evaluate_model(X_train, y_train, X_test, y_test, model, model_name):
    """Print train/test accuracy, confusion matrix and classification report.

    Parameters
    ----------
    X_train, y_train : training inputs and their true labels.
    X_test, y_test : test inputs and their true labels.
    model : an already-fitted estimator exposing ``predict``.
    model_name : label inserted into every printed line.

    Returns
    -------
    None. All results are written to stdout.
    """
    print("Evalutions for", model_name,":")

    # Accuracy on the data the model was fitted on.
    train_accuracy = accuracy_score(y_train, model.predict(X_train))
    print(f"Train Accuracy of {model_name} : {train_accuracy}")

    # Test predictions are computed once and reused by every test metric.
    y_test_pred = model.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    print(f"Test Accuracy of {model_name} : {test_accuracy}")

    print(f"\nConfusion Matrix of {model_name}:\n", confusion_matrix(y_test, y_test_pred))

    print(
        f"\nClassification report for {model_name}:\n",
        classification_report(y_test, y_test_pred),
    )

In [None]:
# Logistic regression baseline; the seed is fixed and the iteration cap is
# set to 500.
log_reg = LogisticRegression(
    max_iter=500,
    random_state=42,
)
log_reg.fit(X_train, y_train)

In [None]:
# Report train/test metrics for the logistic regression baseline.
evaluate_model(
    X_train, y_train, X_test, y_test,
    model=log_reg,
    model_name="logistic regression model",
)

Evalutions for logistic regression model :
Train Accuracy of logistic regression model : 0.7611111111111111
Test Accuracy of logistic regression model : 0.6666666666666666

Confusion Matrix of logistic regression model:
 [[1224  204  102    0    0    0]
 [  51  969  459   51    0    0]
 [ 102  867  510   51    0    0]
 [   0    0    0 1020  510    0]
 [   0    0    0  663  867    0]
 [   0    0    0    0    0 1530]]

Classification report for logistic regression model:
               precision    recall  f1-score   support

           0       0.89      0.80      0.84      1530
           1       0.47      0.63      0.54      1530
           2       0.48      0.33      0.39      1530
           3       0.57      0.67      0.62      1530
           4       0.63      0.57      0.60      1530
           5       1.00      1.00      1.00      1530

    accuracy                           0.67      9180
   macro avg       0.67      0.67      0.66      9180
weighted avg       0.67      0.67    

In [None]:
# Random forest with library-default hyperparameters and a fixed seed.
rfc = RandomForestClassifier(
    random_state=42,
)
rfc.fit(X_train, y_train)

In [None]:
# Report train/test metrics for the random forest.
evaluate_model(
    X_train, y_train, X_test, y_test,
    model=rfc,
    model_name="Random forest model",
)

Evalutions for Random forest model :
Train Accuracy of Random forest model : 0.9888888888888889
Test Accuracy of Random forest model : 0.6555555555555556

Confusion Matrix of Random forest model:
 [[1224  255   51    0    0    0]
 [ 204  765  561    0    0    0]
 [ 102  816  612    0    0    0]
 [   0    0    0  918  612    0]
 [   0    0    0  561  969    0]
 [   0    0    0    0    0 1530]]

Classification report for Random forest model:
               precision    recall  f1-score   support

           0       0.80      0.80      0.80      1530
           1       0.42      0.50      0.45      1530
           2       0.50      0.40      0.44      1530
           3       0.62      0.60      0.61      1530
           4       0.61      0.63      0.62      1530
           5       1.00      1.00      1.00      1530

    accuracy                           0.66      9180
   macro avg       0.66      0.66      0.66      9180
weighted avg       0.66      0.66      0.66      9180



In [None]:
# Multi-layer perceptron #1: two hidden layers (100 -> 50), ReLU,
# at most 100 training iterations, fixed seed.
mlp1 = MLPClassifier(
    hidden_layer_sizes=(100, 50),
    activation='relu',
    max_iter=100,
    random_state=42,
)
mlp1.fit(X_train, y_train)



In [None]:
# Report train/test metrics for the first MLP.
evaluate_model(
    X_train, y_train, X_test, y_test,
    model=mlp1,
    model_name="MLP model 1",
)

Evalutions for MLP model 1 :
Train Accuracy of MLP model 1 : 0.9666666666666667
Test Accuracy of MLP model 1 : 0.6888888888888889

Confusion Matrix of MLP model 1:
 [[1224  255    0   51    0    0]
 [ 102  714  612  102    0    0]
 [ 102  459  918    0   51    0]
 [   0   51    0 1071  408    0]
 [   0    0  102  510  918    0]
 [   0    0   51    0    0 1479]]

Classification report for MLP model 1:
               precision    recall  f1-score   support

           0       0.86      0.80      0.83      1530
           1       0.48      0.47      0.47      1530
           2       0.55      0.60      0.57      1530
           3       0.62      0.70      0.66      1530
           4       0.67      0.60      0.63      1530
           5       1.00      0.97      0.98      1530

    accuracy                           0.69      9180
   macro avg       0.69      0.69      0.69      9180
weighted avg       0.69      0.69      0.69      9180



In [None]:
# Multi-layer perceptron #2: three hidden layers (100 -> 50 -> 50).
# learning_rate_init is deliberately not passed; the author's note on this
# line mentioned 0.001.
mlp2 = MLPClassifier(
    hidden_layer_sizes=(100, 50, 50),
    activation='relu',
    max_iter=100,
    random_state=42,
)
mlp2.fit(X_train, y_train)



In [None]:
# Report train/test metrics for the second MLP.
evaluate_model(
    X_train, y_train, X_test, y_test,
    model=mlp2,
    model_name="MLP model 2",
)

Evalutions for MLP model 2 :
Train Accuracy of MLP model 2 : 0.9722222222222222
Test Accuracy of MLP model 2 : 0.7111111111111111

Confusion Matrix of MLP model 2:
 [[1224  255   51    0    0    0]
 [ 153  816  561    0    0    0]
 [  51  612  867    0    0    0]
 [   0    0    0 1275  255    0]
 [   0    0    0  612  918    0]
 [   0    0  102    0    0 1428]]

Classification report for MLP model 2:
               precision    recall  f1-score   support

           0       0.86      0.80      0.83      1530
           1       0.48      0.53      0.51      1530
           2       0.55      0.57      0.56      1530
           3       0.68      0.83      0.75      1530
           4       0.78      0.60      0.68      1530
           5       1.00      0.93      0.97      1530

    accuracy                           0.71      9180
   macro avg       0.72      0.71      0.71      9180
weighted avg       0.72      0.71      0.71      9180



In [None]:
# Multi-layer perceptron #3: same architecture as mlp2 but with an
# explicit initial learning rate of 0.003.
mlp3 = MLPClassifier(
    hidden_layer_sizes=(100, 50, 50),
    activation='relu',
    learning_rate_init=0.003,
    max_iter=100,
    random_state=42,
)
mlp3.fit(X_train, y_train)

In [None]:
# Report train/test metrics for the third MLP.
evaluate_model(
    X_train, y_train, X_test, y_test,
    model=mlp3,
    model_name="MLP model 3",
)

Evalutions for MLP model 3 :
Train Accuracy of MLP model 3 : 0.9888888888888889
Test Accuracy of MLP model 3 : 0.6944444444444444

Confusion Matrix of MLP model 3:
 [[1122  357   51    0    0    0]
 [ 102  816  612    0    0    0]
 [ 102  612  816    0    0    0]
 [   0   51    0 1275  204    0]
 [   0    0    0  663  867    0]
 [   0    0   51    0    0 1479]]

Classification report for MLP model 3:
               precision    recall  f1-score   support

           0       0.85      0.73      0.79      1530
           1       0.44      0.53      0.48      1530
           2       0.53      0.53      0.53      1530
           3       0.66      0.83      0.74      1530
           4       0.81      0.57      0.67      1530
           5       1.00      0.97      0.98      1530

    accuracy                           0.69      9180
   macro avg       0.72      0.69      0.70      9180
weighted avg       0.72      0.69      0.70      9180



In [None]:
# Soft-voting ensemble of the three MLPs: average their per-class
# probabilities and pick the most probable class for every row.
preds1 = mlp1.predict_proba(X_test)
preds2 = mlp2.predict_proba(X_test)
preds3 = mlp3.predict_proba(X_test)

ensemble_preds = np.mean([preds1, preds2, preds3], axis=0)
# predict_proba columns are ordered like classes_, so translate the argmax
# column index back into a label rather than assuming labels are 0..n-1.
# All three models were fitted on the same y_train, so mlp1.classes_ is
# used for the shared ordering.
final_predictions = mlp1.classes_[np.argmax(ensemble_preds, axis=1)]

# The training score needs predictions made on the TRAINING inputs. The
# earlier version compared y_train with the test-set predictions, which
# yields a meaningless number.
train_ensemble_preds = np.mean(
    [clf.predict_proba(X_train) for clf in (mlp1, mlp2, mlp3)], axis=0
)
train_predictions = mlp1.classes_[np.argmax(train_ensemble_preds, axis=1)]

t_accuracy = accuracy_score(y_train, train_predictions)
accuracy = accuracy_score(y_test, final_predictions)
print("Ensemble Model train Accuracy :", t_accuracy)
print("Ensemble Model Accuracy :", accuracy)

Ensemble Model train Accuracy : 0.15
Ensemble Model Accuracy : 0.7166666666666667


In [None]:
# Display the full frame (pandas truncates the rendering to the first and
# last rows) before building the text inputs for the transformer.
data

Unnamed: 0,isTest,fea2,fea3,fea4,fea5,fea6,fea7,fea8,fea9,fea10,...,fea14,fea15,fea16,fea17,fea18,fea19,fea20,fea21,sid,class
0,0,0.117647,0.039216,0.0,0.137255,0.019608,0.098039,0.294118,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.137255,0.0,0.0,0.0,1,3
1,0,0.117647,0.039216,0.0,0.137255,0.019608,0.098039,0.294118,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.137255,0.0,0.0,0.0,1,3
2,0,0.117647,0.039216,0.0,0.137255,0.019608,0.098039,0.294118,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.137255,0.0,0.0,0.0,1,3
3,0,0.117647,0.039216,0.0,0.137255,0.019608,0.098039,0.294118,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.137255,0.0,0.0,0.0,1,3
4,0,0.117647,0.039216,0.0,0.137255,0.019608,0.098039,0.294118,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.137255,0.0,0.0,0.0,1,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18355,1,0.215686,0.000000,0.0,0.196078,0.078431,0.156863,0.000000,0.0,0.0,...,0.0,0.0,0.078431,0.098039,0.156863,0.0,0.0,0.0,360,3
18356,1,0.215686,0.000000,0.0,0.196078,0.078431,0.156863,0.000000,0.0,0.0,...,0.0,0.0,0.078431,0.098039,0.156863,0.0,0.0,0.0,360,3
18357,1,0.215686,0.000000,0.0,0.196078,0.078431,0.156863,0.000000,0.0,0.0,...,0.0,0.0,0.078431,0.098039,0.156863,0.0,0.0,0.0,360,3
18358,1,0.215686,0.000000,0.0,0.196078,0.078431,0.156863,0.000000,0.0,0.0,...,0.0,0.0,0.078431,0.098039,0.156863,0.0,0.0,0.0,360,3


In [None]:
# Names of the 20 feature columns: fea2 .. fea21.
feature_columns = [f'fea{i}' for i in range(2, 22)]

# The transformer consumes text, so every row's feature values are rendered
# as a single space-separated string.
X = data[feature_columns].astype(str).apply(lambda row: ' '.join(row.values), axis=1)
y = data['class'].astype(int)

# Split on the dataset's own isTest flag (0 = train, 1 = test), as the
# classical models do, instead of a random row-level split. Consecutive rows
# of one `sid` are identical in the preview above, so a random split puts
# duplicates of the same recording in both partitions and inflates the test
# score.
# NOTE(review): assumes no sid appears under both isTest values - confirm.
train_mask = data['isTest'] == 0
test_mask = data['isTest'] == 1

# Shuffle the training rows once (seeded) so that batches drawn later are
# not dominated by a single recording or class.
# NOTE: this rebinds X_train / X_test / y_train / y_test; re-run the scaling
# cell before re-running any of the scikit-learn model cells.
X_train = X[train_mask].sample(frac=1, random_state=42)
y_train = y.loc[X_train.index]
X_test = X[test_mask]
y_test = y[test_mask]

In [None]:
# Transformer: fine-tune DistilBERT on the stringified feature rows.
# NOTE(review): the inputs are numbers rendered as text, so the word-piece
# tokenizer splits them into digit fragments - confirm this is intended.
BATCH_SIZE = 16
EPOCHS = 3
SHUFFLE_BUFFER = 1000
NUM_LABELS = 6  # classes 0-5, as listed in the classification reports

# Define the tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Tokenize the inputs; each split is padded to its own longest sequence.
train_encodings = tokenizer([str(text) for text in X_train], truncation=True, padding=True)
test_encodings = tokenizer([str(text) for text in X_test], truncation=True, padding=True)

# Prepare the training dataset
classification_train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    list(y_train)
))

# Prepare the test dataset
classification_test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings),
    list(y_test)
))

# Define the model
model = TFDistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased', num_labels=NUM_LABELS
)

# The model outputs raw logits and the labels are integer class ids.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

# Batching is done by the tf.data pipeline, so `batch_size` is not passed
# to fit/evaluate (Keras says not to specify it for Dataset inputs).
train_batches = classification_train_dataset.shuffle(SHUFFLE_BUFFER).batch(BATCH_SIZE)
# Evaluation metrics do not depend on row order, so the test set is only
# batched, not shuffled.
test_batches = classification_test_dataset.batch(BATCH_SIZE)

# Train the model
history = model.fit(train_batches, epochs=EPOCHS)

# Evaluate the model; results is [loss, accuracy] in compile order.
results = model.evaluate(test_batches)
print('Test loss:', results[0])
print('Test accuracy:', results[1])

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

Epoch 1/3
Epoch 2/3
Epoch 3/3
Test loss: 0.07962657511234283
Test accuracy: 0.9648148417472839
