## Imports and tools


In [1]:
import numpy as np
import pandas as pd
from transformers import TrainingArguments
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer
from transformers import BertTokenizer

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import OneHotEncoder
# from transformers import CharTokenizer
from sklearn.feature_extraction.text import CountVectorizer
import re

from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score,accuracy_score,classification_report

In [2]:
from torch.utils.data import TensorDataset, DataLoader
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device


device(type='cuda')

# Transformer Experiment


In [3]:
import gdown
# Download the CSV file from Google Drive
url = 'https://drive.google.com/'
output = 'data.csv'
gdown.download(url, output, quiet=False)

ModuleNotFoundError: No module named 'gdown'

In [4]:

# %cd /content/drive/MyDrive/speed/vision/malware-analysis
data = pd.read_csv('data.csv')
data.head()

Unnamed: 0,MD5,Released Date,App Name,Package Name,AV-Rank,SHA256,SHA1,Version,Apk Size,malware
0,e1d7b9140b18aeba6860a6f800d5f5a0,2020-06-27 08:04:27,Coronavirus Help,appinventor.ai_david_taylor.Coronavirus_help2020,1,f6b2680ada05d9b575e5d9f1e50d60c037ef51460f86f2...,0e1a86d155f947180ef0d0134d7946dcdf2e9866,1.1,6151195,1
1,018c277310a26c282c2b13d87588c04e,2020-06-27 07:21:34,COVID-19 in HK,org.chromium.webapk.ab640bf847b1ddee8,0,da4470ab9f8badd2022e2c3eaa992ea252ac5e952c52ce...,a6ba510f42f5ba56487b68886e25ec6ffc08ca77,29,333060,0
2,c46df19bb879da90c8eb9a1f363c2e5e,2020-06-27 07:06:10,SM-Covid-19,it.softmining.projects.covid19.savelifestyle,0,57a907c6411115f4ba4b29ffceec67f4fb2de01068c246...,fa9b5c4bfeaceff1e9077ffbec298102cd0cf2a7,4.1,5133947,0
3,cbae9a40ca0cc1887a3a5d237f086f41,2020-06-27 04:16:51,COVID-19 CDMX,mx.gob.cdmx.adip.covid19cdmx,0,d152238137dd5b1dfb07fa7f1e8431531fcbb651e8d069...,030293f11a50c46022380cf6bd9dd989620b446b,1.0.3,7973144,0
4,61886afb3928d41158775ea0a86cfa36,2020-06-26 20:20:43,Coronavirus Advice,appinventor.ai_david_taylor.Coronavirus_help2020,0,95e7833dcac3605a551bab2f41045fd5b55ba9687b28bc...,9351606c7afed56c647766aa712bb72d3c2f81bf,1,6134811,0


In [5]:
data['malware'].value_counts()

0    2040
1     338
Name: malware, dtype: int64

# Transformer experiment

### A character tokenizer



In [6]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

## Collecting and preprocessing the data:




In [7]:
X = data[['Package Name', 'Apk Size']]
y = data['malware']

## Splitting the data



In [48]:
import pandas as pd
from sklearn.model_selection import train_test_split

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Assuming X, y are your data and labels, and they are in pandas DataFrame format
data = pd.concat([X, y], axis=1)

# Split into train and test
train_data, test_data = train_test_split(data, test_size=0.3, random_state=42)

# In train dataset, find the majority class (assumes binary 0/1 target variable)
majority_class = train_data[y.name].mode()[0]
minority_class = 1 - majority_class

# Split the train dataset into majority and minority
majority_data = train_data[train_data[y.name] == majority_class]
minority_data = train_data[train_data[y.name] == minority_class]

# Oversample the minority class to have the same number of samples as the majority class
oversampled_minority_data = minority_data.sample(len(majority_data), replace=True, random_state=42)

# Concatenate the majority data and oversampled minority data
balanced_train_data = pd.concat([majority_data, oversampled_minority_data])

# Split the data back into X_train and y_train
X_train = balanced_train_data.drop(y.name, axis=1)
y_train = balanced_train_data[y.name]
# For the test set, you can just split the data and labels
X_test = test_data.drop(y.name, axis=1)
y_test = test_data[y.name]

In [29]:
majority_class, minority_class

(0, 1)

In [49]:
y_train.value_counts()

0    1420
1    1420
Name: malware, dtype: int64

In [31]:
y_test.value_counts()

0    620
1     94
Name: malware, dtype: int64

## Tokenizing the package names:




In [4]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


class PackageNameDataset(Dataset):
    def __init__(self, tokens, labels):
        self.tokens = tokens
        self.labels = labels

    def __len__(self):
        return len(self.tokens)

    def __getitem__(self, idx):
      tokens = torch.tensor(self.tokens[idx])
      label = torch.tensor(self.labels[idx])
      return tokens, label

# Tokenize and convert the package names to numerical data
from torch.nn.utils.rnn import pad_sequence

# Define the maximum length
MAX_LEN = 20

# Pad the sequences
# Pad and truncate the sequences
X_train_indices = [
    torch.tensor(tokenizer.encode(' '.join(['Package Name']+row['Package Name'].split(".")+['Apk Size ',str(row['Apk Size'])]), add_special_tokens=True, max_length=MAX_LEN, pad_to_max_length=True)) for i,row in X_train.iterrows()
]
X_test_indices = [
    torch.tensor(tokenizer.encode(' '.join(['Package Name']+row['Package Name'].split(".")+['Apk Size ',str(row['Apk Size'])]), add_special_tokens=True, max_length=MAX_LEN, pad_to_max_length=True)) for i,row in X_test.iterrows()
    ]

# Create the Dataset
train_dataset = PackageNameDataset(X_train_indices, y_train.values)
test_dataset = PackageNameDataset(X_test_indices, y_test.values)

# Create the DataLoader
batch_size = 128
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

Downloading:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

NameError: name 'X_train' is not defined

In [60]:
X_train.shape

(2840, 2)

In [63]:
X_train_indices[0]

tensor([  101,  7427,  2171,  7327,  1058, 17751,  7637,  9706,  2243,  2946,
        28202,  2581,  2575,  2620,   102,     0,     0,     0,     0,     0])

## Defining Transformer Model



In [61]:
data = next(iter(train_dataloader))
print("Shape of the entire dataset:", (len(train_dataloader.dataset),) + data[0].shape[1:])

Shape of the entire dataset: (2840, 20)


  tokens = torch.tensor(self.tokens[idx])


In [89]:
from transformers import BertForSequenceClassification, AdamW, BertConfig

# Load a pre-trained BERT model
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", # Use the smaller "base" version
    num_labels = 2, # The number of output labels
    output_attentions = True, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)

import torch.nn as nn
import torch.optim as optim

# Freeze the layers
for i,param in enumerate(model.bert.parameters()):
    if i<150:
        param.requiresGrad = False
    

# Add a classification layer
model.classifier = nn.Linear(768, 2)

model = model.to(device)


# Define the loss function
criterion = nn.CrossEntropyLoss()

# Define the optimizer
optimizer = optim.Adam(model.parameters(), lr=2e-6)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

### Training



In [90]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast
from tqdm import tqdm
from torch.nn.utils.rnn import pad_sequence

# Train the model
epochs = 50
for epoch in range(epochs):
    running_loss = 0.0
    total_predictions = 0.0
    correct_predictions = 0.0

    # Progress bar
    progress_bar = tqdm(enumerate(train_dataloader), total=len(train_dataloader))

    model.train()

    for i, (tokens, labels) in progress_bar:
        tokens = tokens.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=tokens)
        _, predicted = torch.max(outputs.logits, 1)
        total_predictions += labels.size(0)
        correct_predictions += (predicted == labels).sum().item()

        loss = criterion(outputs.logits, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        progress_bar.set_description(f"Epoch {epoch+1} Iteration {i+1}: loss {loss.item():.5f}.")

    epoch_loss = running_loss / len(train_dataloader)
    epoch_acc = (correct_predictions / total_predictions) * 100.0

    print(f'Epoch {epoch + 1}: Loss: {epoch_loss:.4f}, Accuracy: {epoch_acc:.2f}%')

    model.eval()
    valid_running_loss = 0.0

    with torch.no_grad():
        for i, (tokens, labels) in enumerate(test_dataloader):
            tokens = tokens.to(device)
            labels = labels.to(device)

            outputs = model(input_ids=tokens)
            loss = criterion(outputs.logits, labels)
            valid_running_loss += loss.item()

    epoch_valid_loss = valid_running_loss / len(test_dataloader)
    print(f'Validation Loss: {epoch_valid_loss:.4f}')

print('Finished Training')

  tokens = torch.tensor(self.tokens[idx])
Epoch 1 Iteration 23: loss 0.84323.: 100%|██████████| 23/23 [00:02<00:00,  9.92it/s]


Epoch 1: Loss: 0.7662, Accuracy: 31.65%
Validation Loss: 0.6059


Epoch 2 Iteration 23: loss 0.79076.: 100%|██████████| 23/23 [00:02<00:00,  9.95it/s]


Epoch 2: Loss: 0.6964, Accuracy: 52.46%
Validation Loss: 0.6216


Epoch 3 Iteration 23: loss 0.70656.: 100%|██████████| 23/23 [00:02<00:00,  9.76it/s]


Epoch 3: Loss: 0.6852, Accuracy: 56.30%
Validation Loss: 0.6381


Epoch 4 Iteration 23: loss 0.65727.: 100%|██████████| 23/23 [00:02<00:00,  9.49it/s]


Epoch 4: Loss: 0.6745, Accuracy: 59.93%
Validation Loss: 0.6519


Epoch 5 Iteration 23: loss 0.66987.: 100%|██████████| 23/23 [00:02<00:00,  9.64it/s]


Epoch 5: Loss: 0.6670, Accuracy: 62.08%
Validation Loss: 0.6514


Epoch 6 Iteration 23: loss 0.63023.: 100%|██████████| 23/23 [00:02<00:00,  9.58it/s]


Epoch 6: Loss: 0.6579, Accuracy: 63.24%
Validation Loss: 0.6342


Epoch 7 Iteration 23: loss 0.59258.: 100%|██████████| 23/23 [00:02<00:00,  9.60it/s]


Epoch 7: Loss: 0.6368, Accuracy: 65.60%
Validation Loss: 0.6238


Epoch 8 Iteration 23: loss 0.54983.: 100%|██████████| 23/23 [00:02<00:00,  9.77it/s]


Epoch 8: Loss: 0.6115, Accuracy: 68.27%
Validation Loss: 0.6212


Epoch 9 Iteration 23: loss 0.50673.: 100%|██████████| 23/23 [00:02<00:00,  9.88it/s]


Epoch 9: Loss: 0.5973, Accuracy: 68.66%
Validation Loss: 0.6030


Epoch 10 Iteration 23: loss 0.51601.: 100%|██████████| 23/23 [00:02<00:00,  9.76it/s]


Epoch 10: Loss: 0.5856, Accuracy: 70.88%
Validation Loss: 0.5827


Epoch 11 Iteration 23: loss 0.48235.: 100%|██████████| 23/23 [00:02<00:00,  9.31it/s]


Epoch 11: Loss: 0.5687, Accuracy: 71.80%
Validation Loss: 0.5701


Epoch 12 Iteration 23: loss 0.52725.: 100%|██████████| 23/23 [00:02<00:00,  9.47it/s]


Epoch 12: Loss: 0.5650, Accuracy: 72.18%
Validation Loss: 0.5545


Epoch 13 Iteration 23: loss 0.47303.: 100%|██████████| 23/23 [00:02<00:00,  9.52it/s]


Epoch 13: Loss: 0.5472, Accuracy: 73.52%
Validation Loss: 0.5293


Epoch 14 Iteration 23: loss 0.50919.: 100%|██████████| 23/23 [00:02<00:00,  9.32it/s]


Epoch 14: Loss: 0.5399, Accuracy: 74.37%
Validation Loss: 0.5133


Epoch 15 Iteration 23: loss 0.45005.: 100%|██████████| 23/23 [00:02<00:00,  9.48it/s]


Epoch 15: Loss: 0.5224, Accuracy: 75.81%
Validation Loss: 0.4959


Epoch 16 Iteration 23: loss 0.40340.: 100%|██████████| 23/23 [00:02<00:00,  9.50it/s]


Epoch 16: Loss: 0.5081, Accuracy: 76.69%
Validation Loss: 0.4865


Epoch 17 Iteration 23: loss 0.44225.: 100%|██████████| 23/23 [00:02<00:00,  9.53it/s]


Epoch 17: Loss: 0.5065, Accuracy: 77.08%
Validation Loss: 0.4593


Epoch 18 Iteration 23: loss 0.46702.: 100%|██████████| 23/23 [00:02<00:00,  9.50it/s]


Epoch 18: Loss: 0.4917, Accuracy: 79.30%
Validation Loss: 0.4334


Epoch 19 Iteration 23: loss 0.41882.: 100%|██████████| 23/23 [00:02<00:00,  9.52it/s]


Epoch 19: Loss: 0.4761, Accuracy: 79.51%
Validation Loss: 0.4278


Epoch 20 Iteration 23: loss 0.40875.: 100%|██████████| 23/23 [00:02<00:00,  9.52it/s]


Epoch 20: Loss: 0.4618, Accuracy: 80.39%
Validation Loss: 0.4157


Epoch 21 Iteration 23: loss 0.36553.: 100%|██████████| 23/23 [00:02<00:00,  9.50it/s]


Epoch 21: Loss: 0.4544, Accuracy: 80.07%
Validation Loss: 0.3947


Epoch 22 Iteration 23: loss 0.39117.: 100%|██████████| 23/23 [00:02<00:00,  9.50it/s]


Epoch 22: Loss: 0.4353, Accuracy: 81.65%
Validation Loss: 0.3931


Epoch 23 Iteration 23: loss 0.39759.: 100%|██████████| 23/23 [00:02<00:00,  9.49it/s]


Epoch 23: Loss: 0.4281, Accuracy: 81.55%
Validation Loss: 0.3794


Epoch 24 Iteration 23: loss 0.36092.: 100%|██████████| 23/23 [00:02<00:00,  9.52it/s]


Epoch 24: Loss: 0.4106, Accuracy: 82.96%
Validation Loss: 0.3747


Epoch 25 Iteration 23: loss 0.31307.: 100%|██████████| 23/23 [00:02<00:00,  9.39it/s]


Epoch 25: Loss: 0.4004, Accuracy: 82.99%
Validation Loss: 0.3775


Epoch 26 Iteration 23: loss 0.31171.: 100%|██████████| 23/23 [00:02<00:00,  9.45it/s]


Epoch 26: Loss: 0.4011, Accuracy: 83.42%
Validation Loss: 0.3533


Epoch 27 Iteration 23: loss 0.31296.: 100%|██████████| 23/23 [00:02<00:00,  9.43it/s]


Epoch 27: Loss: 0.3792, Accuracy: 84.65%
Validation Loss: 0.3650


Epoch 28 Iteration 23: loss 0.28389.: 100%|██████████| 23/23 [00:02<00:00,  9.45it/s]


Epoch 28: Loss: 0.3717, Accuracy: 84.37%
Validation Loss: 0.3442


Epoch 29 Iteration 23: loss 0.27253.: 100%|██████████| 23/23 [00:02<00:00,  9.48it/s]


Epoch 29: Loss: 0.3585, Accuracy: 84.65%
Validation Loss: 0.3484


Epoch 30 Iteration 23: loss 0.29990.: 100%|██████████| 23/23 [00:02<00:00,  9.44it/s]


Epoch 30: Loss: 0.3554, Accuracy: 85.53%
Validation Loss: 0.3350


Epoch 31 Iteration 23: loss 0.27043.: 100%|██████████| 23/23 [00:02<00:00,  9.47it/s]


Epoch 31: Loss: 0.3390, Accuracy: 85.77%
Validation Loss: 0.3310


Epoch 32 Iteration 23: loss 0.21636.: 100%|██████████| 23/23 [00:02<00:00,  9.36it/s]


Epoch 32: Loss: 0.3225, Accuracy: 87.15%
Validation Loss: 0.3285


Epoch 33 Iteration 23: loss 0.24489.: 100%|██████████| 23/23 [00:02<00:00,  9.42it/s]


Epoch 33: Loss: 0.3201, Accuracy: 86.87%
Validation Loss: 0.3317


Epoch 34 Iteration 23: loss 0.16708.: 100%|██████████| 23/23 [00:02<00:00,  9.39it/s]


Epoch 34: Loss: 0.2981, Accuracy: 88.10%
Validation Loss: 0.3124


Epoch 35 Iteration 23: loss 0.18423.: 100%|██████████| 23/23 [00:02<00:00,  9.45it/s]


Epoch 35: Loss: 0.2917, Accuracy: 87.96%
Validation Loss: 0.3102


Epoch 36 Iteration 23: loss 0.16782.: 100%|██████████| 23/23 [00:02<00:00,  9.46it/s]


Epoch 36: Loss: 0.2821, Accuracy: 88.73%
Validation Loss: 0.2980


Epoch 37 Iteration 23: loss 0.15046.: 100%|██████████| 23/23 [00:02<00:00,  9.43it/s]


Epoch 37: Loss: 0.2675, Accuracy: 89.19%
Validation Loss: 0.3040


Epoch 38 Iteration 23: loss 0.20085.: 100%|██████████| 23/23 [00:02<00:00,  9.41it/s]


Epoch 38: Loss: 0.2629, Accuracy: 89.33%
Validation Loss: 0.2929


Epoch 39 Iteration 23: loss 0.12719.: 100%|██████████| 23/23 [00:02<00:00,  9.47it/s]


Epoch 39: Loss: 0.2508, Accuracy: 89.65%
Validation Loss: 0.2948


Epoch 40 Iteration 23: loss 0.12825.: 100%|██████████| 23/23 [00:02<00:00,  9.47it/s]


Epoch 40: Loss: 0.2362, Accuracy: 90.32%
Validation Loss: 0.2975


Epoch 41 Iteration 23: loss 0.15691.: 100%|██████████| 23/23 [00:02<00:00,  9.47it/s]


Epoch 41: Loss: 0.2301, Accuracy: 91.06%
Validation Loss: 0.2818


Epoch 42 Iteration 23: loss 0.10604.: 100%|██████████| 23/23 [00:02<00:00,  9.49it/s]


Epoch 42: Loss: 0.2036, Accuracy: 92.22%
Validation Loss: 0.2911


Epoch 43 Iteration 23: loss 0.11651.: 100%|██████████| 23/23 [00:02<00:00,  9.50it/s]


Epoch 43: Loss: 0.2082, Accuracy: 91.69%
Validation Loss: 0.2828


Epoch 44 Iteration 23: loss 0.10810.: 100%|██████████| 23/23 [00:02<00:00,  9.46it/s]


Epoch 44: Loss: 0.1829, Accuracy: 93.06%
Validation Loss: 0.2905


Epoch 45 Iteration 23: loss 0.07392.: 100%|██████████| 23/23 [00:02<00:00,  9.43it/s]


Epoch 45: Loss: 0.1798, Accuracy: 93.03%
Validation Loss: 0.2948


Epoch 46 Iteration 23: loss 0.05561.: 100%|██████████| 23/23 [00:02<00:00,  9.40it/s]


Epoch 46: Loss: 0.1687, Accuracy: 93.84%
Validation Loss: 0.2845


Epoch 47 Iteration 23: loss 0.05978.: 100%|██████████| 23/23 [00:02<00:00,  9.50it/s]


Epoch 47: Loss: 0.1606, Accuracy: 93.91%
Validation Loss: 0.2839


Epoch 48 Iteration 23: loss 0.07811.: 100%|██████████| 23/23 [00:02<00:00,  9.44it/s]


Epoch 48: Loss: 0.1449, Accuracy: 94.82%
Validation Loss: 0.2953


Epoch 49 Iteration 23: loss 0.05834.: 100%|██████████| 23/23 [00:02<00:00,  9.45it/s]


Epoch 49: Loss: 0.1507, Accuracy: 94.30%
Validation Loss: 0.2894


Epoch 50 Iteration 23: loss 0.06025.: 100%|██████████| 23/23 [00:02<00:00,  9.47it/s]


Epoch 50: Loss: 0.1393, Accuracy: 95.18%
Validation Loss: 0.2842
Finished Training


## Testing model


## definine the metrics function




In [91]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

def get_metrics(y_true, y_pred):
    metrics = {}

    # Calculate accuracy
    accuracy = accuracy_score(y_true, y_pred)
    metrics['Accuracy'] = f'{accuracy*100:.2f}%'

    # Calculate precision, recall, and F1 score
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    metrics['Precision'] = f'{precision*100:.2f}%'
    metrics['Recall'] = f'{recall*100:.2f}%'
    metrics['F1 Score'] = f'{f1*100:.2f}%'

    # Calculate ROC AUC score
    roc_auc = roc_auc_score(y_true, y_pred)
    metrics['ROC AUC'] = f'{roc_auc*100:.2f}%'

    return metrics


In [93]:
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

# Evaluate the model on the validation set
model.eval()  # Set the model to evaluation mode

# Initialize variables to store the total loss and number of correct predictions


y_true = []
y_pred = []
# Iterate over the validation data
for tokens, labels in test_dataloader:
    # Move the inputs to the device
    tokens = tokens.to(device)
    labels = labels.to(device)

    # Forward pass
    with torch.no_grad():
        outputs = model(input_ids=tokens, labels=labels)

    # Get the predicted labels
    _, predicted = torch.max(outputs.logits, 1)

    y_true.extend(labels.tolist())
    y_pred.extend(predicted.tolist())

get_metrics(y_true, y_pred)

  tokens = torch.tensor(self.tokens[idx])


{'Accuracy': '90.20%',
 'Precision': '61.54%',
 'Recall': '68.09%',
 'F1 Score': '64.65%',
 'ROC AUC': '80.82%'}

# Support Vector Machine (SVM) classification


In [81]:
import pandas as pd
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import recall_score, precision_score

vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform([str(text) for text in X_train.values])
X_test_tfidf = vectorizer.transform([str(text) for text in X_test.values])

# Instantiate the model
svm_model = svm.SVC(kernel='linear')

# Fit the model on the training data
svm_model.fit(X_train_tfidf, y_train)

# Get predictions on the test data
y_pred = svm_model.predict(X_test_tfidf)

get_metrics(y_true, y_pred)

{'Accuracy': '89.78%',
 'Precision': '66.67%',
 'Recall': '44.68%',
 'F1 Score': '53.50%',
 'ROC AUC': '70.65%'}

### Using SVM classification, we have achieved quite lower recall than using BERT Transformer model


# Naive Bayes


In [82]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform([str(text) for text in X_train.values])
X_test_tfidf = vectorizer.transform([str(text) for text in X_test.values])

clf = MultinomialNB()

# Train the classifier
clf.fit(X_train_tfidf, y_train)

# Make predictions on the test data
y_pred = clf.predict(X_test_tfidf)

get_metrics(y_true, y_pred)


{'Accuracy': '85.99%',
 'Precision': '46.94%',
 'Recall': '48.94%',
 'F1 Score': '47.92%',
 'ROC AUC': '70.27%'}