In [8]:
# ! pip install -q autoviz
# ! pip install -q -U --pre pycaret
# ! pip -q install transformers
# ! pip -q install torch

In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from scipy import stats
from statsmodels.stats.outliers_influence import variance_inflation_factor
from autoviz.classify_method import data_cleaning_suggestions ,data_suggestions
from pycaret import regression
from sklearn.model_selection import cross_val_score

ImportError: DLL load failed while importing _path: The specified module could not be found.

In [None]:
# Read just the first 1000 rows of the URL dataset into a DataFrame
df = pd.read_csv(
    '../dataset/malicious_phish.csv',
    nrows=1000,
)

In [None]:
# Preview the first rows of the loaded frame
df.head()

In [None]:
# (n_rows, n_columns) of the loaded frame
df.shape

## EDA

In [None]:
# Print a concise summary of the frame: columns, non-null counts, dtypes, memory usage
df.info()

In [None]:
# dtype of every column, before any encoding is applied
df.dtypes


In [None]:
# autoviz helper imported at the top of the notebook.
# NOTE(review): presumably reports per-column cleaning suggestions for `df` — confirm against the autoviz docs.
data_cleaning_suggestions(df)


## 1. Handling Categorical Values

In [None]:
from sklearn.preprocessing import LabelEncoder


In [None]:
# Collect the names of all object-dtype columns as a plain Python list
object_columns = df.select_dtypes(include=['object']).columns
cat_cols = list(object_columns)
cat_cols

In [None]:
# Build an encoder that maps each distinct value of 'type' to an integer code
le = LabelEncoder()

# Learn the mapping from the column, then apply it to the same column
le.fit(df['type'])
encoded_data = le.transform(df['type'])

print(encoded_data)

In [None]:
# Display the frame while 'type' still holds the original string labels
df


In [None]:
# Overwrite the 'type' column in place with the integer codes produced by the LabelEncoder.
# The original string labels remain recoverable through `le.classes_` / `le.inverse_transform`.
df['type'] = encoded_data

In [None]:
# Display the frame again, now with 'type' replaced by its integer codes
df

## 2. Handling Null Values

In [None]:
# Per column: True if the column contains at least one null value
df.isnull().any()

In [None]:
# Column dtypes after 'type' has been replaced by its integer codes
df.dtypes

## 3. Class Distributions

In [None]:
# Tally how many rows fall into each encoded class
class_counts = df['type'].value_counts()

# Emit a heading line followed by the tally
print('Class distribution:', class_counts, sep='\n')

In [None]:
# Bar chart of row counts per class; bars follow value_counts() order (most frequent first)
sns.countplot(
    data=df,
    x='type',
    order=df['type'].value_counts().index,
)


# Feature Extraction using BERT Model


In [None]:
from transformers import BertModel, BertTokenizer
import torch

# Fetch the pre-trained 'bert-base-uncased' tokenizer and encoder.
# The encoder is configured to also return the hidden states of every layer,
# which the feature-extraction step reads.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained(
    'bert-base-uncased',
    output_hidden_states=True,
)

In [None]:
# Define a function to extract BERT features for a single URL
def extract_features(text: str, max_length: int = 512) -> torch.Tensor:
    """Embed one piece of text with BERT and summarize its last four layers.

    The text is tokenized with the module-level ``tokenizer`` and passed through
    the module-level ``model`` (loaded with ``output_hidden_states=True``).
    For each of the last four hidden layers, the token vectors are averaged
    into a single vector.

    Args:
        text: The string to embed (a URL in this notebook).
        max_length: Maximum number of tokens, special tokens included. Longer
            inputs are truncated. The default of 512 matches the position
            limit of 'bert-base-uncased'; without truncation a longer input
            makes the model raise an indexing error.

    Returns:
        A tensor of shape (4, hidden_size): one token-averaged vector per
        layer, ordered from the fourth-to-last layer to the last layer.
    """
    # Tokenize, adding [CLS]/[SEP] and cutting the sequence at max_length tokens
    token_ids = tokenizer.encode(
        text,
        add_special_tokens=True,
        truncation=True,
        max_length=max_length,
    )
    # The model expects a batch dimension, so wrap the single sequence in a list
    input_ids = torch.tensor([token_ids])

    # Inference only: no gradients are needed
    with torch.no_grad():
        outputs = model(input_ids)
    # Index 2 of the model output holds the tuple of per-layer hidden states
    hidden_states = outputs[2]

    # Take the only sequence in the batch from each of the last four layers:
    # result has shape (4, seq_len, hidden_size)
    last_four_layers = torch.stack([layer[0] for layer in hidden_states[-4:]])

    # Average over the token axis so each layer contributes a single vector
    return last_four_layers.mean(dim=1)

In [None]:
# Run every URL in the frame through extract_features, keeping row order
features = [extract_features(url) for url in df["url"]]
# Join the per-URL tensors along the first axis and convert to a numpy array
features = torch.cat(features).numpy()

In [None]:
# Inspect the stacked feature array
features

## Data processing 

In [None]:
# Pull the encoded class labels out of the frame as a numpy array
types = df['type'].to_numpy()
types

NameError: name 'df' is not defined

In [None]:
# Show the feature-array shape and the label-array shape, one per line
print(features.shape, types.shape, sep='\n')

In [None]:
# `features` stacks four layer-averaged vectors per URL row-wise, so its shape is
# (4 * n_samples, hidden_size); `types` holds one label per URL, shape (n_samples,).
# Fold each URL's four rows into a single row of width 4 * hidden_size.
# The row count is taken from `types` instead of a hard-coded 1000, so this cell
# keeps working when a different number of rows is loaded.

features_reshaped = features.reshape((len(types), -1))

# Append the label as the last column. np.hstack promotes everything to a common
# float dtype, so the labels are stored as floats from here on.
dataset = np.hstack((features_reshaped, types.reshape((-1, 1))))

# dataset is a 2D numpy array of shape (n_samples, 4 * hidden_size + 1)

In [None]:
# Shape of the per-URL feature matrix (one row per URL)
features_reshaped.shape

In [None]:
# Inspect the combined array (features followed by the label in the last column)
dataset

In [None]:
# Shape of the combined array; one column wider than features_reshaped because of the label
dataset.shape

# Data Splitting 

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
train_data, test_data = train_test_split(dataset, random_state=42, test_size=0.2)

# The label sits in the last column; every column before it is a feature
X_train, X_test = train_data[:, :-1], test_data[:, :-1]
y_train, y_test = train_data[:, -1], test_data[:, -1]

# Balancing Classes using SMOTE

In [None]:
from imblearn.over_sampling import SMOTE

# Seeded SMOTE oversampler
sm = SMOTE(random_state=42)

# Resample the training split only; the test split is left untouched
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

# Wrap both label arrays in Series so value_counts() is available
y_train_s, y_train_res = pd.Series(y_train), pd.Series(y_train_res)

# Report per-class counts before and after resampling
print('Class distribution before resampling:', y_train_s.value_counts())
print('Class distribution after resampling:', y_train_res.value_counts())

# Logistic Regression Model

In [None]:
from sklearn.linear_model import LogisticRegression

# Train a logistic regression classifier on the SMOTE-balanced training set.
# Fitting on X_train / y_train would silently discard the resampling done in the
# previous section and train on the imbalanced classes instead.
# max_iter is raised from the default of 100 so the solver has room to converge
# on the high-dimensional, unscaled BERT features.
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_res, y_train_res)

# Evaluation

In [None]:
# Evaluate the classifier on the testing set.
# For a classifier, .score() returns mean accuracy on the given data and labels.
score = clf.score(X_test, y_test)
print("Accuracy:", score)

In [None]:
# Predict the labels of the testing set; y_pred is reused by the confusion matrix
# and classification report cells that follow
y_pred = clf.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report
# Compare true and predicted test labels: count matrix plus text report
cm, cr = confusion_matrix(y_test, y_pred), classification_report(y_test, y_pred)

In [None]:
# Render the confusion matrix as an annotated heatmap (integer cell labels)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, cmap='Blues', annot=True, fmt='d')
plt.title('Confusion Matrix')
plt.ylabel('True Labels')
plt.xlabel('Predicted Labels')
plt.show()

In [None]:
# Print the classification report for the test split
print(classification_report(y_test, y_pred))