In [None]:
# Uncomment if the `datasets` package is not already installed in this environment:
# %pip install datasets

In [37]:
import pandas as pd
from datasets import load_dataset
from torch.utils.data import Dataset
from openai import OpenAI
import json

In [None]:
from google.colab import userdata

# Look up the secret named 'API_KEY' through Colab's `userdata` helper rather
# than hardcoding it, then build the OpenAI client used for every request below.
key = userdata.get('API_KEY')
client = OpenAI(api_key=key)

In [9]:
# Load the "imanoop7/phishing_url_classification" dataset via `datasets.load_dataset`.
ds = load_dataset("imanoop7/phishing_url_classification")

In [10]:
# Show the dataset's splits, feature names and row counts.
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 100000
    })
})

In [17]:
from datasets import DatasetDict

# Number of rows kept for the experiment.
sample_size = 100

# Shuffle the training split with a fixed seed and keep the first
# `sample_size` rows, which gives a reproducible random sample.
small_dataset = (
    ds["train"]
    .shuffle(seed=42)
    .select(range(sample_size))
)

# Wrap the sample in a DatasetDict that has a single "train" split.
reduced_dataset = DatasetDict({"train": small_dataset})

In [18]:
# Convert the sampled split with the Dataset's own pandas converter instead of
# letting the DataFrame constructor iterate over the dataset row by row.
df = reduced_dataset['train'].to_pandas()

In [None]:
# Turn the raw 'label' column into a boolean column named 'is_deceptive'.
# The guard keeps this cell idempotent: once the column has been renamed,
# re-running the cell is a no-op instead of raising KeyError: 'label'.
if 'label' in df.columns:
    df = (
        df.assign(label=df['label'].astype(bool))  # convert to Boolean
          .rename(columns={'label': 'is_deceptive'})  # rename column
    )

In [95]:
# Display the sample, including its boolean `is_deceptive` column.
df

Unnamed: 0,text,is_deceptive
0,https://www.airbnb.com,False
1,http://exchangelogin.com/account,True
2,https://www.stackoverflow.com,False
3,https://www.microsoft.com,False
4,https://www.facebook.com,False
...,...,...
95,https://www.apple.com,False
96,http://memberspotify.org/profile,True
97,http://shopverify.co/login,True
98,http://mailgoogle.info/password,True


In [84]:
# Persist the labelled sample to ./sampled.csv, omitting the index column.
df.to_csv('./sampled.csv',index=False)

In [90]:
# System prompt for the classifier: it asks for a JSON reply carrying a single
# `is_deceptive` flag and lists three labelled example URLs.
# NOTE(review): the examples use Python-style True/False and are not themselves
# JSON (JSON booleans are lowercase true/false) -- confirm the model's replies
# parse to the expected values.
prompt = """
You are an assistant who helps to analyse suspicious (phishing) sites and classify it as phishing or non-phishing.

###Instructions###
provide the result of the analysis in json format with the following parts:
is_deceptive: Boolean (True - phishing, False - not phishing)

###Example###:
text - http://facebookgoogle.net/login
is_deceptive: True

text - http://spotifyapple.me/reset
is_deceptive: True

text - https://www.github.com
is_deceptive: False
"""

**Classify each sampled URL with gpt-4o-mini**

In [91]:
def _classify_url(url):
    """Ask gpt-4o-mini to classify `url` and return the reply's `is_deceptive` value.

    The request uses the notebook-level `client` and `prompt`, and requests a
    JSON object so the reply can be parsed with `json.loads`.

    Args:
        url: The URL string sent as the user message.

    Returns:
        Whatever value the model placed under the `is_deceptive` key.

    Raises:
        json.JSONDecodeError: If the reply is not valid JSON.
        KeyError: If the parsed reply has no `is_deceptive` key.
    """
    # NOTE(review): no temperature/seed is passed, so repeated runs may label
    # the same URL differently -- confirm whether that is acceptable.
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        response_format={"type": "json_object"},
        messages=[
            {"role": "system", "content": prompt},
            {"role": "user", "content": url},
        ],
    )
    reply = json.loads(completion.choices[0].message.content)
    return reply['is_deceptive']


# Collect one record per URL (in the same order as `df`) and build the frame
# once, instead of growing it with pd.concat on every iteration.
records = [
    {'text': url, 'is_deceptive': _classify_url(url)}
    for url in df['text']
]
processed_df = pd.DataFrame(records, columns=['text', 'is_deceptive'])

In [92]:
#Download result
processed_df.to_csv('processed.csv', index=False, columns=['text', 'is_deceptive'])

In [94]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load the predictions and the ground-truth labels from their CSV files.
processed_df = pd.read_csv('./processed.csv')
true_labels_df = pd.read_csv('./sampled.csv')

# The metrics compare the two files row by row, so they are only meaningful
# when both files list the same URLs in the same order. Fail loudly otherwise.
assert processed_df['text'].equals(true_labels_df['text']), (
    "processed.csv and sampled.csv are not row-aligned on 'text'"
)

# Predicted and true labels, matched by row position.
y_pred = processed_df['is_deceptive'].values
y_true = true_labels_df['is_deceptive'].values

# Evaluation metrics (True, i.e. phishing, is the positive class).
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

# Report every score as a percentage with two decimals.
for metric_name, score in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
):
    print(f"{metric_name}: {score*100:.2f}%")

Accuracy: 99.00%
Precision: 100.00%
Recall: 98.18%
F1 Score: 99.08%
