In [38]:
# !pip install datasets

In [1]:
from datasets import load_dataset

desired_dataset = ['texts', 'urls', 'webs', 'combined_full', 'combined_reduced']

dataset = load_dataset("ealvaradob/phishing-dataset", desired_dataset[0], trust_remote_code=True)

  from .autonotebook import tqdm as notebook_tqdm


In [45]:
# Label - 1 (Phishing) or 0 (Benign)

print(dataset['train'][0]['text'])

print(dataset['train'][0]['label'])

re : 6 . 1100 , disc : uniformitarianism , re : 1086 ; sex / lang dick hudson 's observations on us use of 's on ' but not 'd aughter ' as a vocative are very thought-provoking , but i am not sure that it is fair to attribute this to " sons " being " treated like senior relatives " . for one thing , we do n't normally use ' brother ' in this way any more than we do 'd aughter ' , and it is hard to imagine a natural class comprising senior relatives and 's on ' but excluding ' brother ' . for another , there seem to me to be differences here . if i am not imagining a distinction that is not there , it seems to me that the senior relative terms are used in a wider variety of contexts , e . g . , calling out from a distance to get someone 's attention , and hence at the beginning of an utterance , whereas 's on ' seems more natural in utterances like ' yes , son ' , ' hand me that , son ' than in ones like ' son ! ' or ' son , help me ! ' ( although perhaps these latter ones are not compl

In [2]:
# Retrieve all texts using a list comprehension
un_processed_features = [entry['text'] for entry in dataset['train']]

labels = [entry['label'] for entry in dataset['train']]

### Breakdown of the Pattern

- **`(?:...)`**: Non-capturing group to group multiple regex elements without creating a backreference.

- **`(?:https?://|www\.)[^\s]+`**: Matches URLs:
  - **`https?://`**: Matches both "http://" and "https://".
  - **`|www\.`**: Allows for URLs that start with "www.".
  - **`[^\s]+`**: Matches one or more characters that are not whitespace (captures the rest of the URL).

- **`|[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}`**: Matches email addresses:
  - **`[a-zA-Z0-9._%+-]+`**: Matches the local part of the email (before the `@`).
  - **`@`**: Matches the `@` symbol.
  - **`[a-zA-Z0-9.-]+`**: Matches the domain name (after the `@`).
  - **`\.[a-zA-Z]{2,}`**: Matches the domain extension (like .com, .net, etc.), requiring at least two alphabetic characters.

- **`|[a-zA-Z]+(?:'[a-zA-Z]+)?(?:-[a-zA-Z]+)?`**: Matches regular words:
  - **`[a-zA-Z]+`**: Matches alphabetic words.
  - **`(?:'[a-zA-Z]+)?`**: Allows for apostrophes (for contractions like "don't").
  - **`(?:-[a-zA-Z]+)?`**: Allows for hyphens (for compound words like "mother-in-law").


In [47]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TfidfVectorizer

# https://scikit-learn.org/1.5/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

pattern = r"(?:(?:https?://|www\.)[^\s]+|[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}|[a-zA-Z]+(?:'[a-zA-Z]+)?(?:-[a-zA-Z]+)?)"  
tfidf_vectorizer = TfidfVectorizer(max_features=100, analyzer="word", lowercase=True, stop_words='english', norm='l2', token_pattern = pattern)

features = un_processed_features

# Compute the TF-IDF values for the dataset
X_tfidf = tfidf_vectorizer.fit_transform(features)

# Print the shape of the resulting TF-IDF matrix
print(X_tfidf.shape)

(20137, 100)


In [48]:
from sklearn.model_selection import train_test_split

# Split data into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, labels, test_size=0.2, random_state=42)

# X_train, X_test: TF-IDF features for training and testing
# y_train, y_test: Corresponding labels for training and testing


In [49]:
from sklearn.svm import SVC

# Initialize the SVM classifier (with a linear kernel for text classification)
# https://scikit-learn.org/dev/modules/generated/sklearn.svm.SVC.html
svm_model = SVC(kernel='linear')

# Train the SVM model using the training data
svm_model.fit(X_train, y_train)

In [50]:
# Evaluate the model on the test data
y_pred = svm_model.predict(X_test)

# Import metrics to measure accuracy and other performance metrics
from sklearn.metrics import accuracy_score, classification_report

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Print detailed classification report (precision, recall, F1-score)
print(classification_report(y_test, y_pred))


Accuracy: 85.87%
              precision    recall  f1-score   support

           0       0.86      0.92      0.89      2493
           1       0.85      0.77      0.81      1535

    accuracy                           0.86      4028
   macro avg       0.86      0.84      0.85      4028
weighted avg       0.86      0.86      0.86      4028



In [3]:
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import make_classification

pattern = r"(?:(?:https?://|www\.)[^\s]+|[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}|[a-zA-Z]+(?:'[a-zA-Z]+)?(?:-[a-zA-Z]+)?)"  

# Create a pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(analyzer="word", lowercase=True, stop_words='english', token_pattern = pattern)),
    ('clf', SVC())
])

# Define the parameter grid to search
# Set up the parameters for Grid Search
param_grid = {
    'tfidf__max_features': [100], #[None, 100, 500, 1000],  # Maximum features for TF-IDF
    'tfidf__norm': ['l2'], #['l1', 'l2'],  # Norm for TF-IDF
    'tfidf__min_df': [5],#[1, 5, 10],  # Minimum document frequency for TF-IDF
    'clf__C': [0.1],#[0.1, 1, 10],  # Regularization parameter for SVM
    'clf__kernel': ['linear']#['linear', 'rbf']  # Kernel type for SVM
}

# Create the GridSearchCV object
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')

# Fit the model
features = un_processed_features
grid_search.fit(features, labels)

# Print the best parameters and score
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: ", grid_search.best_score_)

Best parameters found:  {'clf__C': 0.1, 'clf__kernel': 'linear', 'tfidf__max_features': 100, 'tfidf__min_df': 5, 'tfidf__norm': 'l2'}
Best cross-validation score:  0.8242012147892492
