# Spot the Fake Hackathon - HTML Content Analysis

## Install dependencies

In [1]:
!pip install -q bitsandbytes datasets transformers peft accelerate trl kaggle

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m35.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m544.8/544.8 kB[0m [31m37.3 MB/s[0m eta [36m0:00:00[0m
[?25h

## Mount drive

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
project_dir = "/content/drive/MyDrive/CipherCop/ciphercop_mistral"
!mkdir -p {project_dir}

## Kaggle

In [4]:
import shutil, os

In [5]:
kaggle_json = "/content/drive/MyDrive/Kaggle/kaggle.json"
os.makedirs("/root/.kaggle", exist_ok=True)
shutil.copy(kaggle_json, "/root/.kaggle/")
os.chmod("/root/.kaggle/kaggle.json", 0o600)


In [6]:
!kaggle datasets download -d zackyzac/phishing-site-html-content -p {project_dir}

Dataset URL: https://www.kaggle.com/datasets/zackyzac/phishing-site-html-content
License(s): MIT
phishing-site-html-content.zip: Skipping, found more recently modified local copy (use --force to force download)


In [7]:
!unzip -o {project_dir}/phishing-site-html-content.zip -d {project_dir}/data

Archive:  /content/drive/MyDrive/CipherCop/ciphercop_mistral/phishing-site-html-content.zip
  inflating: /content/drive/MyDrive/CipherCop/ciphercop_mistral/data/html_content/genuine_site_0/123people.com_141.txt  
  inflating: /content/drive/MyDrive/CipherCop/ciphercop_mistral/data/html_content/genuine_site_0/abodedublin.com_122.txt  
  inflating: /content/drive/MyDrive/CipherCop/ciphercop_mistral/data/html_content/genuine_site_0/absoluteastronomy.com_11.txt  
  inflating: /content/drive/MyDrive/CipherCop/ciphercop_mistral/data/html_content/genuine_site_0/absoluteastronomy.com_111.txt  
  inflating: /content/drive/MyDrive/CipherCop/ciphercop_mistral/data/html_content/genuine_site_0/absoluteastronomy.com_198.txt  
  inflating: /content/drive/MyDrive/CipherCop/ciphercop_mistral/data/html_content/genuine_site_0/acronyms.thefreedictionary.com_200.txt  
  inflating: /content/drive/MyDrive/CipherCop/ciphercop_mistral/data/html_content/genuine_site_0/ajboonbutchers.co.uk_196.txt  
  inflating:

## Prepare Dataset

In [8]:
from datasets import load_dataset
import pandas as pd

In [9]:
# Paths
genuine_path = f"{project_dir}/data/html_content/genuine_site_0"
phishing_path = f"{project_dir}/data/html_content/phishing_site_1"

In [10]:
data = []

In [11]:
# Genuine samples (label 0)
for fname in os.listdir(genuine_path):
    fpath = os.path.join(genuine_path, fname)
    if fname.endswith(".txt"):
        with open(fpath, "r", encoding="utf-8", errors="ignore") as f:
            text = f.read()
            data.append({"text": text, "label": 0})

In [12]:
# Phishing samples (label 1)
for fname in os.listdir(phishing_path):
    fpath = os.path.join(phishing_path, fname)
    if fname.endswith(".txt"):
        with open(fpath, "r", encoding="utf-8", errors="ignore") as f:
            text = f.read()
            data.append({"text": text, "label": 1})

In [16]:
# Create dataframe
df = pd.DataFrame(data)
print("Dataset size:", df.shape)
print(df['label'].value_counts())

Dataset size: (165, 2)
label
0    117
1     48
Name: count, dtype: int64


In [18]:
# Convert to Hugging Face Dataset
from datasets import Dataset
dataset = Dataset.from_pandas(df)

# Train-test split
dataset = dataset.train_test_split(test_size=0.2, seed=42)

## Load tokenizer and model

In [19]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model
import torch

In [20]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

In [21]:
model_name = "mistralai/Mistral-7B-Instruct-v0.2"

In [22]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/2.10k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [23]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    quantization_config=bnb_config,
    device_map={"": 0},   # force all on GPU
)

config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of MistralForSequenceClassification were not initialized from the model checkpoint at mistralai/Mistral-7B-Instruct-v0.2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
# Apply LoRA
from peft import LoraConfig, TaskType

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj","k_proj","v_proj","o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_CLS
)


In [25]:
model = get_peft_model(model, lora_config)

## Preprocess text

In [26]:
max_length = 256

In [27]:
def preprocess(examples):
    inputs = examples["text"]  # adjust if column name is different
    model_inputs = tokenizer(inputs, max_length=max_length, truncation=True, padding="max_length")
    model_inputs["labels"] = examples["label"]
    return model_inputs

In [28]:
tokenized_ds = dataset.map(preprocess, batched=True, remove_columns=["text"])

Map:   0%|          | 0/132 [00:00<?, ? examples/s]

Map:   0%|          | 0/33 [00:00<?, ? examples/s]

## Training

In [29]:
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback

In [30]:
from torch.nn import CrossEntropyLoss

# Compute class weights
num_legit = (df["label"] == 0).sum()
num_phish = (df["label"] == 1).sum()
weights = torch.tensor([1.0, num_legit/num_phish]).to("cuda")  # phishing weighted more

def custom_loss(model, inputs, return_outputs=False):
    labels = inputs.get("labels")
    outputs = model(**inputs)
    logits = outputs.get("logits")
    loss_fct = CrossEntropyLoss(weight=weights)
    loss = loss_fct(logits.view(-1, model.config.vocab_size), labels.view(-1))
    return (loss, outputs) if return_outputs else loss

In [34]:
training_args = TrainingArguments(
    output_dir=f"{project_dir}/results",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=16,
    learning_rate=2e-4,
    num_train_epochs=5,
    logging_dir=f"{project_dir}/logs",
    logging_steps=20,
    save_strategy="epoch",
    save_total_limit=2,
    bf16=True,
    fp16=False,
    report_to="none"
)

In [35]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"]
    # compute_loss=custom_loss  # weighted loss - Didn't work
)


In [36]:
trainer.train()

Step,Training Loss
20,23.6867
40,4.7581


TrainOutput(global_step=45, training_loss=12.760759544372558, metrics={'train_runtime': 214.5252, 'train_samples_per_second': 3.077, 'train_steps_per_second': 0.21, 'total_flos': 7089462894919680.0, 'train_loss': 12.760759544372558, 'epoch': 5.0})

## Save adapter weights

In [37]:
save_dir = f"{project_dir}/mistral_phishing_lora"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)
print(f"Model saved to {save_dir}")

Model saved to /content/drive/MyDrive/CipherCop/ciphercop_mistral/mistral_phishing_lora
