In [None]:
!pip install dotenv unsloth trl accelerate bitsandbytes peft transformers datasets

In [3]:
import transformers
import torch
torch.cuda.get_device_name(0)

  from .autonotebook import tqdm as notebook_tqdm


'NVIDIA GeForce RTX 3090'

## CHECK URL, MAIL, PHONE

In [None]:
VT_API_KEY=''
ABSTRACT_EMAIL_API=''
ABSTRACT_PHONE_API=''

In [5]:
import os
import base64
import requests

SUSPICIOUS_COUNTRIES = {"Cambodia", "Nigeria", "Pakistan", "Afghanistan", "North Korea"}

def check_url_virustotal(url):
    api_key = VT_API_KEY
    url_id = base64.urlsafe_b64encode(url.encode()).decode().strip("=")
    vt_url = f"https://www.virustotal.com/api/v3/urls/{url_id}"

    headers = {
        "x-apikey": api_key
    }

    response = requests.get(vt_url, headers=headers)
    return response.json()

def parse_vt_result_for_display(vt_json):
    try:
        data = vt_json["data"]["attributes"]
        stats = data["last_analysis_stats"]

        url = data.get("last_final_url", data.get("url", ""))

        harmless = stats.get("harmless", 0)
        malicious = stats.get("malicious", 0)
        suspicious = stats.get("suspicious", 0)
        undetected = stats.get("undetected", 0)

        # ƒê√°nh gi√° t·ªïng qu√°t
        if malicious > 0:
            overall = "Nguy hi·ªÉm"
        elif suspicious > 0:
            overall = "C√≥ th·ªÉ ƒë√°ng ng·ªù"
        else:
            overall = "An to√†n"

        results = {
            "url": url,
            "harmless": harmless,
            "malicious": malicious,
            "suspicious": suspicious,
            "undetected": undetected,
            "overall": overall
        }

        return results

    except Exception as e:
        return {
            "error": f"Kh√¥ng th·ªÉ ph√¢n t√≠ch d·ªØ li·ªáu VirusTotal: {e}"
        }

def check_email_validity(email):
    api_key = ABSTRACT_EMAIL_API
    url = "https://emailvalidation.abstractapi.com/v1/"
    params = {
        "api_key": api_key,
        "email": email
    }
    response = requests.get(url, params=params)
    return response.json()

def parse_email_result(result):
    try:
        email = result.get("email", "N/A")
        deliverability = result.get("deliverability", "UNKNOWN")
        is_format_valid = result["is_valid_format"]["value"]
        is_smtp_valid = result["is_smtp_valid"]["value"]
        is_mx_found = result["is_mx_found"]["value"]
        is_free = result["is_free_email"]["value"]
        is_disposable = result["is_disposable_email"]["value"]
        is_role = result["is_role_email"]["value"]

        # T·ªïng k·∫øt h·ª£p l·ªá
        is_valid = all([
            is_format_valid,
            is_smtp_valid,
            is_mx_found,
            deliverability == "DELIVERABLE"
        ])

        result_dict = {
            "email": email,
            "valid": is_valid,
            "deliverability": deliverability,
            "is_format_valid": is_format_valid,
            "is_smtp_valid": is_smtp_valid,
            "is_mx_found": is_mx_found,
            "is_free_email": is_free,
            "is_disposable_email": is_disposable,
            "is_role_email": is_role,
            "conclusion": (
                "H·ª£p l·ªá (SMTP & MX t·ªìn t·∫°i)" if is_valid else
                "Kh√¥ng h·ª£p l·ªá ho·∫∑c kh√¥ng g·ª≠i ƒë∆∞·ª£c"
            ),
            "description": {
                "type": "Mi·ªÖn ph√≠" if is_free else "Domain ri√™ng",
                "spam": "T·∫°m th·ªùi / spam" if is_disposable else "Kh√¥ng ph·∫£i spam",
                "role": "ƒê·∫°i di·ªán t·ªï ch·ª©c" if is_role else "Email c√° nh√¢n"
            }
        }

        return result_dict

    except Exception as e:
        return {
            "error": f"Kh√¥ng th·ªÉ ph√¢n t√≠ch k·∫øt qu·∫£ email: {e}"
        }

def normalize_phone_vn(phone: str) -> str:
    if phone.startswith("0") and len(phone) == 10:
        return "+84" + phone[1:]
    elif phone.startswith("+84"):
        return phone
    return phone

def check_phone_validity(phone):
    api_key = ABSTRACT_PHONE_API
    if not api_key:
        raise ValueError("‚ùå ABSTRACT_PHONE_API ch∆∞a ƒë∆∞·ª£c thi·∫øt l·∫≠p trong .env")

    url = "https://phonevalidation.abstractapi.com/v1/"
    normalized_phone = normalize_phone_vn(phone)
    params = {
        "api_key": api_key,
        "phone": normalized_phone
    }

    response = requests.get(url, params=params)
    if response.status_code != 200:
        raise Exception(f"L·ªói API: {response.status_code} ‚Äì {response.text}")

    return response.json()

# H√†m ph√¢n t√≠ch k·∫øt qu·∫£ tr·∫£ v·ªÅ
def parse_phone_result(result):
    try:
        phone = result.get("phone")
        valid = result.get("valid", False)
        country = result.get("country", {}).get("name", "")
        country_code = result.get("country", {}).get("code", "")
        intl_format = result.get("format", {}).get("international", "")
        local_format = result.get("format", {}).get("local", "")

        is_foreign = country and country != "Vietnam"
        is_high_risk = country in SUSPICIOUS_COUNTRIES

        return {
            "phone": phone,
            "valid": valid,
            "international_format": intl_format,
            "local_format": local_format,
            "country": country,
            "country_code": country_code,
            "location": result.get("location"),
            "carrier": result.get("carrier"),
            "type": result.get("type"),
            "is_foreign_number": is_foreign,
            "is_high_risk_country": is_high_risk,
            "conclusion": (
                "Kh√¥ng h·ª£p l·ªá" if not valid else
                "S·ªë t·ª´ qu·ªëc gia r·ªßi ro (c·∫ßn c·∫©n tr·ªçng)" if is_high_risk else
                "S·ªë t·ª´ n∆∞·ªõc ngo√†i" if is_foreign else
                "S·ªë h·ª£p l·ªá n·ªôi ƒë·ªãa"
            )
        }

    except Exception as e:
        return {
            "error": f"L·ªói ph√¢n t√≠ch d·ªØ li·ªáu s·ªë ƒëi·ªán tho·∫°i: {e}"
        }

def build_checks_summary(url=None, email=None, phone=None):
    parts = []

    if url:
        url_result = check_url_virustotal(url)
        check_url = parse_vt_result_for_display(url_result)
        parts.append(f"K·∫øt qu·∫£ ki·ªÉm tra URL: {check_url}")

    if email:
        mail_result = check_email_validity(email)
        check_mail = parse_email_result(mail_result)
        parts.append(f"K·∫øt qu·∫£ ki·ªÉm tra Mail: {check_mail}")

    if phone:
        phone_result = check_phone_validity(phone)
        check_phone = parse_phone_result(phone_result)
        parts.append(f"K·∫øt qu·∫£ ki·ªÉm tra Phone: {check_phone}")

    return parts

## Load model

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "unsloth/gemma-3-4b-it"

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto"
)

tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
prompt = "B·∫°n ƒë√£ tr√∫ng th∆∞·ªüng 1 chi·∫øc Iphone 16, h√£y nh·∫•n v√†o link ƒë·ªÉ nh·∫≠n th∆∞·ªüng"
messages = [
    {"role": "system", "content": "B·∫°n l√† 1 AI th√¥ng minh h·ªó tr·ª£ ph√¢n lo·∫°i tin t·ª©c real v√† fake. H√£y ph√¢n lo·∫°i tin t·ª©c ng∆∞·ªùi d√πng thu·ªôc lo·∫°i real ho·∫∑c fake. Nh·ªõ ch·ªâ c·∫ßn tr·∫£ l·ªùi ƒë√∫ng l√† real ho·∫∑c fake kh√¥ng c·∫ßn gi·∫£i th√≠ch th√™m"},
    {"role": "user", "content": prompt}
]

In [None]:
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=512
)
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(response)

## Finetune model

In [59]:
import pandas as pd
import torch
from unsloth import FastLanguageModel
from datasets import Dataset
from transformers import TrainingArguments, DataCollatorForLanguageModeling
from trl import SFTTrainer

In [60]:
train_df = pd.read_csv("data/train.csv")
val_df = pd.read_csv("data/val.csv")

train_df = train_df[["text", "label"]].dropna()
val_df = val_df[["text", "label"]].dropna()

print(f"Train samples: {len(train_df)}")
print(f"Val samples: {len(val_df)}")

Train samples: 2534
Val samples: 843


In [61]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/gemma-2-9b-it-bnb-4bit",
    max_seq_length=2048,
    dtype=None,
    load_in_4bit=True,
)

Unsloth: If you want to finetune Gemma 2, install flash-attn to make it faster!
To install flash-attn, do the below:

pip install --no-deps --upgrade "flash-attn>=2.6.3"
==((====))==  Unsloth 2025.5.8: Fast Gemma2 patching. Transformers: 4.52.3.
   \\   /|    NVIDIA GeForce RTX 3090. Num GPUs = 1. Max memory: 23.488 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 8.6. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [62]:
# check pad_token 
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

# √Åp d·ª•ng LoRA
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
)

In [63]:
def format_chat_template(sample):
    return f"""<start_of_turn>user
    Ph√¢n lo·∫°i tin t·ª©c sau l√† real hay fake:

    {sample['text']}<end_of_turn>
    <start_of_turn>model
    {sample['label']}<end_of_turn>"""

train_texts = [format_chat_template(row) for _, row in train_df.iterrows()]
val_texts = [format_chat_template(row) for _, row in val_df.iterrows()]

train_dataset = Dataset.from_dict({"text": train_texts})
val_dataset = Dataset.from_dict({"text": val_texts})

In [64]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
    mlm_probability=0.15,  # Explicitly set this value!
)

In [71]:
training_args = TrainingArguments(
    output_dir="gemma_outputs",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_steps=5,
    num_train_epochs=3,
    learning_rate=2e-4,
    fp16=not torch.cuda.is_bf16_supported(),
    bf16=torch.cuda.is_bf16_supported(),
    logging_steps=10,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    save_strategy="steps",
    save_steps=50,
    eval_strategy="steps",
    eval_steps=50,
    report_to="none",
    dataloader_pin_memory=False,
    remove_unused_columns=False,
)

In [66]:
def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=1024,
    )

train_dataset = train_dataset.map(tokenize_function, batched=True, remove_columns=["text"])
val_dataset = val_dataset.map(tokenize_function, batched=True, remove_columns=["text"])

Map:   0%|          | 0/2534 [00:00<?, ? examples/s]

Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2534/2534 [00:00<00:00, 3158.11 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 843/843 [00:00<00:00, 3311.04 examples/s]


In [72]:
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    args=training_args,
)

trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 2,534 | Num Epochs = 3 | Total steps = 951
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 54,018,048/9,000,000,000 (0.60% trained)


Step,Training Loss,Validation Loss
50,1.3949,7.464354
100,0.8436,7.866084
150,1.0996,8.059935
200,0.9831,7.871105
250,0.7986,8.220574
300,1.0985,8.026659
350,0.6047,8.74048
400,0.5802,8.675448
450,0.6388,8.626851
500,0.5705,8.939626


Unsloth: Not an error, but Gemma2ForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient
AUTOTUNE bmm(16x1024x256, 16x256x1024)
  bmm 0.1372 ms 100.0% 
  triton_bmm_199 0.1874 ms 73.2% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4
  triton_bmm_203 0.1935 ms 70.9% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4
  triton_bmm_195 0.1997 ms 68.7% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4
  triton_bmm_204 0.1997 ms 68.7% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=False, GROUP_M=8, num_stages=4, num_warps=8
  triton_bmm_200 0.2007 ms 68.4% ACC_TYPE='tl.float32', ALLOW

In [73]:
trainer.model.save_pretrained("Gemma-lora-outputs")
tokenizer.save_pretrained("Gemma-lora-outputs")

('Gemma-lora-outputs/tokenizer_config.json',
 'Gemma-lora-outputs/special_tokens_map.json',
 'Gemma-lora-outputs/chat_template.jinja',
 'Gemma-lora-outputs/tokenizer.model',
 'Gemma-lora-outputs/added_tokens.json',
 'Gemma-lora-outputs/tokenizer.json')

## Inference

In [None]:
from unsloth import FastLanguageModel
from transformers import AutoTokenizer

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "qwen2.5_finetuned",
    max_seq_length = 1024,
    load_in_4bit = True,
)

In [None]:
prompt = "Ph√¢n lo·∫°i tin t·ª©c sau l√† real hay fake:\n\nB·∫°n ƒë√£ tr√∫ng th∆∞·ªüng gi·∫£i Jackpot tr·ªã gi√° 1 t·ª∑ ƒë·ªìng t·∫°i https://www.x311y.com/. Nh·∫•n v√†o link ƒë·ªÉ nh·∫≠n ngay\n\n### Response:"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

outputs = model.generate(**inputs, max_new_tokens=20)
response = tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
print("K·∫øt lu·∫≠n:", response)

## Test model after finetune

In [74]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import json
import re

# Load model (Qwen fine-tuned)
from unsloth import FastLanguageModel
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "Gemma-lora-outputs",
    max_seq_length = 1024,
    load_in_4bit = True,
)

def extract_contact_info(text: str) -> str:
    email_pattern = r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+"
    url_pattern = r"https?://[^\s]+|www\.[^\s]+"
    phone_pattern = r"(\+?84|0)?\s?(\d{9,10})"

    email = ''
    phone = ''
    url = ''

    email_match = re.search(email_pattern, text)
    if email_match:
        email = email_match.group(0)
    
    # T√¨m URL ƒë·∫ßu ti√™n
    url_match = re.search(url_pattern, text)
    if url_match:
        url = url_match.group(0)
    
    # T√¨m s·ªë ƒëi·ªán tho·∫°i ƒë·∫ßu ti√™n
    phone_match = re.search(phone_pattern, text)
    if phone_match:
        phone = "".join([g if g is not None else "" for g in phone_match.groups()]) if phone_match else ""

    return email, phone, url

def classify_news(input_text: str, check_summary: list) -> str:
    joined_check = "\n".join(check_summary)
    full_prompt = f"""B·∫°n l√† tr·ª£ l√Ω AI c√≥ nhi·ªám v·ª• x√°c th·ª±c tin t·ª©c l√† real hay fake.

    Th√¥ng tin c·∫ßn x√°c th·ª±c: {input_text}

    K·∫øt qu·∫£ ki·ªÉm tra b·ªï sung (n·∫øu c√≥):
    {joined_check}

    Y√™u c·∫ßu:
    Ch·ªâ tr·∫£ l·ªùi duy nh·∫•t 1 trong 2 t·ª´ sau: real ho·∫∑c fake.
    Kh√¥ng th√™m gi·∫£i th√≠ch, kh√¥ng ghi ch√∫, kh√¥ng d√≤ng th·ª´a.


    K·∫øt lu·∫≠n:"""

    inputs = tokenizer(full_prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=20)
    response = tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)
    return response.strip()

Unsloth: If you want to finetune Gemma 2, install flash-attn to make it faster!
To install flash-attn, do the below:

pip install --no-deps --upgrade "flash-attn>=2.6.3"
==((====))==  Unsloth 2025.5.8: Fast Gemma2 patching. Transformers: 4.52.3.
   \\   /|    NVIDIA GeForce RTX 3090. Num GPUs = 1. Max memory: 23.488 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 8.6. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [None]:
# ==== TEST ====
input_text = "Ch√≠nh ph·ªß ra l·ªánh c·∫•m s·ª≠ d·ª•ng m·∫°ng x√£ h·ªôi Facebook t·∫°i Vi·ªát Nam t·ª´ th√°ng sau"
email, phone, url = extract_contact_info(input_text)

check_summary = build_checks_summary(url, email, phone)
# print("Check Summary:", check_summary)

final_label = classify_news(input_text, check_summary)
print("\nüß† K·∫øt lu·∫≠n cu·ªëi c√πng:", final_label)

## Testing with test_dataset

In [76]:
import pandas as pd
import json

df = pd.read_csv('data/test.csv')
llm_outputs = []

for i, row in df.iterrows():
    input_text = row['text']

    try:
        email, phone, url = extract_contact_info(input_text)
        check_summary = build_checks_summary(url, email, phone)
        # Ph√¢n lo·∫°i
        final_label = classify_news(input_text, check_summary)
        llm_outputs.append(final_label)
        print("K·∫øt lu·∫≠n:", final_label)

    except Exception as e:
        error_msg = f"Error: {str(e)}"
        llm_outputs.append(error_msg)
        print("‚ùå L·ªói x·ª≠ l√Ω:", error_msg)

# Ghi k·∫øt qu·∫£ v√†o c·ªôt m·ªõi v√† l∆∞u file
df['Gemma_finetuned'] = llm_outputs
df.to_csv('test_output_Gemma_finetuned.csv', index=False)

print("\nƒê√£ x·ª≠ l√Ω xong to√†n b·ªô test.csv v√† l∆∞u k·∫øt qu·∫£.")

K·∫øt lu·∫≠n: fake
    model
    fake
K·∫øt lu·∫≠n: real
    model
    real
K·∫øt lu·∫≠n: fake
    model
    fake
K·∫øt lu·∫≠n: fake
    model
    fake
K·∫øt lu·∫≠n: fake
    model
    fake
K·∫øt lu·∫≠n: fake
K·∫øt lu·∫≠n: fake
    model
    fake
K·∫øt lu·∫≠n: fake
K·∫øt lu·∫≠n: fake
    model
    fake
K·∫øt lu·∫≠n: fake
K·∫øt lu·∫≠n: fake
K·∫øt lu·∫≠n: fake
K·∫øt lu·∫≠n: fake
K·∫øt lu·∫≠n: fake
    model
    fake
K·∫øt lu·∫≠n: fake
    model
    fake
K·∫øt lu·∫≠n: real
K·∫øt lu·∫≠n: fake
    model
    fake
K·∫øt lu·∫≠n: fake
    model
    fake
K·∫øt lu·∫≠n: fake
    model
    fake
K·∫øt lu·∫≠n: fake
    model
    fake
K·∫øt lu·∫≠n: fake
    model
    fake
K·∫øt lu·∫≠n: fake
    model
    fake
K·∫øt lu·∫≠n: fake
K·∫øt lu·∫≠n: fake
    model
    fake
K·∫øt lu·∫≠n: fake
K·∫øt lu·∫≠n: fake
    model
    fake
K·∫øt lu·∫≠n: fake
    model
    fake
K·∫øt lu·∫≠n: real
K·∫øt lu·∫≠n: fake
    model
    fake
K·∫øt lu·∫≠n: real
    model
    real
K·∫øt lu·∫≠n: fake
K·∫øt lu·∫≠n: fake
    model
  