In [1]:
key="AIzaSyAuDcX3jEv2CNO02GlCQM3I4GbAIF1P_iY"

In [None]:
# -*- coding: utf-8 -*-
import os
import re
import textwrap
import unicodedata
from dotenv import load_dotenv
import google.generativeai as genai


# -----------------------------------------------------------
# 1. Load API Key
# -----------------------------------------------------------
def load_gemini_api():
    """Loads Gemini API key from environment variables."""
    load_dotenv()
    if not key:
        print("Error: GEMINI_API_KEY not found in environment variables.")
        exit()
    genai.configure(api_key=key)


# -----------------------------------------------------------
# 2. Initialize Gemini Model
# -----------------------------------------------------------
def get_gemini_model():
    """Returns a configured Gemini model instance."""
    model = genai.GenerativeModel(
        'gemini-2.0-flash',
        generation_config={
            "temperature": 0.2,         # lower = more consistent
            "max_output_tokens": 512    # limit tokens
        }
    )
    return model


# -----------------------------------------------------------
# 3. System Prompt
# -----------------------------------------------------------
system_prompt = """
You are an expert cybersecurity analyst with deep knowledge of phishing, scam websites, malicious online activity, 
and advanced attacks such as homograph (IDN/holographic) domain spoofing.

Your role is to carefully analyze the content of a given website (including its text, structure, metadata, and links). 
Determine whether the website is a phishing attempt or legitimate. 

Follow these rules strictly:
- Base your reasoning on typical phishing patterns (suspicious login forms, misleading links, brand impersonation, urgent warnings).
- Always check for homograph/holographic URLs where different Unicode scripts (e.g., Cyrillic, Greek) are used to mimic real domains.
- Always explain the reasoning in concise points.
- Be objective: if uncertain, classify as "suspicious" instead of giving a false "safe" verdict.

Respond ONLY in this parameterized format (do not add explanations outside this format):

verdict=<phishing or legitimate or suspicious>
risk_level=<high, medium, or safe>
confidence=<high, medium, low>
reasons=<comma-separated brief reasons>
evidence=<comma-separated concrete evidence snippets from the website>
"""


# -----------------------------------------------------------
# 4. Homograph Detector
# -----------------------------------------------------------
def detect_homograph(url: str) -> bool:
    """
    Detects potential homograph (holographic) attacks in a URL 
    by checking for mixed Unicode scripts in the domain name.
    """
    try:
        domain_match = re.findall(r"://([^/]+)/?", url)
        if not domain_match:
            return False
        domain = domain_match[0]

        scripts = set()
        for char in domain:
            try:
                name = unicodedata.name(char)
                if "CYRILLIC" in name:
                    scripts.add("CYRILLIC")
                elif "GREEK" in name:
                    scripts.add("GREEK")
                elif "LATIN" in name:
                    scripts.add("LATIN")
            except ValueError:
                continue

        # Suspicious if multiple scripts are mixed
        return len(scripts) > 1
    except Exception:
        return False


# -----------------------------------------------------------
# 5. Gemini Analyzer
# -----------------------------------------------------------
def gemini_analyze(website_content: str, model):
    """
    Analyzes website content for phishing using the configured Gemini model.
    Includes robust error handling for API calls and homograph detection.
    """
    try:
        # Check for links in the content
        urls = re.findall(r'href=[\'"]?([^\'" >]+)', website_content)
        homograph_flag = False
        for url in urls:
            if detect_homograph(url):
                homograph_flag = True
                website_content += f"\n\n[Warning: Suspicious homograph URL detected: {url}]"

        # Generate response from Gemini
        response = model.generate_content(
            system_prompt + "\n\nWebsite Content:\n" + textwrap.dedent(website_content),
            stream=False
        )

        # Check if blocked
        if hasattr(response, "prompt_feedback") and response.prompt_feedback.block_reason:
            print(f"⚠️ Content was blocked: {response.prompt_feedback.block_reason.name}")
            return "Error: Content blocked"

        clean_output = response.text.replace("*", "").strip()

        # Add homograph flag for clarity
        if homograph_flag:
            clean_output += "\n[Homograph detection triggered]"

        return clean_output

    except genai.types.StopCandidateException as e:
        print(f"⚠️ Generation stopped prematurely: {e}")
        return "Error: Generation stopped"
    except Exception as e:
        print(f"⚠️ An error occurred: {e}")
        return "Error: API call failed"
def parse_analysis_to_list(output_string: str) -> list:
    """Parses the Gemini response string into a list of key-value tuples."""
    if output_string.startswith("Error:"):
        return [("error", output_string)]
    
    # Use a list comprehension for a concise conversion
    # It splits each line by the first '=' and strips whitespace from the key/value
    result_list = [
        (line.split('=', 1)[0].strip(), line.split('=', 1)[1].strip())
        for line in output_string.strip().split('\n')
        if '=' in line
    ]
    return result_list
def calculate_fraud_percentage(parsed: list) -> int:
    """Converts risk_level and confidence into fraud percentage."""
    mapping = {
        ("high", "high"): 95,
        ("high", "medium"): 80,
        ("high", "low"): 65,
        ("medium", "high"): 55,
        ("medium", "medium"): 40,
        ("medium", "low"): 25,
        ("safe", "high"): 10,
        ("safe", "medium"): 5,
        ("safe", "low"): 2,
    }

    risk_level = None
    confidence = None
    for k, v in parsed:
        if k == "risk_level":
            risk_level = v.lower()
        elif k == "confidence":
            confidence = v.lower()

    return mapping.get((risk_level, confidence), 50)  # default fallback 50

# -----------------------------------------------------------
# 6. Example Usage
# -----------------------------------------------------------
if __name__ == "__main__":
    load_gemini_api()
    model = get_gemini_model()

    example_content = "www.paypa1.com"

    result = gemini_analyze(example_content, model)
    parsed = parse_analysis_to_list(result)
    print(parsed)
    fraud_percentage = calculate_fraud_percentage(parsed)
    print("\n🔎 Gemini Analysis Result:\n")
    print(result)
    print(f"\n💡 Fraud Percentage: {fraud_percentage}%")

[('verdict', 'phishing'), ('risk_level', 'high'), ('confidence', 'high'), ('reasons', 'Homograph domain spoofing, Brand impersonation, Suspicious domain name'), ('evidence', 'paypa1.com (likely using the number "1" to impersonate "l" in "paypal"), Domain name is not the official PayPal domain.')]

🔎 Gemini Analysis Result:

['v', 'e', 'r', 'd', 'i', 'c', 't', '=', 'p', 'h', 'i', 's', 'h', 'i', 'n', 'g', '\n', 'r', 'i', 's', 'k', '_', 'l', 'e', 'v', 'e', 'l', '=', 'h', 'i', 'g', 'h', '\n', 'c', 'o', 'n', 'f', 'i', 'd', 'e', 'n', 'c', 'e', '=', 'h', 'i', 'g', 'h', '\n', 'r', 'e', 'a', 's', 'o', 'n', 's', '=', 'H', 'o', 'm', 'o', 'g', 'r', 'a', 'p', 'h', ' ', 'd', 'o', 'm', 'a', 'i', 'n', ' ', 's', 'p', 'o', 'o', 'f', 'i', 'n', 'g', ',', ' ', 'B', 'r', 'a', 'n', 'd', ' ', 'i', 'm', 'p', 'e', 'r', 's', 'o', 'n', 'a', 't', 'i', 'o', 'n', ',', ' ', 'S', 'u', 's', 'p', 'i', 'c', 'i', 'o', 'u', 's', ' ', 'd', 'o', 'm', 'a', 'i', 'n', ' ', 'n', 'a', 'm', 'e', '\n', 'e', 'v', 'i', 'd', 'e', 'n',

In [None]:
# -*- coding: utf-8 -*-
import os
import re
import textwrap
import unicodedata
from dotenv import load_dotenv
import google.generativeai as genai
import requests
from bs4 import BeautifulSoup

# --- Installation ---
# You need to install requests and beautifulsoup4 for this script to work.
# pip install requests beautifulsoup4 python-dotenv

# -----------------------------------------------------------
# 1. Load API Key
# -----------------------------------------------------------
def load_gemini_api():
    """Loads Gemini API key from environment variables."""
    load_dotenv()
    if not key:
        print("Error: GEMINI_API_KEY not found in environment variables.")
        exit()
    genai.configure(api_key=key)


# -----------------------------------------------------------
# 2. Initialize Gemini Model
# -----------------------------------------------------------
def get_gemini_model():
    """Returns a configured Gemini model instance."""
    model = genai.GenerativeModel(
        'gemini-2.0-flash',
        generation_config={
            "temperature": 0.2,         # lower = more consistent
            "max_output_tokens": 512    # limit tokens
        }
    )
    return model


# -----------------------------------------------------------
# 3. System Prompt (No Changes)
# -----------------------------------------------------------
system_prompt = """
You are an expert cybersecurity analyst with deep knowledge of phishing, scam websites, malicious online activity, 
and advanced attacks such as homograph (IDN/holographic) domain spoofing.

Your role is to carefully analyze the content of a given website (including its text, structure, metadata, and links). 
Determine whether the website is a phishing attempt or legitimate. 

Follow these rules strictly:
- Base your reasoning on typical phishing patterns (suspicious login forms, misleading links, brand impersonation, urgent warnings).
- Always check for homograph/holographic URLs where different Unicode scripts (e.g., Cyrillic, Greek) are used to mimic real domains.
- Always explain the reasoning in concise points.
- Be objective: if uncertain, classify as "suspicious" instead of giving a false "safe" verdict.

Respond ONLY in this parameterized format (do not add explanations outside this format):

verdict=<phishing or legitimate or suspicious>
risk_level=<high, medium, or safe>
confidence=<high, medium, low>
reasons=<comma-separated brief reasons>
evidence=<comma-separated concrete evidence snippets from the website>
"""


# -----------------------------------------------------------
# 4. Fetch Website Content (New Function)
# -----------------------------------------------------------
def fetch_website_content(url: str) -> str | None:
    """
    Fetches the content of a URL and extracts visible text and links.
    Returns a formatted string for analysis or None if an error occurs.
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()  # Raises an HTTPError for bad responses (4xx or 5xx)

        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract all visible text from the body
        text_content = soup.body.get_text(separator=' ', strip=True) if soup.body else ""

        # Extract all href links
        links = [a['href'] for a in soup.find_all('a', href=True)]

        # Combine the URL, text, and links into a single context for the LLM
        full_content = (
            f"URL BEING ANALYZED: {url}\n\n"
            f"VISIBLE TEXT ON PAGE:\n{text_content}\n\n"
            f"LINKS FOUND ON PAGE:\n" + "\n".join(links)
        )
        return full_content

    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL {url}: {e}")
        return None
    except Exception as e:
        print(f"An unexpected error occurred while processing {url}: {e}")
        return None

# -----------------------------------------------------------
# 5. Homograph Detector (No Changes)
# -----------------------------------------------------------
def detect_homograph(url: str) -> bool:
    try:
        domain_match = re.findall(r"://([^/]+)/?", url)
        if not domain_match: return False
        domain = domain_match[0]
        scripts = set(unicodedata.name(char).split(' ')[0] for char in domain if 'LATIN' not in unicodedata.name(char, ''))
        return len(scripts) > 0
    except Exception:
        return False


# -----------------------------------------------------------
# 6. Gemini Analyzer (Minor Tweak for Clarity)
# -----------------------------------------------------------
def gemini_analyze(website_content: str, model):
    """
    Analyzes website content for phishing using the configured Gemini model.
    """
    if not website_content:
        return "Error: No content to analyze."
    try:
        urls = re.findall(r'href=[\'"]?([^\'" >]+)', website_content)
        homograph_flag = any(detect_homograph(url) for url in urls)

        prompt = system_prompt + "\n\nWebsite Content:\n" + textwrap.dedent(website_content)
        if homograph_flag:
            prompt += "\n\n[System Note: Potential homograph characters detected in links.]"
        
        response = model.generate_content(prompt)
        clean_output = response.text.replace("*", "").strip()

        return clean_output

    except Exception as e:
        print(f"⚠️ An error occurred during Gemini analysis: {e}")
        return "Error: API call failed"

def parse_analysis_to_dict(output_string: str) -> dict:
    """Parses the Gemini response string into a dictionary."""
    if output_string.startswith("Error:"):
        return {"error": output_string}
    
    analysis_dict = {}
    for line in output_string.strip().split('\n'):
        if '=' in line:
            key, value = line.split('=', 1)
            analysis_dict[key.strip()] = value.strip()
    return analysis_dict


# -----------------------------------------------------------
# 7. Fraud Score Calculator (No Changes)
# -----------------------------------------------------------
def calculate_fraud_score(analysis_dict: dict) -> dict:
    """Calculates a fraud score on a scale of 0 to 10."""
    if "error" in analysis_dict:
        return {"score": 0}

    # Internal weights remain the same
    risk_weights = {"high": 3, "medium": 2, "safe": 0}
    verdict_weights = {"phishing": 2, "suspicious": 1, "legitimate": 0}
    confidence_multipliers = {"high": 1.5, "medium": 1.2, "low": 1.0}

    risk = analysis_dict.get("risk_level", "safe").lower()
    verdict = analysis_dict.get("verdict", "legitimate").lower()
    confidence = analysis_dict.get("confidence", "low").lower()

    # Calculate the internal score
    base_score = risk_weights.get(risk, 0) + verdict_weights.get(verdict, 0)
    internal_score = base_score * confidence_multipliers.get(confidence, 1.0)
    
    # Scale the score to be out of 10 (internal max score is 7.5)
    max_internal_score = 7.5
    score_out_of_10 = (internal_score / max_internal_score) * 10
    
    return {"score": round(score_out_of_10, 2)}



# -----------------------------------------------------------
# 8. Main Execution Block
# -----------------------------------------------------------
if __name__ == "__main__":
    load_gemini_api()
    model = get_gemini_model()

    # --- Replace this with the URL you want to analyze ---
    target_url = "http://google.com" 
    # Example of a potentially suspicious URL to test (use with caution): "http://example-bank-login.com"
    
    print(f"--- Analyzing URL: {target_url} ---")
    
    # 1. Fetch content from the URL
    content_to_analyze = fetch_website_content(target_url)
    
    if content_to_analyze:
        # 2. Analyze the fetched content
        result_str = gemini_analyze(content_to_analyze, model)
        
        # 3. Parse and Score the result
        result_dict = parse_analysis_to_dict(result_str)
        fraud_score = calculate_fraud_score(result_dict)
        
        print("\n🔎 Gemini Analysis Result:\n")
        print(result_str)
        print(f"\n📈 Fraud Score: {fraud_score['score']:.2f} / 10")
        print(f"🚨 Fraud Risk: {fraud_score['percentage']}%")
    else:
        print("\nCould not retrieve content. Analysis aborted.")

--- Analyzing URL: http://google.com ---

🔎 Gemini Analysis Result:

verdict=legitimate
risk_level=safe
confidence=high
reasons=The domain is google.com, the content and links are consistent with a legitimate Google search page.
evidence=Domain is google.com, links point to google.com, standard google search page layout.

📈 Fraud Score: 0.00 / 7.5


KeyError: 'percentage'