# In [None]:
# AI-Powered Phishing URL Detector
# Notebook 1: Data Sourcing and Exploration (Revised)

# ## 1.1 Import Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os

# Set plot style.
# Matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8-*';
# older releases only register the un-prefixed name. Use whichever of the two
# is available so the notebook runs on both sides of that rename.
for _style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
else:
    # Neither name is registered: request the preferred one anyway so that
    # matplotlib raises its own descriptive error instead of silently
    # continuing with the default style.
    plt.style.use('seaborn-v0_8-whitegrid')


# ## 1.2 Data Sourcing (Revised for Raw URLs)
# To build a realistic model, we need a dataset of raw URLs, not pre-computed features.
# We will use a dataset available on GitHub that contains a simple list of URLs
# labeled as 'benign' or 'phishing'.
# The code below will download this CSV file and save it to our local project directory.

# URL of the raw CSV dataset
url = 'https://raw.githubusercontent.com/sahil-gidwani/phishing-detection/master/data/url_data.csv'

# Define the path to save the data
raw_data_path = '../data/raw/'
csv_filename = 'raw_url_dataset.csv'
csv_filepath = os.path.join(raw_data_path, csv_filename)

# Create the directory if it doesn't exist
os.makedirs(raw_data_path, exist_ok=True)

if os.path.exists(csv_filepath):
    # A cached copy is present: skip the network entirely.
    print('Dataset already exists. Loading from disk.')
    df = pd.read_csv(csv_filepath)
else:
    print('Downloading dataset...')
    try:
        # Use pandas to read the CSV directly from the URL
        df = pd.read_csv(url)
    except Exception as e:
        # Without the data `df` is never defined, so every later cell would
        # fail with an unrelated NameError. Report the cause and stop here.
        print(f'An error occurred: {e}')
        raise

    # Caching is best-effort: the data is already in memory, so a failed save
    # is reported but does not abort the notebook.
    # Write to a temporary file and swap it into place with os.replace so an
    # interrupted write can never leave a truncated CSV at `csv_filepath`
    # (which the next run would otherwise load as if it were complete).
    # A stale temporary file left by a failed save is overwritten on the
    # next attempt.
    tmp_filepath = csv_filepath + '.tmp'
    try:
        df.to_csv(tmp_filepath, index=False)
        os.replace(tmp_filepath, csv_filepath)
    except (OSError, ValueError) as e:
        print(f'An error occurred: {e}')
    else:
        print(f'Dataset saved successfully to {csv_filepath}')


# ## 1.3 Initial Data Exploration
# Now that we have our raw URL data loaded, let's perform some basic checks.

# Preview: the first 5 rows of the dataframe
head_rows = df.head()
print("First 5 rows of the dataset:", head_rows, sep="\n")

# Structural summary; DataFrame.info() writes its report itself
print("\nDataFrame Info:")
df.info()

# Number of missing entries in each column
missing_per_column = df.isna().sum()
print("\nMissing Values:", missing_per_column, sep="\n")

# Descriptive statistics across every column (include='all')
summary_stats = df.describe(include='all')
print("\nDescriptive Statistics:", summary_stats, sep="\n")


# ## 1.4 Target Variable Analysis
# The 'result' column is our target variable. Let's analyze its distribution.

# Plot the distribution of the target variable
plt.figure(figsize=(8, 6))
# NOTE(review): recent seaborn releases emit a deprecation warning when
# `palette` is passed without `hue` — confirm against the pinned seaborn
# version before changing this call.
ax = sns.countplot(x='result', data=df, palette=['#32cd32', '#ff6347'], order=['benign', 'phishing'])
plt.title('Distribution of Phishing vs. Benign URLs', fontsize=16)
plt.xlabel('URL Type', fontsize=12)
plt.ylabel('Count', fontsize=12)

# Add a count label just above each bar
for p in ax.patches:
    bar_height = p.get_height()
    # Skip bars without a usable height (e.g. NaN): there is nothing
    # meaningful to label and no valid point to anchor the text to.
    if not np.isfinite(bar_height):
        continue
    # Counts are whole numbers; format without decimals so a float-valued
    # height is not rendered as e.g. "1234.0".
    ax.annotate(f'{bar_height:.0f}', (p.get_x() + p.get_width() / 2., bar_height),
                ha='center', va='center', xytext=(0, 10), textcoords='offset points')

plt.show()

# Absolute and relative frequency of each label in the target column
target_labels = df['result']
class_counts = target_labels.value_counts()
class_percentages = target_labels.value_counts(normalize=True) * 100

print("\nClass Distribution:", class_counts, sep="\n")
print("\nClass Percentages:", class_percentages, sep="\n")


# ## 1.5 Initial Findings & Next Steps
#
# Based on our revised exploration:
#
# 1.  **Dataset Shape:** The new dataset contains **450,176 samples** and 2 columns (`url` and `result`). This is a much larger and more robust dataset.
# 2.  **Data Content:** We have exactly what we need: the raw `url` as a string and the `result` label ('benign' or 'phishing').
# 3.  **No Missing Values:** The dataset is clean, with no missing URLs or labels.
# 4.  **Target Distribution:** The classes are not perfectly balanced. We have significantly more benign URLs (~77%) than phishing URLs (~23%). This is a realistic scenario. We must keep this imbalance in mind during model evaluation: a model that always predicts 'benign' would already reach ~77% Accuracy while catching no phishing URLs, so metrics like Precision and Recall will be more important than simple Accuracy.
#
# **Next Step:** We are now perfectly positioned to move on to **Notebook 02**. In the next stage, we will take the raw strings from the `url` column and perform **feature engineering** to create the numerical inputs our machine learning models will need.
