# Project CYB333: Cybersecurity Data Analysis

This project explores cybersecurity threats by analyzing a phishing dataset I created, using Python (pandas and Matplotlib). The analysis looks for patterns such as the balance of phishing vs. legitimate URLs, average URL length per class, and how often URLs use IP addresses or URL-shortening services.


## Step 1: Import Libraries and Load the Dataset

In [None]:
# Import the libraries
import pandas as pd
import matplotlib.pyplot as plt

# Read the dataset; the path is resolved relative to the working directory
csv_path = 'phishing_dataset.csv'
data = pd.read_csv(csv_path)

# Preview the top rows to confirm the file parsed as expected
data.head()


## Step 2: Explore the Dataset

In [None]:
# Count missing values per column. The heading is printed separately so the
# Series output is not offset by the separator space print() inserts
# between its arguments.
print("Missing values:")
print(data.isnull().sum())

# Column names, non-null counts and dtypes.
# DataFrame.info() writes its report straight to stdout and returns None,
# so it must not be wrapped in print() (that would emit a stray "None").
print("\nData info:")
data.info()

# Summary statistics (describe() covers the numeric columns by default)
print("\nData description:")
print(data.describe())


## Step 3: Analyze the Dataset

In [None]:
# Row count shared by both percentage calculations below
total_urls = len(data)

# Number of rows for each value of the Label column
print("Phishing vs Legitimate URLs:")
print(data['Label'].value_counts())

# Mean of URL_Length within each Label group
print("\nAverage URL Lengths:")
mean_length_by_label = data.groupby('Label')['URL_Length'].mean()
print(mean_length_by_label)

# Sum of the Has_IP_Address column over the row count, as a percentage
# (presumably a 0/1 flag column -- verify against the dataset)
ip_hits = data['Has_IP_Address'].sum()
ip_usage_percentage = (ip_hits / total_urls) * 100
print(f"\nPercentage of URLs using IP Address: {ip_usage_percentage:.2f}%")

# Same calculation for the Is_Shortened column
shortened_hits = data['Is_Shortened'].sum()
shortened_usage_percentage = (shortened_hits / total_urls) * 100
print(f"Percentage of URLs that are Shortened: {shortened_usage_percentage:.2f}%")


## Step 4: Bar Graph of Phishing vs Legitimate URLs

In [None]:
# Bar Graph - Phishing vs Legitimate URLs
import matplotlib.pyplot as plt

# Count phishing vs legitimate.
# value_counts() orders by frequency, so the bar order would depend on the
# data; sorting by label keeps the x-axis order fixed (0 first, then 1).
label_counts = data['Label'].value_counts().sort_index()

# Choose each bar's colour from its label value rather than its position,
# so phishing (1) is always red and legitimate (0) is always green no matter
# which class is more common. Unexpected label values fall back to gray.
label_colors = {0: 'green', 1: 'red'}
bar_colors = [label_colors.get(label, 'gray') for label in label_counts.index]

# Plot
plt.figure(figsize=(6, 4))
label_counts.plot(kind='bar', color=bar_colors)

# Titles and labels
plt.title('Phishing vs Legitimate URLs')
plt.xlabel('Label (1 = Phishing, 0 = Legitimate)')
plt.ylabel('Count')
plt.xticks(rotation=0)  # Keep labels horizontal

# Show the graph
plt.show()


## Step 5: Pie Chart of Shortened vs Normal URLs

In [None]:
# Pie Chart - Shortened vs Normal URLs
# Count how many shortened vs not.
# value_counts() orders by frequency, so the counts are sorted by flag value
# to give a data-independent wedge order (0 first, then 1).
shortened_counts = data['Is_Shortened'].value_counts().sort_index()

# Derive each wedge's label and colour from its flag value instead of its
# position. A fixed two-item label list would be swapped whenever shortened
# URLs are the majority, and would make plt.pie raise when only one class
# is present in the data.
wedge_names = {0: 'Normal URL', 1: 'Shortened URL'}
wedge_colors = {0: 'lightblue', 1: 'lightcoral'}
labels = [wedge_names.get(flag, str(flag)) for flag in shortened_counts.index]
colors = [wedge_colors.get(flag, 'lightgray') for flag in shortened_counts.index]

# Plot pie chart
plt.figure(figsize=(6, 6))
plt.pie(shortened_counts, labels=labels, autopct='%1.1f%%', startangle=140, colors=colors)

# Add a title
plt.title('Percentage of Shortened vs Normal URLs')

# Show the chart
plt.show()
