In [1]:
import pandas as pd
from urllib.parse import urlparse
import re

# Load the labeled dataset
df = pd.read_csv('../datasets/labeled_data.csv')

def extract_features(url):
    features = {}
    parsed_url = urlparse(url)
    
    # 1. Length-based Features
    features['URLLength'] = len(url)
    features['DomainLength'] = len(parsed_url.netloc)
    
    # 2. Count-based Features (Dots, Hyphens, Digits)
    features['NoOfDots'] = url.count('.')
    features['NoOfHyphens'] = url.count('-')
    features['NoOfDigits'] = sum(c.isdigit() for c in url)
    features['NoOfLetters'] = sum(c.isalpha() for c in url)
    
    # 3. Special Character Features
    features['NoOfEquals'] = url.count('=')
    features['NoOfQMark'] = url.count('?')
    features['NoOfAmpersand'] = url.count('&')
    
    # 4. Binary Features (HTTPS / IP)
    features['IsHTTPS'] = 1 if parsed_url.scheme == 'https' else 0
    # Check if the domain is an IP address
    features['IsIP'] = 1 if re.match(r"^\d{1,3}(\.\d{1,3}){3}$", parsed_url.netloc) else 0
    
    return features

# Apply extraction to the entire dataset
print("Extracting features... this may take a moment.")
feature_list = df['url'].apply(extract_features).tolist()
feature_df = pd.DataFrame(feature_list)

# Combine features with the original labels
final_df = pd.concat([feature_df, df['label']], axis=1)

# Preview the new numerical dataset
print(final_df.head())
final_df.to_csv('../datasets/feature_extracted_data.csv', index=False)

Extracting features... this may take a moment.
   URLLength  DomainLength  NoOfDots  NoOfHyphens  NoOfDigits  NoOfLetters  \
0         20             0         1            0           0           19   
1         28            11         1            0           4           19   
2         38            29         2            1           3           28   
3         14             0         1            0           0           13   
4         20             0         1            0           0           19   

   NoOfEquals  NoOfQMark  NoOfAmpersand  IsHTTPS  IsIP  label  
0           0          0              0        0     0      0  
1           0          0              0        1     0      1  
2           0          0              0        1     0      1  
3           0          0              0        0     0      0  
4           0          0              0        0     0      0  


In [2]:
import pandas as pd
from urllib.parse import urlparse
import re

# 1. Load the balanced dataset
print("Loading dataset...")
df_original = pd.read_csv('../datasets/labeled_data.csv')

# 2. Define the extraction function
def extract_lexical_features(url):
    features = {}
    try:
        parsed = urlparse(url)
        features['URLLength'] = len(url)
        features['NoOfDots'] = url.count('.')
        features['NoOfHyphens'] = url.count('-')
        features['NoOfDigits'] = sum(c.isdigit() for c in url)
        features['IsHTTPS'] = 1 if parsed.scheme == 'https' else 0
    except:
        # If a URL is malformed, return zeros
        return {'URLLength': 0, 'NoOfDots': 0, 'NoOfHyphens': 0, 'NoOfDigits': 0, 'IsHTTPS': 0}
    return features

# 3. Create the feature_df
print("Extracting features (this may take a minute)...")
# This creates a list of dictionaries, then converts it to a DataFrame
feature_list = df_original['url'].apply(extract_lexical_features).tolist()
feature_df = pd.DataFrame(feature_list)

# 4. Merge them back together
# We ensure the index matches so the labels stay with the right URLs
feature_df.index = df_original.index
final_df = pd.concat([df_original[['url', 'label']], feature_df], axis=1)

# 5. Sanitize Column Names for LightGBM
# This removes spaces or dots in column names that cause LightGBM to crash
final_df.columns = [re.sub(r'[^A-Za-z0-9_]+', '', str(col)) for col in final_df.columns]

# 6. Save and Verify
final_df.to_csv('../datasets/feature_extracted_data.csv', index=False)
print("\nSuccess! Dataset created with columns:")
print(final_df.columns.tolist())
print(f"Total rows: {len(final_df)}")

Loading dataset...
Extracting features (this may take a minute)...

Success! Dataset created with columns:
['url', 'label', 'URLLength', 'NoOfDots', 'NoOfHyphens', 'NoOfDigits', 'IsHTTPS']
Total rows: 106494
