## Phishing URL dataset collected from https://github.com/JPCERTCC/phishurl-list
Refining it for project dataset generation.

In [10]:
import os
import pandas as pd

def merge_csv_files_with_urls(input_folder, output_file, url_column_name="url"):
    """
    Merge the URL column of every CSV file in a folder into a single CSV file.

    Files are read in sorted file-name order so the merged output is the same
    on every run. Files that cannot be read, or that lack the URL column, are
    reported and skipped. The output file itself is never used as an input,
    so writing the result into ``input_folder`` and re-running does not
    duplicate rows. The output directory is created when it does not exist.

    Args:
        input_folder (str): Path to the folder containing CSV files.
        output_file (str): Path to save the merged CSV file.
        url_column_name (str): Name of the URL column to retain.

    Returns:
        None. Progress and problems are reported with print().

    Raises:
        OSError: If ``input_folder`` cannot be listed or ``output_file``
            cannot be written.
    """
    output_path = os.path.abspath(output_file)

    # os.listdir() returns entries in arbitrary order; sort for a stable merge order.
    all_files = []
    for name in sorted(os.listdir(input_folder)):
        if not name.endswith('.csv'):
            continue
        path = os.path.join(input_folder, name)
        # A previous merge result living in the input folder must not be re-merged.
        if os.path.abspath(path) == output_path:
            continue
        all_files.append(path)

    if not all_files:
        print("No CSV files found in the folder.")
        return

    # One single-column DataFrame per successfully loaded file
    data_frames = []

    for file in all_files:
        try:
            df = pd.read_csv(file)
        except Exception as e:
            # Deliberately broad: one unreadable file should not abort the whole merge.
            print(f"Failed to read {file}: {e}")
            continue

        if url_column_name not in df.columns:
            print(f"'{url_column_name}' column not found in {file}. Skipping.")
            continue

        # Keep only the URL column
        df = df[[url_column_name]]
        data_frames.append(df)
        print(f"Loaded {file} with {len(df)} URL records.")

    if not data_frames:
        print("No valid URL data found in the CSV files.")
        return

    combined_df = pd.concat(data_frames, ignore_index=True)

    # to_csv() does not create missing parent directories.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    combined_df.to_csv(output_file, index=False)
    print(f"All CSV files merged into {output_file} with {len(combined_df)} URL records.")

# Run the merge with this project's paths
input_folder = "data/"  # folder holding the source .csv files
output_file = "data_output/combined.csv"  # destination of the merged CSV
url_column_name = "URL"  # header of the URL column in the source files

merge_csv_files_with_urls(
    input_folder=input_folder,
    output_file=output_file,
    url_column_name=url_column_name,
)


Loaded data/202306.csv with 10581 URL records.
Loaded data/202001.csv with 646 URL records.
Loaded data/202210.csv with 5375 URL records.
Loaded data/201901.csv with 315 URL records.
Loaded data/202204.csv with 5397 URL records.
Loaded data/202410.csv with 4729 URL records.
Loaded data/201910.csv with 588 URL records.
Loaded data/202007.csv with 950 URL records.
Loaded data/201906.csv with 458 URL records.
Loaded data/201909.csv with 567 URL records.
Loaded data/202305.csv with 7250 URL records.
Loaded data/202303.csv with 3936 URL records.
Loaded data/201903.csv with 588 URL records.
Loaded data/202004.csv with 971 URL records.
Loaded data/202208.csv with 5336 URL records.
Loaded data/201912.csv with 581 URL records.
Loaded data/201902.csv with 434 URL records.
Loaded data/202401.csv with 5772 URL records.
Loaded data/202003.csv with 1124 URL records.
Loaded data/202206.csv with 7021 URL records.
Loaded data/202106.csv with 1626 URL records.
Loaded data/202402.csv with 7994 URL record

In [6]:
# Print the current working directory (IPython shell escape).
!pwd

/home/test/Desktop/Data_gen


In [8]:
# List all entries of the working directory, hidden ones included, in long format.
!ls -al

total 24
drwxrwxr-x 5 test test 4096 Feb 12 12:29 .
drwxr-xr-x 7 test test 4096 Feb 12 12:29 ..
drwxrwxr-x 2 test test 4096 Feb 12 12:26 data
-rw-rw-r-- 1 test test 3103 Feb 12 12:28 data_gen.ipynb
drwxrwxr-x 2 test test 4096 Feb 12 12:28 data_output
drwxrwxr-x 2 test test 4096 Feb 12 12:22 .ipynb_checkpoints


In [13]:
# List all entries of the working directory, hidden ones included, in long format.
!ls -al

total 36
drwxrwxr-x 5 test test 4096 Feb 12 12:40 .
drwxr-xr-x 7 test test 4096 Feb 12 12:29 ..
drwxrwxr-x 2 test test 4096 Feb 12 12:30 data
drwxrwxr-x 2 test test 4096 Feb 12 12:31 data_output
-rw-rw-r-- 1 test test 3900 Feb 12 12:40 email_data_gen.ipynb
drwxrwxr-x 2 test test 4096 Feb 12 12:38 .ipynb_checkpoints
-rw-rw-r-- 1 test test 8371 Feb 12 12:39 url_data_gen.ipynb


In [4]:
# Load the merged CSV from data_output/ to check its size.
df = pd.read_csv("data_output/combined.csv")
# Last expression of the cell, so the notebook displays (row count, column count).
df.shape

(220006, 1)