# Remove duplicate URLs

In [3]:
# prompt: load csv, remove entries whose "repo_url" column contain duplicates, print the number of deleted rows and new size, then export to a new csv
# NOTE: the prompt says "repo_url", but the code below deduplicates on the "url" column.

import pandas as pd

def process_csv(input_file, output_file, column="url"):
    """Drop rows whose ``column`` value repeats and write the result to a new CSV.

    The first occurrence of each value is kept. The number of removed rows and
    the size of the resulting frame are printed, then the deduplicated frame is
    written to ``output_file`` without the index column.

    Errors are reported with ``print`` instead of being raised, so a failing
    call does not abort the notebook run; in that case nothing is written.

    Args:
        input_file: Path of the CSV file to read.
        output_file: Path of the CSV file to write.
        column: Name of the column whose duplicated values identify the rows
            to drop. Defaults to ``"url"``.

    Returns:
        None.
    """
    try:
        # Only a failed *read* may be reported as a missing input file; a
        # FileNotFoundError raised while writing the output must not be
        # mislabelled as an input problem.
        try:
            df = pd.read_csv(input_file)
        except FileNotFoundError:
            print(f"Error: Input file '{input_file}' not found.")
            return

        # Check the column up front: pandas raises KeyError(Index([...])) for a
        # missing subset column, which renders poorly inside an error message.
        if column not in df.columns:
            print(f"Error: Column '{column}' not found in the CSV file.")
            return

        # Build a new frame rather than mutating in place, so the loaded data
        # stays intact and both sizes remain available.
        deduplicated = df.drop_duplicates(subset=column, keep='first')
        deleted_rows = len(df) - len(deduplicated)

        print(f"Number of deleted rows: {deleted_rows}")
        print(f"New size of the DataFrame: {len(deduplicated)}")

        deduplicated.to_csv(output_file, index=False)

    except Exception as e:
        # Deliberate best-effort: report the problem instead of raising.
        print(f"An unexpected error occurred: {e}")



In [4]:
# Source (v5) and destination (v6, deduplicated) CSV files on the mounted Drive.
input_csv_file = "/content/drive/MyDrive/datasets/muict-naist-senior/rq3/treatment_present/hn_rq3_repos_treatment_v5.csv"
output_csv_file = "/content/drive/MyDrive/datasets/muict-naist-senior/rq3/treatment_present/hn_rq3_repos_treatment_v6_unduplicated.csv"

# Deduplicate the v5 file and write the result as v6.
process_csv(input_file=input_csv_file, output_file=output_csv_file)

Number of deleted rows: 2
New size of the DataFrame: 292
