In [None]:
import pandas as pd

def analyze_duplicates(file_path):
    """Report repeated 'pmid' values in a CSV file and save a de-duplicated copy.

    Reads ``{file_path}.csv``, prints the total number of rows, how many pmid
    values occur exactly once, how many occur more than once, and each
    repeated pmid together with its occurrence count. The rows that remain
    after dropping repeated pmids (pandas keeps the first occurrence by
    default) are written to ``{file_path}_no_duplicates.csv``.

    Args:
        file_path: Path of the input CSV *without* the ``.csv`` extension;
            the extension is appended here.

    Returns:
        None. When the file has no 'pmid' column, an error message is
        printed and no output file is written.
    """
    # Load CSV file into a DataFrame
    df = pd.read_csv(f'{file_path}.csv')

    # Validate if 'pmid' column exists
    if 'pmid' not in df.columns:
        print("Error: The CSV file must contain a column named 'pmid'.")
        return

    # Calculate total number of entries (rows, repeated pmids included)
    total_entries = len(df)

    # Count the occurrences of each pmid.
    # NOTE: value_counts() leaves out missing values by default, so rows with
    # an empty pmid are counted in total_entries but not in the figures below.
    pmid_counts = df['pmid'].value_counts()

    # Separate pmids seen more than once from pmids seen exactly once
    duplicated_entries = pmid_counts[pmid_counts > 1]
    non_duplicated_entries = pmid_counts[pmid_counts == 1]

    # Keep one row per pmid and save the result next to the input file
    df_unique = df.drop_duplicates(subset='pmid')
    df_unique.to_csv(f'{file_path}_no_duplicates.csv', index=False)

    # Display results
    print(f"Total number of entries: {total_entries}")
    print(f"Total number of not-duplicated entries: {len(non_duplicated_entries)}")
    print(f"Total number of duplicated entries: {len(duplicated_entries)}")
    print("Duplicated entries details:")
    for pmid, count in duplicated_entries.items():
        print(f"{pmid} - {count}")

def merge_and_remove_duplicates(file_path_1, file_path_2, output_file):
    """Stack two CSV files and write one row per 'pmid' to ``output_file``.

    The rows of ``file_path_1`` are placed ahead of those of ``file_path_2``
    before repeated pmids are dropped, and the result is saved without the
    index column. A confirmation line naming the output file is printed.

    Args:
        file_path_1: Path of the first CSV file (extension included).
        file_path_2: Path of the second CSV file (extension included).
        output_file: Path of the CSV file to write.

    Returns:
        None. When either input has no 'pmid' column, an error message is
        printed and nothing is written.
    """
    first = pd.read_csv(file_path_1)
    second = pd.read_csv(file_path_2)

    # Both inputs need a 'pmid' column to de-duplicate on.
    if not all('pmid' in frame.columns for frame in (first, second)):
        print("Error: Both CSV files must contain a column named 'pmid'.")
        return

    # Stack first, then collapse repeated pmids in a separate step.
    stacked = pd.concat([first, second])
    deduplicated = stacked.drop_duplicates(subset='pmid')

    deduplicated.to_csv(output_file, index=False)
    print(f"Merged file with duplicates removed saved to: {output_file}")

def find_non_included_entries(file_path_1, file_path_2):
    """Print the pmids found in the first CSV file but absent from the second.

    Prints the number of rows in ``file_path_1``, the number of those rows
    whose pmid does not occur in ``file_path_2``, and then each such pmid on
    its own line.

    Args:
        file_path_1: Path of the CSV file whose entries are checked.
        file_path_2: Path of the CSV file to check against.

    Returns:
        None. When either file has no 'pmid' column, only an error message
        is printed.
    """
    candidates = pd.read_csv(file_path_1)
    reference = pd.read_csv(file_path_2)

    # Both inputs need a 'pmid' column for the comparison.
    if not all('pmid' in frame.columns for frame in (candidates, reference)):
        print("Error: Both CSV files must contain a column named 'pmid'.")
        return

    # Rows of the first file whose pmid never shows up in the second file.
    present_in_reference = candidates['pmid'].isin(reference['pmid'])
    missing_rows = candidates.loc[~present_in_reference]
    missing_total = len(missing_rows)

    print(f"Total number of entries in {file_path_1}: {len(candidates)}")
    print(f"Total number of entries not included in {file_path_2}: {missing_total}")
    print("Entries not included:")
    for missing_pmid in missing_rows['pmid']:
        print(missing_pmid)

"""
if __name__ == "__main__":
    #file1 = 'mentalHealth'
    #file2 = 'parkinson'
    file1 = 'pmid_list'
    analyze_duplicates(file1)

    file1 = 'pmid_list'
    file2 = 'new_pmid'
    output_file = 'merged_output.csv'

    analyze_duplicates(file1)
    analyze_duplicates(file2)

    find_non_included_entries(f'{file1}_no_duplicates.csv', f'{file2}_no_duplicates.csv')

    merge_and_remove_duplicates(f'{file1}_no_duplicates.csv', f'{file2}_no_duplicates.csv', output_file)
"""

In [None]:
# Forward slashes replace the backslash separator: "\p" is not a valid string
# escape sequence, and "/" is accepted as a path separator on Windows and POSIX.
# The '.csv' extension is appended by analyze_duplicates itself.
analyze_duplicates('merged_literature/parkinson_old_new_merged')

In [None]:
# List the pmids of the new Parkinson literature set that are absent from the old one.
# Forward slashes replace the backslash separators: "\p" is not a valid string
# escape sequence, and "/" is accepted as a path separator on Windows and POSIX.
find_non_included_entries(
    "new_literature/parkinson/parkinson_new_merged_no_duplicates.csv",
    "old_literature/parkinson/parkinson_old_merged_no_duplicates.csv",
)