In [1]:
import os
import pandas as pd

In [2]:
def clean_name(x: str) -> str:
    """
    Normalizes an entity name so that names can be compared

    :param x: raw name
    :returns: lower-cased name with leading and trailing punctuation
        ('.', ',', '?', ':', ';', '(', ')') and spaces removed
    """
    edge_chars = '.,?:;() '
    lowered = x.lower()
    return lowered.strip(edge_chars)


def compare_dataframes(df1: pd.DataFrame, df2: pd.DataFrame) -> pd.DataFrame:
    """
    Finds entities present in both datasets, matched on the normalized 'name' column

    Names are normalized with clean_name and stored in a 'trasn_name' key column.
    The input frames are not modified: the key column is added to copies only.

    :param df1: first dataset; must contain a 'name' column of strings
    :param df2: second dataset; must contain a 'name' column of strings
    :returns: rows of df1 whose normalized name occurs in df2, joined with the
        matching rows of df2 on 'trasn_name' (columns present in both frames
        get the pandas '_x' / '_y' suffixes)
    """
    # Matching is done by name; an earlier variant matched on the 'content' column.
    # assign() returns new frames, so the callers' data stays untouched.
    left = df1.assign(trasn_name=df1['name'].apply(clean_name))
    right = df2.assign(trasn_name=df2['name'].apply(clean_name))

    df_dupl = left[left['trasn_name'].isin(right['trasn_name'])]
    df_dupl = df_dupl.merge(right, on='trasn_name', how='left')

    return df_dupl

In [3]:
def compare_datasets(path1: str, path2: str, dupl_path: str = '../duplicates'):
    """
    Compares two CSV datasets by entity name and saves the matches to disk

    The result is written to '<dupl_path>/<name1>_<name2>_dupl.csv', where name1
    and name2 are the input file names without directory and extension.
    Nothing is written if either dataset has no 'name' column.

    :param path1: path to the first CSV file
    :param path2: path to the second CSV file
    :param dupl_path: directory for the result file; created if it does not exist
    """
    df1 = pd.read_csv(path1)
    df2 = pd.read_csv(path2)

    # datasets without a 'name' column cannot be compared
    if 'name' not in df1.columns or 'name' not in df2.columns:
        return

    df_dupl = compare_dataframes(df1, df2)

    # basename understands the platform's path separators, unlike split('/')
    name1 = os.path.splitext(os.path.basename(path1))[0]
    name2 = os.path.splitext(os.path.basename(path2))[0]

    # to_csv does not create missing directories
    os.makedirs(dupl_path, exist_ok=True)
    df_dupl.to_csv(f'{dupl_path}/{name1}_{name2}_dupl.csv', index=False)

In [4]:
# Collect every dataset file located one level below ../data.
# os.listdir returns entries in arbitrary order, so both levels are sorted to
# keep the pairing order (and therefore the result file names) stable between runs.
# Stray files next to the dataset folders are skipped: listing them would fail.
data_dirs = sorted(d for d in os.listdir('../data') if os.path.isdir('../data/' + d))
files = []

for d in data_dirs:
    dir_path = '../data/' + d
    # keep regular files only; sub-folders cannot be read as datasets
    files += [
        dir_path + '/' + i
        for i in sorted(os.listdir(dir_path))
        if os.path.isfile(dir_path + '/' + i)
    ]


In [5]:
# Compare every unordered pair of dataset files exactly once:
# each file is paired with the files that come after it in the list.
for pos, first_path in enumerate(files):
    for second_path in files[pos + 1:]:
        compare_datasets(first_path, second_path)

In [6]:
# Report the result files that contain matches and delete the ones with no rows.
dupl_files = os.listdir('../duplicates')
for fname in dupl_files:
    fpath = '../duplicates/' + fname
    n_rows = len(pd.read_csv(fpath))
    if n_rows == 0:
        os.remove(fpath)
        continue
    print(fname, 'has duplicates')


repeats_target_viral_target_dupl.csv has duplicates
molecule_data_mirna_target_dupl.csv has duplicates
viral_target_ribosomal_target_dupl.csv has duplicates
riboswitch_target_ribosomal_target_dupl.csv has duplicates
repeats_target_ribosomal_target_dupl.csv has duplicates
mirna_target_viral_target_dupl.csv has duplicates
molecule_data_viral_target_dupl.csv has duplicates
repeats_target_mirna_target_dupl.csv has duplicates
molecule_data_riboswitch_target_dupl.csv has duplicates
protein_data_molecule_data_dupl.csv has duplicates
protein_data_protein_data_biogrid_dupl.csv has duplicates
molecule_data_ribosomal_target_dupl.csv has duplicates
mirna_target_ribosomal_target_dupl.csv has duplicates


In [7]:
# Rewrite each result file so that it keeps only the rows whose 'content_x'
# differs from 'content_y'. Files without both content columns are rewritten
# with all of their rows.
dupl_files = os.listdir('../duplicates')
for f in dupl_files:
    df = pd.read_csv('../duplicates/' + f)
    if 'content_x' in df.columns and 'content_y' in df.columns:
        # One vectorized comparison instead of a per-row lookup. Missing values
        # compare as different (NaN != NaN), so such rows are kept.
        res_df = df[df['content_x'] != df['content_y']]
    else:
        res_df = df

    res_df.to_csv(f'../duplicates/{f}', index=False)