## EU DATA cleaning

## Remove blank Names

In [1]:
import pandas as pd
import re

### Load Data

In [44]:
# Read the name-alias source CSV into the working frame `df`.
df = pd.read_csv("name-fix-eu-list.csv")

In [45]:
# Display the column labels of the frame that was just loaded.
df.columns

Index(['Entity_LogicalId', 'Entity_EU_ReferenceNumber', 'Entity_Remark',
       'Entity_SubjectType', 'Entity_SubjectType_ClassificationCode',
       'NameAlias_LastName', 'NameAlias_FirstName', 'NameAlias_MiddleName',
       'NameAlias_WholeName', 'NameAlias_Title', 'NameAlias_Function',
       'NameAlias_LogicalId', 'NameAlias_Remark', 'Address_City',
       'Address_Street', 'Address_PoBox', 'Address_ZipCode', 'Address_Region',
       'Address_Place', 'Address_ContactInfo', 'Address_Remark',
       'BirthDate_BirthDate', 'BirthDate_Day', 'BirthDate_Month',
       'BirthDate_Year', 'BirthDate_Place', 'BirthDate_City',
       'BirthDate_Remark', 'Identification_Number', 'Identification_Remark',
       'Citizenship_Remark'],
      dtype='object')

In [46]:
# Name-alias columns; the row filter in the next cell inspects only these.
alias_cols = [
    # name parts
    "NameAlias_LastName", "NameAlias_FirstName",
    "NameAlias_MiddleName", "NameAlias_WholeName",
    # descriptive fields
    "NameAlias_Title", "NameAlias_Function",
    # identifier and free-text note
    "NameAlias_LogicalId", "NameAlias_Remark",
]


In [47]:
# Keep only rows where at least one alias column has English text.
# The helper is defined in this cell because no earlier cell defines it;
# without it, a fresh-kernel "Run All" stops here with a NameError.
def has_english(text):
    """Return True if ``text`` contains at least one ASCII letter (A-Z, a-z).

    Missing values (as reported by ``pd.isna``) count as having no English
    text. Any other value is converted with ``str()`` before the search.
    """
    if pd.isna(text):
        return False
    return bool(re.search(r"[A-Za-z]", str(text)))


df = df[df[alias_cols].apply(lambda row: any(has_english(val) for val in row), axis=1)]

In [48]:
# Column pruning in two passes:
#   1. remove columns in which every value is missing;
#   2. remove columns whose values are all empty once stripped of whitespace
#      (missing values are treated as empty strings for this check).
df = df.dropna(axis=1, how="all")
non_blank_cols = []
for col in df.columns:
    stripped = df[col].apply(lambda x: str(x).strip() if pd.notna(x) else "")
    if stripped.any():
        non_blank_cols.append(col)
df = df[non_blank_cols]

In [49]:
# Write the filtered frame to CSV; index=False leaves the row index out of the file.
df.to_csv("output_namealias_english_2.csv", index=False)

### Load Data

## Remove blank Identification

In [58]:
# Read the identification source CSV, replacing the previous contents of `df`.
df = pd.read_csv("eu-list-IDN-src.csv")

In [59]:
# Display the column labels of the frame that was just loaded.
df.columns

Index(['Entity_LogicalId', 'Entity_EU_ReferenceNumber', 'Entity_SubjectType',
       'Entity_SubjectType_ClassificationCode', 'Identification_Number',
       'Identification_TypeCode', 'Identification_TypeDescription',
       'Identification_Region', 'Identification_CountryIso2Code',
       'Identification_CountryDescription', 'Identification_Remark'],
      dtype='object')

In [60]:
# Identification columns inspected by the row filter below
# (the list keeps the name `alias_cols` that the filter cell expects).
alias_cols = [
    'Identification_Number',
    'Identification_TypeCode',
    'Identification_TypeDescription',
    'Identification_Region',
    'Identification_CountryIso2Code',
    'Identification_CountryDescription',
    'Identification_Remark',
]

In [61]:
# Helper: does a cell value contain any ASCII letter?
def has_english(text):
    """Return True if ``text`` contains at least one ASCII letter (A-Z, a-z).

    Missing values (as reported by ``pd.isna``) yield False. Any other value
    is converted with ``str()`` before the regex search.
    """
    if pd.notna(text):
        return re.search(r"[A-Za-z]", str(text)) is not None
    return False

In [62]:
# Retain a row only if at least one of the columns listed in `alias_cols`
# holds a value for which has_english() returns True.
row_has_english = df[alias_cols].apply(
    lambda row: any(map(has_english, row)),
    axis=1,
)
df = df[row_has_english]

In [63]:
# Column pruning in two passes:
#   1. remove columns in which every value is missing;
#   2. remove columns whose values are all empty once stripped of whitespace
#      (missing values are treated as empty strings for this check).
df = df.dropna(axis=1, how="all")
non_blank_cols = []
for col in df.columns:
    stripped = df[col].apply(lambda x: str(x).strip() if pd.notna(x) else "")
    if stripped.any():
        non_blank_cols.append(col)
df = df[non_blank_cols]

In [64]:
# Write the filtered frame to CSV; index=False leaves the row index out of the file.
df.to_csv("output_IDN_english.csv", index=False)

### Remove Blank BirthDate

In [70]:
# Read the birth-date source CSV, replacing the previous contents of `df`.
df = pd.read_csv("eu-list-BirthDate-src.csv")

In [71]:
# Display the column labels of the frame that was just loaded.
df.columns

Index(['Entity_LogicalId', 'Entity_EU_ReferenceNumber', 'BirthDate_BirthDate',
       'BirthDate_Day', 'BirthDate_Month', 'BirthDate_Year',
       'BirthDate_Region', 'BirthDate_Place', 'BirthDate_City',
       'BirthDate_CountryIso2Code', 'BirthDate_CountryDescription',
       'BirthDate_Remark'],
      dtype='object')

In [72]:
# Birth-date columns inspected by the row filter below
# (the list keeps the name `alias_cols` that the filter cell expects).
alias_cols = [
    'BirthDate_BirthDate',
    'BirthDate_Day',
    'BirthDate_Month',
    'BirthDate_Year',
    'BirthDate_Region',
    'BirthDate_Place',
    'BirthDate_City',
    'BirthDate_CountryIso2Code',
    'BirthDate_CountryDescription',
    'BirthDate_Remark',
]

In [73]:
# Helper: does a cell value contain any ASCII letter?
def has_english(text):
    """Return True if ``text`` contains at least one ASCII letter (A-Z, a-z).

    Missing values (as reported by ``pd.isna``) yield False. Any other value
    is converted with ``str()`` before the regex search.
    """
    if pd.notna(text):
        return re.search(r"[A-Za-z]", str(text)) is not None
    return False

In [74]:
# Retain a row only if at least one of the columns listed in `alias_cols`
# holds a value for which has_english() returns True.
row_has_english = df[alias_cols].apply(
    lambda row: any(map(has_english, row)),
    axis=1,
)
df = df[row_has_english]

In [75]:
# Column pruning in two passes:
#   1. remove columns in which every value is missing;
#   2. remove columns whose values are all empty once stripped of whitespace
#      (missing values are treated as empty strings for this check).
df = df.dropna(axis=1, how="all")
non_blank_cols = []
for col in df.columns:
    stripped = df[col].apply(lambda x: str(x).strip() if pd.notna(x) else "")
    if stripped.any():
        non_blank_cols.append(col)
df = df[non_blank_cols]

In [76]:
# Write the filtered frame to CSV; index=False leaves the row index out of the file.
df.to_csv("output_BirthDt_english.csv", index=False)

### Remove Blank Citizenship

In [77]:
# Read the citizenship source CSV, replacing the previous contents of `df`.
df = pd.read_csv("eu-list-Citizen-src.csv")

In [78]:
# Display the column labels of the frame that was just loaded.
df.columns

Index(['Entity_LogicalId', 'Entity_EU_ReferenceNumber', 'Entity_SubjectType',
       'Entity_SubjectType_ClassificationCode', 'Citizenship_CountryIso2Code',
       'Citizenship_Remark'],
      dtype='object')

In [79]:
# Citizenship columns inspected by the row filter below
# (the list keeps the name `alias_cols` that the filter cell expects).
alias_cols = [
    'Citizenship_CountryIso2Code',
    'Citizenship_Remark',
]

In [80]:
# Helper: does a cell value contain any ASCII letter?
def has_english(text):
    """Return True if ``text`` contains at least one ASCII letter (A-Z, a-z).

    Missing values (as reported by ``pd.isna``) yield False. Any other value
    is converted with ``str()`` before the regex search.
    """
    if pd.notna(text):
        return re.search(r"[A-Za-z]", str(text)) is not None
    return False

In [81]:
# Retain a row only if at least one of the columns listed in `alias_cols`
# holds a value for which has_english() returns True.
row_has_english = df[alias_cols].apply(
    lambda row: any(map(has_english, row)),
    axis=1,
)
df = df[row_has_english]

In [82]:
# Column pruning in two passes:
#   1. remove columns in which every value is missing;
#   2. remove columns whose values are all empty once stripped of whitespace
#      (missing values are treated as empty strings for this check).
df = df.dropna(axis=1, how="all")
non_blank_cols = []
for col in df.columns:
    stripped = df[col].apply(lambda x: str(x).strip() if pd.notna(x) else "")
    if stripped.any():
        non_blank_cols.append(col)
df = df[non_blank_cols]

In [83]:
# Write the filtered frame to CSV; index=False leaves the row index out of the file.
df.to_csv("output_Citizen_english.csv", index=False)