# 🧹 Clean ISB Translated Data (No Replacements Needed)
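This notebook formats the `contact` column into readable lines and skips text replacements (since none are needed). The first group of cells is an earlier `csv`-based version that is kept commented out; the active pandas implementation starts at the "Best Practice Version" cell further down.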

In [None]:

# import csv
# import pandas as pd
# import ast

In [None]:

# # --- File paths ---
# uncleaned_isb_english = '/Users/kiranmulawad/AI-Funding/1_scraping/isb/data/funding-isb-english.csv'
# cleaned_isb_english = '/Users/kiranmulawad/AI-Funding/1_scraping/isb/data/funding-isb-cleaned.csv'


In [None]:

# # --- Clean contact formatting if separator is missing ---
# def clean_contact_info(contact_info):
#     if "Advice on business development" in contact_info:
#         if "|" not in contact_info.split("Advice on business development")[1][:1]:
#             contact_info = contact_info.replace("Advice on business development", "Advice on business development |", 1)
#     return contact_info


In [None]:

# # --- Format list-style contact info into readable lines ---
# def format_contact_info(contact_column):
#     formatted_contacts = []
#     try:
#         contacts = ast.literal_eval(contact_column) if isinstance(contact_column, str) else contact_column
#     except:
#         return contact_column

#     for contact in contacts:
#         parts = [p.strip() for p in contact.split("|")]
#         if len(parts) == 3:
#             formatted_contacts.append(f"- Name: {parts[0]} | Phone: {parts[1]} | Email: {parts[2]}")

#     return "\n".join(formatted_contacts) if formatted_contacts else "No contact information available."


In [None]:

# # --- Load and clean rows (only contact formatting applied) ---
# with open(uncleaned_isb_english, mode='r', encoding='utf-8') as infile:
#     reader = csv.DictReader(infile)
#     data = list(reader)

# cleaned_data = []
# for row in data:
#     cleaned_row = {}
#     for key, value in row.items():
#         if key == 'contact':
#             value = clean_contact_info(value)
#             value = format_contact_info(value)
#         cleaned_row[key] = value
#     cleaned_data.append(cleaned_row)


In [None]:

# # --- Save cleaned output ---
# with open(cleaned_isb_english, mode='w', encoding='utf-8', newline='') as outfile:
#     writer = csv.DictWriter(outfile, fieldnames=data[0].keys())
#     writer.writeheader()
#     writer.writerows(cleaned_data)

# print(f"✅ Cleaned file saved to: {cleaned_isb_english}")


✅ Cleaned file saved to: /Users/kiranmulawad/AI-Funding/1_scraping/isb/data/funding-isb-cleaned.csv


In [1]:
# %% [markdown]
# # 🧹 Clean ISB Translated Data (Best Practice Version)
# Formats the contact column for readability and ensures all columns are tidy.

# %%
import os

import pandas as pd

# --- File paths ---
# The data directory defaults to the location used when this notebook was
# written, but can be redirected by setting the ISB_DATA_DIR environment
# variable so the notebook runs on other machines without editing this cell.
isb_data_dir = os.environ.get(
    'ISB_DATA_DIR',
    '/Users/kiranmulawad/AI-Funding/1_scraping/isb/data',
)
# NOTE: the input file name uses underscores, the output file name uses hyphens.
uncleaned_isb_english = os.path.join(isb_data_dir, 'funding_isb_english.csv')
cleaned_isb_english = os.path.join(isb_data_dir, 'funding-isb-cleaned.csv')



In [2]:
# %%
def _strip_label(value, label):
    """Return `value` without a leading '<label>:' prefix (case-insensitive)."""
    prefix = f"{label}:"
    if value[:len(prefix)].lower() == prefix.lower():
        return value[len(prefix):].strip()
    return value


def clean_contact(contact):
    """
    Format a raw contact cell as one readable bullet line per contact.

    Multiple contacts may be separated by '||' or by newlines. Within one
    contact, fields are separated by '|', or by ',' when the entry already
    carries 'Phone:' and 'Email:' labels.

    Parameters
    ----------
    contact : object
        The raw cell value. Anything that is not a string (NaN, None,
        numbers, lists, ...) is treated as "no contact".

    Returns
    -------
    str
        - Exactly three fields: "- Name: <a> | Phone: <b> | Email: <c>".
          Labels already present on a field are not duplicated.
        - Any other number of fields: "- " followed by all fields joined
          with " | ", so no field is dropped.
        - Lines for multiple contacts are joined with newlines.
        - "" when there is nothing to format.
    """
    # A plain type check covers NaN/None and also avoids the ambiguous
    # truth-value error that pd.isna() produces for list-like inputs.
    if not isinstance(contact, str):
        return ""

    # Normalise both supported contact separators to '||' before splitting.
    entries = [c.strip() for c in contact.replace('\n', '||').split('||') if c.strip()]

    formatted_contacts = []
    for c in entries:
        if '|' in c:
            parts = [p.strip() for p in c.split('|')]
        elif ',' in c and 'Email:' in c and 'Phone:' in c:
            # Comma-separated style whose fields are already labelled.
            parts = [p.strip() for p in c.split(',')]
        else:
            parts = [c]

        if len(parts) == 3:
            # Drop labels the source already supplied so they are not
            # repeated (e.g. "Phone: Phone: 123").
            name, phone, email = (
                _strip_label(part, label)
                for part, label in zip(parts, ("Name", "Phone", "Email"))
            )
            formatted_contacts.append(f"- Name: {name} | Phone: {phone} | Email: {email}")
        else:
            # One, two, or more than three fields: keep every field.
            formatted_contacts.append("- " + " | ".join(parts))

    return "\n".join(formatted_contacts)



In [3]:
# %%
# Load the raw translated data.
df = pd.read_csv(uncleaned_isb_english)

# Strip surrounding whitespace from string cells in object-dtype columns.
# The isinstance guard leaves non-string cells (NaN, booleans, numbers)
# untouched; the `.str` accessor would raise on an object column holding no
# strings and would turn non-string cells into NaN in a mixed column.
for col in df.select_dtypes(include='object').columns:
    df[col] = df[col].map(lambda cell: cell.strip() if isinstance(cell, str) else cell)

# Format the contact column into one readable line per contact.
if 'contact' in df.columns:
    df['contact'] = df['contact'].apply(clean_contact)

# Drop fully duplicated rows and renumber the index from zero.
df = df.drop_duplicates().reset_index(drop=True)




In [4]:
# Write the cleaned frame to disk (without the index column) and report the destination.
df.to_csv(path_or_buf=cleaned_isb_english, index=False)
print(
    f"✅ Cleaned file saved to: {cleaned_isb_english}"
)



✅ Cleaned file saved to: /Users/kiranmulawad/AI-Funding/1_scraping/isb/data/funding-isb-cleaned.csv


In [5]:
# Preview the first three rows of the name and formatted contact columns.
df.loc[:, ['name', 'contact']].head(3)

Unnamed: 0,name,contact
0,Consulting program for SMEs,- Name: Business development consulting | Phon...
1,Consulting program for business start-ups,- Name: Business development consulting | Phon...
2,"Business relocation, investment service",- Name: Tim Sandrock | Phone: 06131 6172-1226 ...
