In [2]:
import pandas as pd

In [3]:
df = pd.read_csv(
    "Database export - Clothing distribuiton EMEA - 2019-2021.csv", encoding="latin1"
)
df.drop("Unnamed: 0", axis=1, inplace=True)

df.head()

Unnamed: 0,Company Name,ISO Country Code,Country,City,Address,NACE Rev. 2 Code principal,"NACE Principal Rev. 2, description",Sector,Main activity,Description in Original language,...,Website,Operating result [=EBIT]\nmEUR 2021,Operating result [=EBIT]\nmEUR 2020,Operating result [=EBIT]\nmEUR 2019,Turnover\nmEUR 2021,Turnover\nmEUR 2020,Turnover\nmEUR 2019,Availability of financial data,2 years of consecutive losses,Average loss over 3 years
0,ITX HELLAS SINGLE MEMBER S.A.,GR,Grèce,ATHENS,,4771,Commerce de détail d'habillement en magasin sp...,Engaged in the retail of fashion clothing,Retail,,...,www.zara.com\nwww.fashionwear1.gr,31,17,22,459,352,300,YES,NO,NO
1,RANDEVU,RU,Fédération de Russie,MOSCOW,"UL. SKAKOVAYA D. 17, STR. 2, KOM. 2",4772,Commerce de détail de chaussures et d'articles...,Engaged in the wholesale trade and marketing o...,Wholesale,?????????? ????? ? ???????????,...,www.rendez-vous.ru\nwww.rest-randevu.ru,15,14,1,306,230,337,YES,NO,NO
2,ZEEMAN TEXTIELSUPERS,FR,France,PARIS,,4771,Commerce de détail d'habillement en magasin sp...,Engaged in the retail sale of clothing and out...,Retail,"Négoce au détail ou en gros de vêtements, acce...",...,www.zeeman.com,4,3,4,164,137,147,YES,NO,NO
3,ANSWEAR.COM S.A.,PL,Pologne,KRAKOW,,4771,Commerce de détail d'habillement en magasin sp...,Digital platform for the sale of branded cloth...,Wholesale; Retail; Services,,...,www.answear.com,8,5,1,149,90,73,YES,NO,NO
4,REGENT GOLD,RU,Fédération de Russie,"AGALATOVSKOE SELSKOE POSELENIE, D. SKOTNOE","UL. SAVUSHKINA D. 126, LIT. A, POMESHCH. 133-N",4777,Commerce de détail d'articles d'horlogerie et ...,Operates as other miscellaneous store retailer,Retail,???????? ?????????? ?????????,...,www.zolotoy.ru,22,16,5,141,103,117,YES,NO,NO


In [4]:
# ✅ 1. Clean column names (remove newlines)
df.columns = [col.replace("\n", " ").strip() for col in df.columns]

In [5]:

# ✅ 2. Split multiple website entries
df["Additional Websites"] = df["Website"].apply(lambda x: "\n".join(x.split("\n")[1:]) if isinstance(x, str) and "\n" in x else None)
df["Website"] = df["Website"].apply(lambda x: x.split("\n")[0] if isinstance(x, str) else x)


In [8]:
df['Website']

0                            www.zara.com
1                      www.rendez-vous.ru
2                          www.zeeman.com
3                         www.answear.com
4                          www.zolotoy.ru
                      ...                
485                                   NaN
486                  www.cortinadecor.com
487                         www.tezuk.com
488    www.cralregionesardegna.it/?p=2693
489                   www.markt-kontor.de
Name: Website, Length: 490, dtype: object

In [9]:
df['Additional Websites']

0      www.fashionwear1.gr
1      www.rest-randevu.ru
2                     None
3                     None
4                     None
              ...         
485                   None
486                   None
487           www.zetek.it
488                   None
489                   None
Name: Additional Websites, Length: 490, dtype: object

In [17]:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

In [18]:
# ✅ Set up Selenium for JavaScript-rendered websites
options = Options()
options.add_argument("--headless")  # Run in background
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)


In [19]:
def get_metadata(url):
    if not isinstance(url, str) or len(url) < 5:
        return {"Meta Title": None, "Meta Description": None, "Meta Keywords": None}

    # Ensure the URL has a proper scheme
    if not url.startswith(("http://", "https://")):
        url = "https://" + url  # Default to HTTPS

    try:
        headers = {"User-Agent": "Mozilla/5.0"}
        response = requests.get(url, headers=headers, timeout=5)

        # If requests fail, use Selenium (handles JavaScript)
        if response.status_code != 200:
            driver.get(url)
            page_source = driver.page_source
            soup = BeautifulSoup(page_source, "html.parser")
        else:
            soup = BeautifulSoup(response.text, "html.parser")

        # Extract metadata
        title = soup.title.string.strip() if soup.title else None
        meta_desc = soup.find("meta", attrs={"name": "description"})
        meta_desc = meta_desc["content"].strip() if meta_desc else None
        meta_keywords = soup.find("meta", attrs={"name": "keywords"})
        meta_keywords = meta_keywords["content"].strip() if meta_keywords else None

        return {"Meta Title": title, "Meta Description": meta_desc, "Meta Keywords": meta_keywords}

    except Exception as e:
        return {"Meta Title": None, "Meta Description": None, "Meta Keywords": None}


In [23]:
# ✅ Apply function to extract metadata
df["Metadata"] = df["Website"].apply(get_metadata)

In [24]:
df["Metadata"]

0      {'Meta Title': '', 'Meta Description': None, '...
1      {'Meta Title': 'Интернет-магазин обуви и одежд...
2      {'Meta Title': 'Zeeman. Zo eenvoudig kan het z...
3      {'Meta Title': 'Answear.com sklep internetowy ...
4      {'Meta Title': 'Женский интернет-журнал "ЗОЛОТ...
                             ...                        
485    {'Meta Title': None, 'Meta Description': None,...
486    {'Meta Title': 'Cortinadecor.com | Estores y c...
487    {'Meta Title': 'Tezuk | Tezuk beachwear | Shop...
488    {'Meta Title': 'Cral Regione Sardegna - eventi...
489    {'Meta Title': 'Home', 'Meta Description': '',...
Name: Metadata, Length: 490, dtype: object

In [25]:
# ✅ Expand dictionary into separate columns
df = df.join(pd.DataFrame(df.pop("Metadata").tolist()))

In [29]:
df.to_csv("metadata1.csv", index=False)

In [31]:
import ace_tools as tools

ModuleNotFoundError: No module named 'ace_tools'

In [33]:
df[["Company Name", "Website", "Meta Title", "Meta Description"]].head(10)  # View first 10 rows


Unnamed: 0,Company Name,Website,Meta Title,Meta Description
0,ITX HELLAS SINGLE MEMBER S.A.,www.zara.com,,
1,RANDEVU,www.rendez-vous.ru,Интернет-магазин обуви и одежды в Москве,Интернет-магазин обуви и одежды в Москве. Дост...
2,ZEEMAN TEXTIELSUPERS,www.zeeman.com,Zeeman. Zo eenvoudig kan het zijn.,Goede kwaliteit voor de laagst mogelijke prijs...
3,ANSWEAR.COM S.A.,www.answear.com,"Answear.com sklep internetowy z modą damską, m...",Masz styl z Answear.com! Odkryj modne ubrania ...
4,REGENT GOLD,www.zolotoy.ru,"Женский интернет-журнал ""ЗОЛОТОЙ""",Женский журнал «ЗОЛОТОЙ» – твой верный советчи...
5,WOLW-POL SP. Z O.O.,www.dieselshop.pl,,
6,NILSON GROUP AB FILIAL NORGE,,,
7,SAKURA LLC,xn--23-6kca3cvbes.xn--p1ai,,
8,LIMITED LIABILITY COMPANY HUGO BOSS RUS,www.hugoboss.com/ru,404 - Page Not Found,
9,MNG-MANGO U.K. LIMITED,www.shop.mango.com,,


In [35]:
# ✅ Extract metadata for additional websites
df["Additional Metadata"] = df["Additional Websites"].apply(get_metadata)

In [36]:
df["Additional Metadata"]

0      {'Meta Title': 'IÎ¤Î§ ÎÎÎÎÎ£ MONOÎ Î¡ÎÎ£Î...
1      {'Meta Title': None, 'Meta Description': None,...
2      {'Meta Title': None, 'Meta Description': None,...
3      {'Meta Title': None, 'Meta Description': None,...
4      {'Meta Title': None, 'Meta Description': None,...
                             ...                        
485    {'Meta Title': None, 'Meta Description': None,...
486    {'Meta Title': None, 'Meta Description': None,...
487    {'Meta Title': 'HOME ZETEK', 'Meta Description...
488    {'Meta Title': None, 'Meta Description': None,...
489    {'Meta Title': None, 'Meta Description': None,...
Name: Additional Metadata, Length: 490, dtype: object

In [37]:
df = df.join(pd.DataFrame(df.pop("Additional Metadata").tolist()).rename(columns={
    "Meta Title": "Additional Meta Title",
    "Meta Description": "Additional Meta Description",
    "Meta Keywords": "Additional Meta Keywords"
}))


In [38]:
df[["Company Name", "Website", "Meta Title", "Meta Description", "Additional Websites", "Additional Meta Title", "Additional Meta Description"]].head(10)

Unnamed: 0,Company Name,Website,Meta Title,Meta Description,Additional Websites,Additional Meta Title,Additional Meta Description
0,ITX HELLAS SINGLE MEMBER S.A.,www.zara.com,,,www.fashionwear1.gr,IÎ¤Î§ ÎÎÎÎÎ£ MONOÎ Î¡ÎÎ£Î©Î Î Î.Î.,
1,RANDEVU,www.rendez-vous.ru,Интернет-магазин обуви и одежды в Москве,Интернет-магазин обуви и одежды в Москве. Дост...,www.rest-randevu.ru,,
2,ZEEMAN TEXTIELSUPERS,www.zeeman.com,Zeeman. Zo eenvoudig kan het zijn.,Goede kwaliteit voor de laagst mogelijke prijs...,,,
3,ANSWEAR.COM S.A.,www.answear.com,"Answear.com sklep internetowy z modą damską, m...",Masz styl z Answear.com! Odkryj modne ubrania ...,,,
4,REGENT GOLD,www.zolotoy.ru,"Женский интернет-журнал ""ЗОЛОТОЙ""",Женский журнал «ЗОЛОТОЙ» – твой верный советчи...,,,
5,WOLW-POL SP. Z O.O.,www.dieselshop.pl,,,,,
6,NILSON GROUP AB FILIAL NORGE,,,,,,
7,SAKURA LLC,xn--23-6kca3cvbes.xn--p1ai,,,ilovesakura.ru/spb,"Доставка роллов, суши и пиццы в Санкт-Петербур...","Сакура — это вкусно! Доставка суши, пиццы, кор..."
8,LIMITED LIABILITY COMPANY HUGO BOSS RUS,www.hugoboss.com/ru,404 - Page Not Found,,www.emg.ru,Агентство маркетинговых коммуникаций EMG,EMG - креативное агентство маркетинговых комму...
9,MNG-MANGO U.K. LIMITED,www.shop.mango.com,,,,,


In [39]:
df.to_csv("metadata2.csv", index=False)

# Translation

In [57]:
from deep_translator import GoogleTranslator

translator = GoogleTranslator(source="auto", target="en")

In [58]:
def translate_text(text):
    """Translate text to English, handling None values."""
    if pd.isna(text) or text == "None":
        return None  # Skip if no text available
    try:
        return translator.translate(text)
    except Exception as e:
        print(f"Translation error: {e}")
        return None

In [65]:
# ✅ Apply translation to `Meta Title` and `Meta Description`
df["Translated Meta Title"] = df["Meta Title"].apply(translate_text)
df["Translated Meta Description"] = df["Meta Description"].apply(translate_text)

In [67]:
df['Translated Additional Meta Title'] = df['Additional Meta Title'].apply(translate_text)
df['Translated Additional Meta Description'] = df['Additional Meta Description'].apply(translate_text)

In [68]:
df.to_csv("updated_metadata_with_translation.csv", index=False)

## Define the Missing Cases:
A company has no metadata available if
- The Website column is empty or NaN.
- The Meta Description and Additional Meta Description are both missing.

In [54]:
missing_info_df = df[
    (df["Website"].isna() | (df["Website"] == "")) &  # No website
    (df["Meta Description"].isna() | (df["Meta Description"] == "")) &  # No meta description
    (df["Additional Meta Description"].isna() | (df["Additional Meta Description"] == ""))  # No additional meta
]

In [55]:
missing_info_df

Unnamed: 0,Company Name,ISO Country Code,Country,City,Address,NACE Rev. 2 Code principal,"NACE Principal Rev. 2, description",Sector,Main activity,Description in Original language,...,Availability of financial data,2 years of consecutive losses,Average loss over 3 years,Additional Websites,Meta Title,Meta Description,Meta Keywords,Additional Meta Title,Additional Meta Description,Additional Meta Keywords
6,NILSON GROUP AB FILIAL NORGE,NO,Norvège,OSLO,,4772,Commerce de détail de chaussures et d'articles...,Engaged in the operation of a chain of shoe st...,Retail,SKODETALJHANDEL MED DERTIL NATURLIG HØRENDE VI...,...,YES,YES,,,,,,,,
16,ETERNA MODE GMBH ZWEIGNIEDERLASSUNG OESTERREICH,AT,Autriche,PARNDORF,,4771,Commerce de détail d'habillement en magasin sp...,,,,...,YES,NO,NO,,,,,,,
46,DIANITA STRUMICA DOOEL,MK,Macédoine du Nord,STRUMICA,,4771,Commerce de détail d'habillement en magasin sp...,Engaged in the operation and management of clo...,Retail,,...,YES,NO,NO,,,,,,,
56,ZOLOTOI KVADRAT,RU,Fédération de Russie,MOSCOW,,4771,Commerce de détail d'habillement en magasin sp...,Engaged in the operation and management of clo...,Retail,,...,YES,NO,NO,,,,,,,
60,DIESEL DEUTSCHLAND GMBH ZWEIGNIEDERLASSUNG OES...,AT,Autriche,WIEN,,4771,Commerce de détail d'habillement en magasin sp...,Engaged in the provision of management service...,Services,,...,YES,NO,NO,,,,,,,
71,GOLD MOVING COMPANY S.R.L.,IT,Italie,ROMA,VIA SILICELLA 47,4777,Commerce de détail d'articles d'horlogerie et ...,Engaged in the wholesale distribution of jewelry,Wholesale,COMMERCIO ALL'INGROSSO DI OROLOGI E DI GIOIELL...,...,YES,NO,NO,,,,,,,
89,N PEAL (RETAIL) LIMITED,IE,Irlande,KILDARE,,4771,Commerce de détail d'habillement en magasin sp...,,,,...,YES,NO,NO,,,,,,,
90,DESPI ASTIGI S.L.,ES,Espagne,CORNELLA DE LLOBREGAT,,4777,Commerce de détail d'articles d'horlogerie et ...,Engaged in the operation of a jewelry store,Retail,Explotación joyeria,...,YES,NO,NO,,,,,,,
111,"NIVEL PRECIOSO, LDA",PT,Portugal,VIZELA,,4777,Commerce de détail d'articles d'horlogerie et ...,Engaged in the retail trade of a wide range of...,Retail,Comércio a retalho de artigos de ourivesaria. ...,...,YES,NO,NO,,,,,,,
112,BELYETAZH,RU,Fédération de Russie,PERM,,4771,Commerce de détail d'habillement en magasin sp...,,,,...,YES,YES,,,,,,,,


In [56]:
missing_info_df[["Company Name", "Website", "Meta Title", "Meta Description", "Additional Websites", "Additional Meta Description"]].head(20)

Unnamed: 0,Company Name,Website,Meta Title,Meta Description,Additional Websites,Additional Meta Description
6,NILSON GROUP AB FILIAL NORGE,,,,,
16,ETERNA MODE GMBH ZWEIGNIEDERLASSUNG OESTERREICH,,,,,
46,DIANITA STRUMICA DOOEL,,,,,
56,ZOLOTOI KVADRAT,,,,,
60,DIESEL DEUTSCHLAND GMBH ZWEIGNIEDERLASSUNG OES...,,,,,
71,GOLD MOVING COMPANY S.R.L.,,,,,
89,N PEAL (RETAIL) LIMITED,,,,,
90,DESPI ASTIGI S.L.,,,,,
111,"NIVEL PRECIOSO, LDA",,,,,
112,BELYETAZH,,,,,


In [None]:
# Tran