In [30]:
# Data Scraping
import requests
from bs4 import BeautifulSoup
import pandas as pd

def scrape_wikipedia(language_code):
    url = f"https://{language_code}.wikipedia.org/wiki/Main_Page"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36"
    }
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")

        articles = []
        for heading in soup.find_all("span", class_="mw-headline"):
            articles.append(heading.text.strip())

        return articles[:10]

    except requests.HTTPError as errh:
        print(f"HTTP Error: {errh}")
    except requests.ConnectionError as errc:
        print(f"Error Connecting: {errc}")
    except requests.Timeout as errt:
        print(f"Timeout Error: {errt}")
    except requests.RequestException as err:
        print(f"Other Error: {err}")
    return []

languages = {
    "en": "English",
    "es": "Spanish",
    "fr": "French",
    "de": "German",
    "it": "Italian",
    "nl": "Dutch",
    "pt": "Portuguese",
    "ru": "Russian",
    "ja": "Japanese",
    "zh": "Chinese",
    "ar": "Arabic",
    "hi": "Hindi",
    "ko": "Korean",
    "tr": "Turkish",
    "pl": "Polish",
    "sv": "Swedish",
    "uk": "Ukrainian",
    "cs": "Czech",
    "el": "Greek",
    "he": "Hebrew",
    "th": "Thai",
    "da": "Danish",
    "fi": "Finnish",
    "id": "Indonesian",
    "no": "Norwegian",
    "hu": "Hungarian",
    "ro": "Romanian",
    "sk": "Slovak",
    "vi": "Vietnamese",
    "bn": "Bengali",
    "sr": "Serbian",
    "hr": "Croatian",
    "ms": "Malay",
    "lt": "Lithuanian",
    "sl": "Slovenian",
    "et": "Estonian",
    "gl": "Galician",
    "ta": "Tamil",
    "ca": "Catalan",
    "ur": "Urdu",
    "eu": "Basque",
    "be": "Belarusian",
    "is": "Icelandic",
    "af": "Afrikaans",
    "sw": "Swahili",
    "am": "Amharic",
    "fa": "Persian",
    "ne": "Nepali",
    "so": "Somali",
    "ka": "Georgian",
    "la": "Latin",
    "km": "Khmer",
    "mk": "Macedonian",
    "mn": "Mongolian",
    "fy": "Western Frisian",
    "gu": "Gujarati",
    "pa": "Punjabi",
    "jv": "Javanese",
    "ceb": "Cebuano",
    "ha": "Hausa",
    "yo": "Yoruba",
    "su": "Sundanese",
    "si": "Sinhala",
    "sn": "Shona",
    "hmn": "Hmong",
    "uz": "Uzbek",
    "rw": "Kinyarwanda",
    "glg": "Galician",
    "kk": "Kazakh",
    "ku": "Kurdish",
    "tt": "Tatar",
    "ast": "Asturian",
    "qu": "Quechua",
    "tl": "Tagalog",
    "tg": "Tajik",
    "my": "Burmese",
    "or": "Odia",
    "ky": "Kyrgyz",
    "am": "Amharic",

}


data = []
for lang_code, lang_name in languages.items():
    wiki_articles = scrape_wikipedia(lang_code)
    if wiki_articles:
        for article in wiki_articles:
            data.append({"Language": lang_name, "Text": article})

df = pd.DataFrame(data)
df.to_csv("multilingual_wikipedia_dataset.csv", index=False)


Error Connecting: HTTPSConnectionPool(host='hmn.wikipedia.org', port=443): Max retries exceeded with url: /wiki/Main_Page (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x7d23c23a2410>: Failed to resolve 'hmn.wikipedia.org' ([Errno -2] Name or service not known)"))
Error Connecting: HTTPSConnectionPool(host='glg.wikipedia.org', port=443): Max retries exceeded with url: /wiki/Main_Page (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x7d23c3e48f40>: Failed to resolve 'glg.wikipedia.org' ([Errno -2] Name or service not known)"))


In [37]:
#Data Wrangling


import pandas as pd
import pandas as pd
import re
from unicodedata import normalize



aggregated_df = df.groupby('Language')['Text'].agg(lambda x: ' '.join(x)).reset_index()



aggregated_df['Text'] = aggregated_df['Text'].str.lower()


aggregated_df['Text'] = aggregated_df['Text'].apply(lambda x: re.findall(r'\b\w+\b', x))


aggregated_df['Text'] = aggregated_df['Text'].apply(lambda x: [word for word in x if word.isalnum()])


aggregated_df['Text'] = aggregated_df['Text'].apply(lambda x: [normalize('NFKD', word).encode('ASCII', 'ignore').decode('utf-8') for word in x])


aggregated_df['Text'] = aggregated_df['Text'].apply(lambda x: [word for word in x if word.isalnum()])

print(aggregated_df)

       Language                                               Text
0      Asturian  [ciencies, naturales, ciencies, humanes, y, ci...
1        Basque  [esklabotza, esklabotza, txillardegi, donostia...
2    Belarusian                                                 []
3       Burmese                                                 []
4       Catalan  [henri, matisse, imatge, del, dia, projecte, d...
5       Cebuano                               [piniling, artikulo]
6       Chinese                                                 []
7      Croatian  [ostali, projekti, neprofitne, zaklade, wikime...
8        Danish  [de, nyeste, fremragende, artikler, de, nyeste...
9         Dutch  [uitgelicht, etalage, over, wikipedia, wist, j...
10      English  [welcome, to, wikipedia, from, today, s, featu...
11      Finnish  [suositeltu, artikkeli, miten, voit, auttaa, a...
12       French  [wikipedia, article, labellise, du, jour, actu...
13     Galician  [artigo, destacado, sobre, galicia, tal, dia,

In [39]:
# Data Embedding
!pip install gensim




In [41]:
from gensim.models import Word2Vec
import pandas as pd
import numpy as np


word2vec_model = Word2Vec(aggregated_df['Text'], vector_size=100, window=5, min_count=1, workers=4)

def average_word_embedding(sentence):
    embeddings = [word2vec_model.wv[word] for word in sentence if word in word2vec_model.wv]
    if embeddings:
        return np.mean(embeddings, axis=0)
    else:
        return np.zeros(word2vec_model.vector_size)

aggregated_df['Word_Embedding'] = aggregated_df['Text'].apply(lambda x: average_word_embedding(x))

df.to_csv("dataset.csv", index=False)


In [42]:
print(aggregated_df)

       Language                                               Text  \
0      Asturian  [ciencies, naturales, ciencies, humanes, y, ci...   
1        Basque  [esklabotza, esklabotza, txillardegi, donostia...   
2    Belarusian                                                 []   
3       Burmese                                                 []   
4       Catalan  [henri, matisse, imatge, del, dia, projecte, d...   
5       Cebuano                               [piniling, artikulo]   
6       Chinese                                                 []   
7      Croatian  [ostali, projekti, neprofitne, zaklade, wikime...   
8        Danish  [de, nyeste, fremragende, artikler, de, nyeste...   
9         Dutch  [uitgelicht, etalage, over, wikipedia, wist, j...   
10      English  [welcome, to, wikipedia, from, today, s, featu...   
11      Finnish  [suositeltu, artikkeli, miten, voit, auttaa, a...   
12       French  [wikipedia, article, labellise, du, jour, actu...   
13     Galician  [ar