In [99]:
# Data Scraping
import requests
from bs4 import BeautifulSoup
import pandas as pd

def scrape_wikipedia(language_code):
    url = f"https://{language_code}.wikipedia.org/wiki/Main_Page"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36"
    }
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")

        articles = []
        for heading in soup.find_all("p", class_=""):
            articles.append(heading.text.strip())

        # response = requests.get(url, headers=headers)
        # response.raise_for_status()
        # soup = BeautifulSoup(response.content, "html.parser")

        # for heading in soup.find_all("span", class_="mw-headline"):
        #     articles.append(heading.text.strip())

        # response = requests.get(url, headers=headers)
        # response.raise_for_status()
        # soup = BeautifulSoup(response.content, "html.parser")

        # for heading in soup.find_all("span", class_="mw-headline"):
        #     articles.append(heading.text.strip())


        return articles

    except requests.HTTPError as errh:
        print(f"HTTP Error: {errh}")
    except requests.ConnectionError as errc:
        print(f"Error Connecting: {errc}")
    except requests.Timeout as errt:
        print(f"Timeout Error: {errt}")
    except requests.RequestException as err:
        print(f"Other Error: {err}")
    return []



languages = {
    "tt": "Tatar",
    "en": "English",
    "it": "Italian",
    "ca": "Catalan",
    "pl": "Polish",
    "ar": "Arabic",
    "su": "Sundanese",
    "he": "Hebrew",
    "yo": "Yoruba",
    "pa": "Punjabi",
}



data = []
for lang_code, lang_name in languages.items():
    wiki_articles = scrape_wikipedia(lang_code)
    if wiki_articles:
        for article in wiki_articles:
            data.append({"Language": lang_name, "Text": article})

# for lang_code, lang_name in languages.items():
#     wiki_articles = scrape_wikipedia(lang_code)
#     if wiki_articles:
#         for article in wiki_articles:
#             data.append({"Language": lang_name, "Text": article})


df = pd.DataFrame(data)
df.dropna(inplace=True)
df.to_csv("multilingual_wikipedia_dataset.csv", index=False)


In [100]:
#Data Wrangling


import pandas as pd
import pandas as pd
import re
from unicodedata import normalize



# aggregated_df = df.groupby('Language')['Text'].agg(lambda x: ' '.join(x)).reset_index()
aggregated_df=pd.read_csv("multilingual_wikipedia_dataset.csv")
aggregated_df.dropna(inplace=True)

aggregated_df['Text'] = aggregated_df['Text'].str.lower()


aggregated_df['Text'] = aggregated_df['Text'].apply(lambda x: re.findall(r'\b\w+\b', x))


aggregated_df['Text'] = aggregated_df['Text'].apply(lambda x: [word for word in x if word.isalnum()])


# aggregated_df['Text'] = aggregated_df['Text'].apply(lambda x: [normalize('NFKD', word).encode('ASCII', 'ignore').decode('utf-8') for word in x])



print(aggregated_df)

    Language                                               Text
0      Tatar  [ирекле, эчтәлекле, энциклопедияне, һәркем, яз...
1      Tatar  [тулы, исемлек, эчтәлек, порталлар, latin, iml...
6      Tatar  [аргентинада, велосипедчылар, кубада, төзүчелә...
9      Tatar  [коену, рус, совет, һәм, америка, рәссамы, ник...
10     Tatar  [николай, фешин, картинасы, җәйге, эсселек, ва...
..       ...                                                ...
335  Punjabi  [ਕ, ਗ, ਲ, ਕਤ, ਤਰ, ਗਣਰ, ਜ, ਦ, ਕ, ਮ, ਪ, ਰਕ, ਵ, ਖ...
336  Punjabi  [ਉਪ, ਸ, ਰ, ਣ, ਆ, ਦ, ਖਣ, ਲਈ, ਕ, ਰਪ, ਕਰਕ, ਤ, ਰ, ...
347  Punjabi  [ਇਹ, ਵ, ਕ, ਪ, ਡ, ਆ, ਪ, ਜ, ਬ, ਵ, ਚ, ਲ, ਖ, ਆ, ਗ,...
348  Punjabi  [स, स, क, त, प, ल, भ, जप, र, मर, ठ, ಕನ, ನಡ, தம...
350  Punjabi  [ਵ, ਕ, ਪ, ਡ, ਆ, ਵ, ਲ, ਟ, ਅਰ, ਸ, ਪ, ਦਕ, ਦ, ਆਰ, ...

[255 rows x 2 columns]


In [82]:
# Data Embedding
#
row_counts = df['Language'].value_counts()
print(row_counts.head(15))

Tatar        79
Italian      55
Catalan      33
Asturian     33
Polish       33
Arabic       32
Sundanese    30
Hebrew       28
Yoruba       27
Slovak       27
Punjabi      27
Name: Language, dtype: int64


In [49]:
df.Language.unique()

array(['Tatar', 'Italian', 'Catalan', 'Asturian', 'Polish', 'Arabic',
       'Sundanese', 'Afrikaans', 'Hebrew', 'Yoruba', 'Slovak', 'Punjabi',
       'Galician', 'Somali', 'Basque'], dtype=object)

In [101]:
from gensim.models import Word2Vec
import pandas as pd
import numpy as np


word2vec_model = Word2Vec(aggregated_df['Text'], vector_size=100, window=5, min_count=1, workers=4)

def average_word_embedding(sentence):
    embeddings = [word2vec_model.wv[word] for word in sentence if word in word2vec_model.wv]
    if embeddings:
        return np.mean(embeddings, axis=0)
    else:
        return np.zeros(word2vec_model.vector_size)

aggregated_df['Word_Embedding'] = aggregated_df['Text'].apply(lambda x: average_word_embedding(x))

df.to_csv("dataset.csv", index=False)


In [77]:
print(aggregated_df)

    Language                                               Text  \
0      Tatar  [ирекле, эчтәлекле, энциклопедияне, һәркем, яз...   
1      Tatar  [тулы, исемлек, эчтәлек, порталлар, latin, iml...   
6      Tatar  [аргентинада, велосипедчылар, кубада, төзүчелә...   
9      Tatar  [коену, рус, совет, һәм, америка, рәссамы, ник...   
10     Tatar  [николай, фешин, картинасы, җәйге, эсселек, ва...   
..       ...                                                ...   
449   Basque  [iparragirreren, gernikako, arbola, euskal, er...   
451   Basque  [eaeko, auzitegi, nagusiak, udaletan, euskara,...   
453   Basque  [urriaren, 7an, gaza, israel, gatazka, hasi, z...   
454   Basque  [alan, griffin, denny, laine, concha, velasco,...   
455   Basque  [alan, griffin, denny, laine, concha, velasco,...   

                                        Word_Embedding  
0    [0.00056917703, 0.00034485586, 0.00019627949, ...  
1    [0.0030841334, 0.0003638751, 0.0011694995, -0....  
6    [-0.0022239294, -0.

In [102]:
import numpy as np
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, GlobalMaxPooling1D, Dense, Bidirectional, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# Assuming 'aggregated_df' is your dataset DataFrame
# Assuming 'text' contains the text data for each sample
# Assuming 'Language' is the target variable

# Tokenize the text and pad sequences
max_length = 100
tokenizer = Tokenizer()
tokenizer.fit_on_texts(aggregated_df['Text'])
vocab_size = len(tokenizer.word_index) + 1
X = pad_sequences(tokenizer.texts_to_sequences(aggregated_df['Text']), maxlen=max_length, padding='post')
# X = np.vstack(aggregated_df['Word_Embedding'])  # Assuming 'Word_Embedding' contains numpy arrays
# Encode the target variable
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(aggregated_df['Language'])

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Define and compile the model
embedding_dim = 100
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length, trainable=True),
    Bidirectional(LSTM(128, return_sequences=True)),
    GlobalMaxPooling1D(),
    Dense(64, activation='relu'),
    Dropout(0.5),

    Dense(len(label_encoder.classes_), activation='softmax')
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=30, batch_size=32, validation_data=(X_test, y_test))

# Evaluate the model on test data
test_loss, test_accuracy = model.evaluate(X_test, y_test)




print("Test Accuracy:", test_accuracy)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Test Accuracy: 0.7692307829856873


In [104]:
print(aggregated_df)

    Language                                               Text  \
0      Tatar  [ирекле, эчтәлекле, энциклопедияне, һәркем, яз...   
1      Tatar  [тулы, исемлек, эчтәлек, порталлар, latin, iml...   
6      Tatar  [аргентинада, велосипедчылар, кубада, төзүчелә...   
9      Tatar  [коену, рус, совет, һәм, америка, рәссамы, ник...   
10     Tatar  [николай, фешин, картинасы, җәйге, эсселек, ва...   
..       ...                                                ...   
335  Punjabi  [ਕ, ਗ, ਲ, ਕਤ, ਤਰ, ਗਣਰ, ਜ, ਦ, ਕ, ਮ, ਪ, ਰਕ, ਵ, ਖ...   
336  Punjabi  [ਉਪ, ਸ, ਰ, ਣ, ਆ, ਦ, ਖਣ, ਲਈ, ਕ, ਰਪ, ਕਰਕ, ਤ, ਰ, ...   
347  Punjabi  [ਇਹ, ਵ, ਕ, ਪ, ਡ, ਆ, ਪ, ਜ, ਬ, ਵ, ਚ, ਲ, ਖ, ਆ, ਗ,...   
348  Punjabi  [स, स, क, त, प, ल, भ, जप, र, मर, ठ, ಕನ, ನಡ, தம...   
350  Punjabi  [ਵ, ਕ, ਪ, ਡ, ਆ, ਵ, ਲ, ਟ, ਅਰ, ਸ, ਪ, ਦਕ, ਦ, ਆਰ, ...   

                                        Word_Embedding  
0    [-0.0016337071, -0.00082817115, -0.001665919, ...  
1    [-0.0016191887, -0.00067272974, 0.0011436198, ...  
6    [0.0001273412, 0.00

In [105]:
#Generate predictions on test data
from sklearn.metrics import classification_report

y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)  # Get the predicted classes

# Convert encoded labels back to original labelsma
y_test_original = label_encoder.inverse_transform(y_test)
y_pred_original = label_encoder.inverse_transform(y_pred_classes)

# Generate classification report
print(classification_report(y_test_original, y_pred_original))


              precision    recall  f1-score   support

      Arabic       1.00      1.00      1.00         2
     Catalan       1.00      1.00      1.00         1
     English       0.00      0.00      0.00         0
      Hebrew       0.60      1.00      0.75         3
     Italian       1.00      1.00      1.00         2
      Polish       1.00      0.33      0.50         3
   Sundanese       0.57      1.00      0.73         4
       Tatar       1.00      0.43      0.60         7
      Yoruba       1.00      1.00      1.00         4

    accuracy                           0.77        26
   macro avg       0.80      0.75      0.73        26
weighted avg       0.89      0.77      0.76        26



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [107]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Assuming 'text_to_predict' is the text you want to predict the language for
text_to_predict = "улы исемлек эчтәлек порталлар latin"

# Preprocess the text
sequence = tokenizer.texts_to_sequences([text_to_predict])
padded_sequence = pad_sequences(sequence, maxlen=max_length, padding='post')

# Make predictions
predictions = model.predict(padded_sequence)
predicted_language_index = np.argmax(predictions)
predicted_language = label_encoder.classes_[predicted_language_index]

# Print the predicted language
print("Predicted Language:", predicted_language)


Predicted Language: Tatar
