### Imports

In [None]:
# ML imports
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.manifold import MDS
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, f1_score, classification_report, roc_auc_score, RocCurveDisplay
from sklearn.model_selection import train_test_split, RandomizedSearchCV

# Data analysis and stats imports
import numpy as np
import pandas as pd
from scipy.stats import expon, reciprocal
from scipy.spatial.distance import pdist, squareform

# Data visualization imports
import seaborn as sns
import matplotlib.pyplot as plt

from common_language import _LANGUAGES
import processing as prlib

### Get data

In [None]:
# Load the preprocessed frames returned by the project helper.
# Only `full_df` is used in this cell; the other three are kept for later cells.
full_df, train_df, test_df, validation_df = prlib.get_preprocessed_data()

# Feature-only view: every column except the last one.
# NOTE(review): assumes the last column of full_df is 'label' — confirm in processing.py.
n_feature_columns = full_df.shape[1] - 1
df_without_label = full_df.iloc[:, :n_feature_columns]
df_without_label

Relabeling

In [None]:
# Relabel by language family / region so that similar languages share one class.
# The keys are the raw values found in full_df['label']; the values are the
# coarse group names. The mapping is only defined here; the step that applies
# it is left commented out at the bottom of the cell.
print(full_df['label'].unique())
group_labels = {
    'Arabic': 'Languages of the Caucasus and Middle East',
    'Basque': 'Other European Languages',
    # Breton is a Celtic language, so it is grouped with Welsh rather than
    # with the constructed languages.
    'Breton': 'Other European Languages',
    # Catalan is a Romance language.
    'Catalan': 'Romance European Languages',
    'Chinese_China': 'East Asian Languages',
    'Chinese_Hongkong': 'East Asian Languages',
    'Chinese_Taiwan': 'East Asian Languages',
    'Chuvash': 'Turkic Languages',
    'Czech': 'Slavic Languages',
    'Dhivehi': 'Diverse Asian and Pacific Languages',
    'Dutch': 'Germanic Languages',
    'English': 'Germanic Languages',
    'Esperanto': 'Constructed and Isolate Languages',
    'Estonian': 'Other European Languages',
    'French': 'Romance European Languages',
    # Frisian is a West Germanic language (it was filed under the African group).
    'Frisian': 'Germanic Languages',
    'Georgian': 'Languages of the Caucasus and Middle East',
    'German': 'Germanic Languages',
    # Greek is Hellenic, not Romance.
    'Greek': 'Other European Languages',
    'Hakha_Chin': 'Diverse Asian and Pacific Languages',
    'Indonesian': 'Diverse Asian and Pacific Languages',
    'Interlingua': 'Constructed and Isolate Languages',
    'Italian': 'Romance European Languages',
    'Japanese': 'East Asian Languages',
    'Kabyle': 'African and Other Languages',
    'Kinyarwanda': 'African and Other Languages',
    'Kyrgyz': 'Turkic Languages',
    'Latvian': 'Other European Languages',
    'Maltese': 'Languages of the Caucasus and Middle East',
    'Mongolian': 'Diverse Asian and Pacific Languages',
    'Persian': 'Languages of the Caucasus and Middle East',
    'Polish': 'Slavic Languages',
    'Portuguese': 'Romance European Languages',
    'Romanian': 'Romance European Languages',
    # Romansh is a Romance language, neither constructed nor an isolate.
    'Romansh_Sursilvan': 'Romance European Languages',
    'Russian': 'Slavic Languages',
    'Sakha': 'Turkic Languages',
    'Slovenian': 'Slavic Languages',
    'Spanish': 'Romance European Languages',
    'Swedish': 'Germanic Languages',
    # NOTE(review): Tamil is a Dravidian (South Asian) language; it is left in
    # the catch-all group as in the original — reconsider if groups are used.
    'Tamil': 'African and Other Languages',
    'Tatar': 'Turkic Languages',
    'Turkish': 'Turkic Languages',
    # NOTE(review): the key is spelled 'Ukranian' — presumably to match the
    # dataset's own label spelling; verify against the printed unique labels.
    'Ukranian': 'Slavic Languages',
    'Welsh': 'Other European Languages',
}

# Apply the mapping to the DataFrame (currently disabled: the models below are
# trained on the individual languages, not on the groups).
#full_df['label'] = full_df['label'].map(group_labels)
#print(full_df['label'].unique())
#full_df.groupby('label').count()

Check multicollinearity

In [None]:
# Pairwise correlation between all feature columns.
corr = df_without_label.corr()

# The correlation matrix is symmetric, so the upper triangle (diagonal
# included) is masked out and only the lower triangle is drawn.
upper_triangle = np.triu(np.ones_like(corr, dtype=bool))

fig, ax = plt.subplots(figsize=(11, 9))
sns.heatmap(
    corr,
    mask=upper_triangle,
    cmap=sns.diverging_palette(230, 20, as_cmap=True),
    vmax=.3,  # colour scale saturates at 0.3; stronger correlations share the top colour
    center=0,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
    ax=ax,
)

plt.show()

PCA or MDS

In [None]:
# Stratified hold-out split: 33 % of the rows go to the test set and the class
# proportions of 'label' are preserved in both parts.
features = full_df.iloc[:, 0:-1]
target = full_df['label']
X_train, X_test, Y_train, Y_test = train_test_split(
    features,
    target,
    test_size=0.33,
    stratify=target,
    random_state=42,
)

# Alternative: reuse the predefined train/test frames instead of a random split.
#X_train, Y_train = df_without_label.iloc[:len(train_df), :], full_df['label'].iloc[:len(train_df)]
#X_test, Y_test = df_without_label.iloc[len(train_df):, :], full_df['label'].iloc[len(train_df):]

# PCA: get_PCs returns the transformed training data together with a callable
# that is then applied to the test split.
# NOTE(review): 95 is presumably the percentage of variance to retain, and the
# projection is presumably fitted on the training split only — confirm in processing.py.
X_train, embedding = prlib.get_PCs(X_train, 95)
X_test = embedding(X_test)

# Alternative: MDS with Mahalanobis distance.
#df_mds = mds_mahalanobis(X_train, 70)
#df_mds['label'] = df['label']
X_train, X_test

## Models

### Training

In [None]:
# Model initialization
# probability=True makes svm.predict_proba() available for the ROC/AUC metrics.
svm = SVC(verbose=3, kernel='rbf', random_state=42, probability=True)
# The forest is seeded like every other stochastic step in this notebook
# (split, SVC, searches all use 42) so that a re-run reproduces the same model.
random_forest = RandomForestClassifier(verbose=3, random_state=42)

# Model fitting
print("Training SVM")
svm.fit(X_train, Y_train)
print("Training RFC")
random_forest.fit(X_train, Y_train)


### Predictions

In [None]:
# Models prediction
#
# Integer-encode the class names in the estimators' own class order, so that
# code k always refers to column k of predict_proba(). Building the mapping
# from full_df['label'].unique() would follow the order of appearance in the
# frame, which is not guaranteed to match the sorted `classes_` order.
labels = svm.classes_
assert np.array_equal(labels, random_forest.classes_), "models disagree on class order"
mapper = {label: code for code, label in enumerate(labels)}

# Hard predictions, encoded as integers. `.map` is used instead of
# `.replace(dict)` because it yields a proper integer Series directly.
svm_predictions = pd.Series(svm.predict(X_test)).map(mapper)
random_forest_predictions = pd.Series(random_forest.predict(X_test)).map(mapper)

# Class-membership probabilities, one column per entry of `classes_`.
# predict_proba already returns normalised probabilities, so no softmax is
# applied. It is called once per model and aliased, because it is expensive
# for an SVC.
p_svm = np.asarray(svm.predict_proba(X_test))
prob_svm = p_svm
p_rfc = np.asarray(random_forest.predict_proba(X_test))
prob_rfc = p_rfc  # the metrics cell refers to this name

### Metrics

In [None]:
# Encode the true test labels with the same mapping used for the predictions.
# The guard makes the cell safe to re-run: once Y_test holds integers it is
# left untouched instead of being mapped a second time.
Y_test = pd.Series(Y_test)
if not pd.api.types.is_integer_dtype(Y_test):
    Y_test = Y_test.map(mapper)

# One-hot ground truth aligned with the predict_proba columns: column j is 1
# where the true class is svm.classes_[j]. Going through `mapper` keeps this
# correct whatever order the integer codes were assigned in.
column_codes = np.array([mapper[name] for name in svm.classes_])
y_onehot = (Y_test.to_numpy()[:, None] == column_codes[None, :]).astype(int)

# Accuracy: true codes vs. predicted codes.
svm_accuracy = accuracy_score(Y_test, svm_predictions)
random_forest_accuracy = accuracy_score(Y_test, random_forest_predictions)

# One-vs-rest ROC AUC, macro-averaged over the classes. The ground truth is
# the one-hot matrix of the *true* labels; scoring the probabilities against
# the model's own predictions would not measure anything.
print("SVM AUC_ROC = ", roc_auc_score(y_onehot, prob_svm, average="macro"))
print("RFC ROC_AU = ", roc_auc_score(y_onehot, p_rfc, average="macro"))

# Micro-averaged one-vs-rest ROC curves: every (sample, class) pair is treated
# as one binary decision, which is the form RocCurveDisplay accepts for a
# multiclass problem. The handle is not called `display`, so IPython's
# display() stays usable.
fig, ax = plt.subplots(figsize=(8, 6))
RocCurveDisplay.from_predictions(
    y_onehot.ravel(),
    prob_svm.ravel(),
    name="SVM micro-average OvR",
    color="darkorange",
    ax=ax,
)
roc_display = RocCurveDisplay.from_predictions(
    y_onehot.ravel(),
    p_rfc.ravel(),
    name="RFC micro-average OvR",
    color="navy",
    plot_chance_level=True,
    ax=ax,
)
_ = roc_display.ax_.set(
    xlabel="False Positive Rate",
    ylabel="True Positive Rate",
    title="Micro-averaged One-vs-Rest\nReceiver Operating Characteristic",
)
plt.show()

print(f'SVM accuracy: {svm_accuracy}')
print(f'RDF accuracy: {random_forest_accuracy}')

RandomizedSearch SVM

In [None]:
# Distributions / choices sampled by the randomized search.
param_grid_svm = {
    'C': reciprocal(0.001, 1000),   # log-uniform between 1e-3 and 1e3
    'gamma': expon(scale=1.0),      # has no effect when the linear kernel is drawn
    'kernel': ['linear', 'rbf', 'poly']
}

svm_clf = SVC(random_state=42)
# scoring: the plain 'f1' scorer is binary-only and fails on a target with
# more than two classes (every candidate would score NaN). The label column
# holds many languages, so the macro-averaged F1 is used instead.
random_search_svm = RandomizedSearchCV(
    svm_clf,
    param_distributions=param_grid_svm,
    n_iter=100,
    verbose=2,
    cv=5,
    random_state=42,
    n_jobs=-1,
    scoring='f1_macro',
)
random_search_svm.fit(X_train, Y_train)
print("Best parameters for SVM:", random_search_svm.best_params_)
print("Best score:", random_search_svm.best_score_)

RandomizedSearch Random Forest Classifier

In [None]:
# Discrete candidate values sampled by the randomized search
# (3 * 4 * 3 * 3 * 3 = 324 combinations, of which n_iter=100 are tried).
param_grid_rf = {
    'n_estimators': [80, 100, 200],
    'max_depth': [3, 4, 5, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None]
}

rfc = RandomForestClassifier(random_state=42)

# scoring: the plain 'f1' scorer is binary-only and fails on a target with
# more than two classes (every candidate would score NaN). The label column
# holds many languages, so the macro-averaged F1 is used instead.
random_search_rf = RandomizedSearchCV(
    rfc,
    param_distributions=param_grid_rf,
    n_iter=100,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1,
    scoring='f1_macro',
)
random_search_rf.fit(X_train, Y_train)
print("Best parameters:", random_search_rf.best_params_)
print("Best score:", random_search_rf.best_score_)