### Imports

In [None]:
# ML imports
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV

# Data analysis and stats imports
import numpy as np
from scipy.stats import expon, reciprocal

# Data visualization imports
import seaborn as sns
import matplotlib.pyplot as plt

from common_language import _LANGUAGES
import processing as prlib
import train_models as trainer

### Get data

In [None]:
# Load the preprocessed dataset; the loader returns the full frame plus
# its train / test / validation splits, in that order.
(full_df,
 train_df,
 test_df,
 validation_df) = prlib.get_preprocessed_data()

# Keep every column except the last one.
# NOTE(review): presumably the last column is 'label' - confirm against
# processing.get_preprocessed_data().
df_without_label = full_df.iloc[:, :-1]

# Bare last expression so the notebook renders the frame.
df_without_label

Relabeling

In [None]:
# Relabel by language family/region so that similar languages share one class.
# This cell only defines the mapping (original label -> group name); it does
# not apply it to any dataframe.
#
# Corrections relative to the previous version of this mapping:
#   - Frisian is a West Germanic language      -> 'Germanic Languages'
#   - Romansh is a Romance language            -> 'Romance European Languages'
#   - Breton is Celtic, like Welsh             -> 'Other European Languages'
#   - Greek is Hellenic, not Romance           -> 'Other European Languages'
#   - Basque is a language isolate             -> 'Constructed and Isolate Languages'
#   - Tamil is a South Asian (Dravidian) lang. -> 'Diverse Asian and Pacific Languages'
print(full_df['label'].unique())
group_labels = {
    'Arabic': 'Languages of the Caucasus and Middle East',
    'Basque': 'Constructed and Isolate Languages',
    'Breton': 'Other European Languages',
    'Catalan': 'Other European Languages',
    'Chinese_China': 'East Asian Languages',
    'Chinese_Hongkong': 'East Asian Languages',
    'Chinese_Taiwan': 'East Asian Languages',
    'Chuvash': 'Turkic Languages',
    'Czech': 'Slavic Languages',
    'Dhivehi': 'Diverse Asian and Pacific Languages',
    'Dutch': 'Germanic Languages',
    'English': 'Germanic Languages',
    'Esperanto': 'Constructed and Isolate Languages',
    'Estonian': 'Other European Languages',
    'French': 'Romance European Languages',
    'Frisian': 'Germanic Languages',
    'Georgian': 'Languages of the Caucasus and Middle East',
    'German': 'Germanic Languages',
    'Greek': 'Other European Languages',
    'Hakha_Chin': 'Diverse Asian and Pacific Languages',
    'Indonesian': 'Diverse Asian and Pacific Languages',
    'Interlingua': 'Constructed and Isolate Languages',
    'Italian': 'Romance European Languages',
    'Japanese': 'East Asian Languages',
    'Kabyle': 'African and Other Languages',
    'Kinyarwanda': 'African and Other Languages',
    'Kyrgyz': 'Turkic Languages',
    'Latvian': 'Other European Languages',
    'Maltese': 'Languages of the Caucasus and Middle East',
    'Mongolian': 'Diverse Asian and Pacific Languages',
    'Persian': 'Languages of the Caucasus and Middle East',
    'Polish': 'Slavic Languages',
    'Portuguese': 'Romance European Languages',
    'Romanian': 'Romance European Languages',
    'Romansh_Sursilvan': 'Romance European Languages',
    'Russian': 'Slavic Languages',
    'Sakha': 'Turkic Languages',
    'Slovenian': 'Slavic Languages',
    'Spanish': 'Romance European Languages',
    'Swedish': 'Germanic Languages',
    'Tamil': 'Diverse Asian and Pacific Languages',
    'Tatar': 'Turkic Languages',
    'Turkish': 'Turkic Languages',
    # NOTE(review): 'Ukranian' (sic) is kept as-is; presumably it matches the
    # spelling of the label in the dataset - confirm against the printed labels.
    'Ukranian': 'Slavic Languages',
    'Welsh': 'Other European Languages',
}



Check multicollinearity

In [None]:
# Pairwise correlation between the feature columns.
corr = df_without_label.corr()

# Boolean mask covering the upper triangle (diagonal included) so each pair
# of features is drawn only once.
mask = np.triu(np.ones_like(corr, dtype=bool))

f, ax = plt.subplots(figsize=(11, 9))
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Draw on the axes created above; the colour scale is centred on 0 and
# capped at 0.3.
heatmap_options = dict(
    mask=mask,
    cmap=cmap,
    vmax=.3,
    center=0,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
)
sns.heatmap(corr, ax=ax, **heatmap_options)

plt.show()

## Models

### Training

In [None]:
X_train, Y_train, X_test, Y_test = trainer.embedded_data()

# Model initialization: three SVM kernels and a random forest, all sharing
# the same verbosity and seed.
shared_options = dict(verbose=3, random_state=42)
svm = SVC(kernel='linear', **shared_options)
svm_poly = SVC(kernel='poly', **shared_options)
svm_rbf = SVC(kernel='rbf', **shared_options)
random_forest = RandomForestClassifier(**shared_options)

# Model fitting: each SVM prints a banner, trains, then emits a blank line
# to separate its training log from the next one.
svm_training_plan = (
    ("Training linear SVM", svm),
    ('Training poly SVM', svm_poly),
    ('Training rbf SVM', svm_rbf),
)
for banner, classifier in svm_training_plan:
    print(banner)
    classifier.fit(X_train, Y_train)
    print()

# The forest is fitted last, as the final expression of the cell, so the
# notebook still displays the fitted estimator.
print("Training RFC")
random_forest.fit(X_train, Y_train)


### Predictions

In [None]:
# Predict on the held-out test features with each fitted model, in the same
# order the models were trained.
(svm_predictions,
 poly_svm_pred,
 rbf_svm_pred,
 random_forest_predictions) = (
    fitted_model.predict(X_test)
    for fitted_model in (svm, svm_poly, svm_rbf, random_forest)
)

### Metrics

In [None]:
# Compute the evaluation metrics for each model's test-set predictions.
# trainer.get_metrics returns a mapping indexed below by 'accuracy_score',
# 'f1_score', 'precision_score' and 'recall_score'.
linear_svm_metrics = trainer.get_metrics(Y_test, svm_predictions)
poly_svm_metrics = trainer.get_metrics(Y_test, poly_svm_pred)
rbf_svm_metrics = trainer.get_metrics(Y_test, rbf_svm_pred)
rdf_metrics = trainer.get_metrics(Y_test, random_forest_predictions)

# Display name -> metrics mapping, in report order.
metrics_by_model = {
    'linear SVM': linear_svm_metrics,
    'poly SVM': poly_svm_metrics,
    'rbf SVM': rbf_svm_metrics,
    'RDF': rdf_metrics,
}

# (label printed in the report, key in the metrics mapping)
report_sections = (
    ('accuracy', 'accuracy_score'),
    ('f1', 'f1_score'),
    ('precision', 'precision_score'),
    ('recall', 'recall_score'),
)

# One section per metric, separated by a blank line. The f-string uses double
# quotes outside and no nested quotes inside, so it also parses on Python
# versions older than 3.12 (reusing the enclosing quote inside a replacement
# field is only legal since PEP 701).
for section_index, (metric_label, metric_key) in enumerate(report_sections):
    if section_index > 0:
        print()
    for model_name, model_metrics in metrics_by_model.items():
        print(f"{model_name} {metric_label}: {model_metrics[metric_key]}")

### Random Search

RandomizedSearch SVM

In [None]:
# Search space for the SVM: C is drawn log-uniformly from [0.001, 1000],
# gamma from an exponential distribution with scale 1, and the kernel from
# three candidates.
param_grid_svm = dict(
    C=reciprocal(0.001, 1000),
    gamma=expon(scale=1.0),
    kernel=['linear', 'rbf', 'poly'],
)

svm_clf = SVC(random_state=42)

# 20 sampled candidates x 5-fold CV, scored by macro-averaged F1, using all
# available cores.
random_search_svm = RandomizedSearchCV(
    svm_clf,
    param_distributions=param_grid_svm,
    n_iter=20,
    scoring='f1_macro',
    cv=5,
    n_jobs=-1,
    verbose=3,
    random_state=42,
)
random_search_svm.fit(X_train, Y_train)

print("Best parameters for SVM:", random_search_svm.best_params_)
print("Best score:", random_search_svm.best_score_)

RandomizedSearch Random Forest Classifier

In [25]:
# Discrete search space for the random forest.
param_grid_rf = dict(
    n_estimators=[80, 100, 200],
    max_depth=[3, 4, 5, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2', None],
    criterion=['gini', 'entropy', 'log_loss'],
)

rfc = RandomForestClassifier(random_state=42)

# 20 sampled candidates x 5-fold CV, scored by macro-averaged F1, using all
# available cores.
random_search_rf = RandomizedSearchCV(
    rfc,
    param_distributions=param_grid_rf,
    n_iter=20,
    scoring='f1_macro',
    cv=5,
    n_jobs=-1,
    verbose=3,
    random_state=42,
)
random_search_rf.fit(X_train, Y_train)

print("Best parameters:", random_search_rf.best_params_)
print("Best score:", random_search_rf.best_score_)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


KeyboardInterrupt: 

### Training with best parameters

#### SVM

In [None]:
# Polynomial-kernel SVM with fixed hyperparameters.
# NOTE(review): C and gamma presumably come from an earlier randomized-search
# run - confirm, since the values are hard-coded rather than read from
# random_search_svm.best_params_.
c = 0.1767016940294795
gamma = 3.010121430917521
new_svm = SVC(verbose=3, C=c, gamma=gamma, kernel='poly')
print('Training SVM')
new_svm.fit(X_train, Y_train)

# Evaluate on the held-out test split.
preds = new_svm.predict(X_test)
metrics = trainer.get_metrics(Y_test, preds)
print()
# Double-quoted f-strings: reusing the enclosing quote character inside the
# replacement field is a SyntaxError before Python 3.12.
print(f"Accuracy: {metrics['accuracy_score']}")
print(f"F1: {metrics['f1_score']}")
print(f"Precision: {metrics['precision_score']}")
print(f"Recall: {metrics['recall_score']}")

In [None]:
# RBF-kernel SVM with C=10 and gamma='auto'.
c = 10
new_svm = SVC(verbose=3, C=c, gamma='auto', kernel='rbf')
print('Training SVM')
new_svm.fit(X_train, Y_train)

# Evaluate on the held-out test split.
preds = new_svm.predict(X_test)
metrics = trainer.get_metrics(Y_test, preds)
print()
# Double-quoted f-strings: reusing the enclosing quote character inside the
# replacement field is a SyntaxError before Python 3.12.
print(f"Accuracy: {metrics['accuracy_score']}")
print(f"F1: {metrics['f1_score']}")
print(f"Precision: {metrics['precision_score']}")
print(f"Recall: {metrics['recall_score']}")

#### Random Forest

In [None]:
# Random forest with fixed hyperparameters.
# NOTE(review): these values presumably come from an earlier randomized-search
# run - confirm, since they are hard-coded rather than read from
# random_search_rf.best_params_.
n_estimators = 200
min_samples_split = 5
min_samples_leaf = 4

# random_state is pinned to 42, like the other stochastic estimators in this
# notebook, so the reported metrics are reproducible across runs.
new_rdf = RandomForestClassifier(verbose=3, n_estimators=n_estimators,
                                 min_samples_split=min_samples_split,
                                 min_samples_leaf=min_samples_leaf,
                                 max_features='sqrt', max_depth=None,
                                 random_state=42)
new_rdf.fit(X_train, Y_train)

# Evaluate on the held-out test split.
preds = new_rdf.predict(X_test)
metrics = trainer.get_metrics(Y_test, preds)
print()
# Double-quoted f-strings: reusing the enclosing quote character inside the
# replacement field is a SyntaxError before Python 3.12.
print(f"Accuracy: {metrics['accuracy_score']}")
print(f"F1: {metrics['f1_score']}")
print(f"Precision: {metrics['precision_score']}")
print(f"Recall: {metrics['recall_score']}")