In [None]:
import pandas as pd
import numpy as np
import random
from utils import make_dataframe, save_results

import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn import svm
from sklearn.metrics import classification_report as report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import MultiLabelBinarizer

# Seed every random number generator used in this notebook (NumPy's global
# state and the stdlib `random` module) so that re-running gives the same split.
SEED = 10
np.random.seed(SEED)
random.seed(SEED)

In [None]:
def train_test_split(df, train_size):
    """Randomly split ``df`` into two parts, grouping rows by index level 0.

    All rows that share the same value in the first index level end up on the
    same side of the split, so a group is never divided between the two
    returned frames.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split. Its first index level identifies the groups.
    train_size : float
        Fraction of the distinct first-level values assigned to the first
        returned frame. The group count is truncated with ``int``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train_df, test_df)``: the rows of the sampled groups and the rows
        of all remaining groups, in their original order.

    Notes
    -----
    Sampling uses NumPy's global random state (``np.random.choice``); seed it
    beforehand for a reproducible split.
    """
    level0 = df.index.get_level_values(0)

    # Sample from the values that actually occur in the frame.
    # `df.index.levels[0]` would also list values whose rows were filtered out
    # earlier (unused levels), which skews the size of both parts.
    indices = np.unique(level0)
    train_idxs = np.random.choice(
        indices, size=int(len(indices) * train_size), replace=False
    )

    # Every row is either in a sampled group or not, so one mask yields both parts.
    train_mask = np.isin(level0, train_idxs)
    train_df = df[train_mask]
    test_df = df[~train_mask]

    return train_df, test_df

In [None]:
# Language codes; each one names a sub-folder under ../data/ used below.
languages = [
    "en",
    "fr",
    "ge",
    "it",
    "po",
    "ru",
]

In [None]:
# Train one n-gram + linear-SVM multilabel baseline per language, score it on a
# held-out part of the labeled data, and keep the fitted objects for later cells.
dev_frames = []  # one scored held-out frame per language; concatenated once after the loop
artifacts = {}   # lang -> {"model": fitted Pipeline, "binarizer": fitted MultiLabelBinarizer}
for lang in languages:
    train_folder = f"../data/{lang}/train-articles-subtask-3/"
    dev_folder = f"../data/{lang}/dev-articles-subtask-3/"
    labels_file = f"../data/{lang}/train-labels-subtask-3.txt"

    # The dev articles are loaded without a labels file; they are not used for
    # scoring in this cell.
    test_df = make_dataframe(dev_folder)
    train_df = make_dataframe(train_folder, labels_file)

    # split train into train and dev (70% / 30% of the first-level index groups)
    train_df, dev_df = train_test_split(train_df, train_size=0.7)

    # NOTE(review): assumes every row has a non-null, comma-separated "labels"
    # string; a missing value would break the binarizer — confirm against make_dataframe.
    X_train = train_df["text"].values
    y_train = train_df["labels"].str.split(",").values

    X_dev = dev_df["text"].values
    y_dev = dev_df["labels"].str.split(",").values

    # The label vocabulary is learned from the training part only.
    multibin = MultiLabelBinarizer()
    y_train = multibin.fit_transform(y_train)
    y_dev = multibin.transform(y_dev)

    pipe = Pipeline([
        ('vectorizer', CountVectorizer(ngram_range=(1, 3), analyzer='word')),
        ('SVM_multiclass', MultiOutputClassifier(
            svm.SVC(class_weight=None, C=1, kernel='linear'), n_jobs=1)),
    ])

    pipe.fit(X_train, y_train)

    # Predict once and reuse the result for both the accuracy printout and the
    # stored columns. The value is the exact-match (subset) accuracy that
    # `pipe.score(X_dev, y_dev)` reports, without a second prediction pass.
    y_pred_transform = pipe.predict(X_dev)
    dev_acc = np.mean(np.all(y_dev == y_pred_transform, axis=1))

    # This is measured on the held-out dev split, not on the training data.
    print(f'{lang}: Held-out dev Acc: \t\t', dev_acc)

    y_pred = [",".join(x) for x in multibin.inverse_transform(y_pred_transform)]

    # `assign` returns a new frame, so the slice produced by the split is never
    # written to in place (avoids SettingWithCopyWarning).
    dev_df = dev_df.assign(
        labels_pred=y_pred,
        y_true=y_dev.tolist(),
        y_pred=y_pred_transform.tolist(),
        lang=lang,
    )
    dev_frames.append(dev_df)

    artifacts[lang] = {"model": pipe, "binarizer": multibin}
    # out_folder = f"../results/result-subtask3-dev-{lang}.txt"

# Concatenate once instead of growing the frame inside the loop.
baseline_df = pd.concat(dev_frames, ignore_index=True) if dev_frames else pd.DataFrame()

In [None]:
# Preview only the first rows; the full frame holds every held-out row of all languages.
baseline_df.head()

In [None]:
from sklearn.metrics import f1_score, accuracy_score, classification_report

# Per-language summary of the held-out predictions stored in `baseline_df`.
# Each language has its own label set, so the indicator vectors are stacked and
# scored one language at a time.
metric_rows = []

for lang in languages:
    binarizer = artifacts[lang]["binarizer"]
    lang_df = baseline_df[baseline_df["lang"] == lang]

    # Each cell holds one indicator list per row; rebuild the 2-D label matrices.
    y_true = np.array(lang_df["y_true"].tolist())
    y_pred = np.array(lang_df["y_pred"].tolist())

    # zero_division=0 scores labels with no true or predicted samples as 0
    # (the value the default already uses) without emitting a warning per label.
    metric_rows.append({
        "f1-micro": f1_score(y_true, y_pred, average="micro", zero_division=0),
        "f1-macro": f1_score(y_true, y_pred, average="macro", zero_division=0),
        "accuracy": accuracy_score(y_true, y_pred),
    })

    print(lang)
    print(classification_report(y_true, y_pred, target_names=binarizer.classes_,
                                zero_division=0))

# Build the summary table once, one row per language, instead of concatenating
# single-row frames inside the loop.
metrics_df = pd.DataFrame(metric_rows, index=list(languages),
                          columns=["f1-micro", "f1-macro", "accuracy"])

metrics_df

## Findings
* Italian performed best because of the Doubt class, for which Italian has more samples than any other language.

In [None]:
# Write the predictions for the unlabeled dev articles, one tab-separated file
# per language, using the model and binarizer fitted for that language.
for lang in languages:
    model = artifacts[lang]["model"]
    binarizer = artifacts[lang]["binarizer"]

    # Reload the articles here so the cell does not depend on whichever
    # language happened to be processed last in the training loop.
    test_df = make_dataframe(f"../data/{lang}/dev-articles-subtask-3/")
    # NOTE: the ../results/ directory must already exist; to_csv does not create it.
    out_folder = f"../results/result-subtask3-dev-{lang}.txt"

    # NOTE(review): assumes the unlabeled frame has a "text" column like the
    # training frame — confirm against make_dataframe.
    Y_pred = model.predict(test_df["text"].values)

    out = binarizer.inverse_transform(Y_pred)
    out = [",".join(labels) for labels in out]
    out = pd.DataFrame(out, test_df.index)
    out.to_csv(out_folder, sep='\t', header=False)