In [None]:
!sudo apt-get install build-essential swig
!pip install auto-sklearn
!pip install liac-arff

In [None]:
import sklearn.metrics
from sklearn.feature_extraction.text import CountVectorizer
import autosklearn.classification
import pandas as pd

# Load the three pre-split CSV partitions; each provides a "text" column and a
# "toxic" target column.
train = pd.read_csv("ptbr_train_1annotator.csv")
validation = pd.read_csv("ptbr_validation_1annotator.csv")
test = pd.read_csv("ptbr_test_1annotator.csv")

# The bag-of-words vocabulary is learned from the training texts only, so the
# validation and test texts are encoded with exactly the same feature columns.
bow = CountVectorizer().fit(train["text"])

X_train, y_train = bow.transform(train["text"]), list(train["toxic"])
X_validation, y_validation = bow.transform(validation["text"]), list(validation["toxic"])
X_test, y_test = bow.transform(test["text"]), list(test["toxic"])

# Default-configured AutoML search. The validation split is handed over through
# the X_test/y_test arguments of fit(); the real test split is kept for the
# evaluation cells further down.
# NOTE(review): presumably auto-sklearn only uses X_test/y_test to track
# held-out performance during the search — confirm against its documentation.
automl = autosklearn.classification.AutoSklearnClassifier()
automl.fit(X_train, y_train, X_test=X_validation, y_test=y_validation)

In [None]:
# Text summary of the finished search: metric, best validation score and how many
# candidate runs succeeded, crashed, or hit the time/memory limits.
search_summary = automl.sprint_statistics()
print(search_summary)

auto-sklearn results:
  Dataset name: abd3298b2b82c6af054b56730e6e1865
  Metric: accuracy
  Best validation score: 0.753968
  Number of target algorithm runs: 29
  Number of successful target algorithm runs: 20
  Number of crashed target algorithm runs: 0
  Number of target algorithms that exceeded the time limit: 5
  Number of target algorithms that exceeded the memory limit: 4



In [None]:
# Predict labels for the bag-of-words test features with the fitted AutoML model.
y_pred = automl.predict(X=X_test)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Per-class precision / recall / F1 on the test split, followed by the confusion
# matrix (scikit-learn convention: rows are true labels, columns are predictions).
print(classification_report(y_true=y_test, y_pred=y_pred))
print(confusion_matrix(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.76      0.75      0.75      1128
           1       0.71      0.73      0.72       972

    accuracy                           0.74      2100
   macro avg       0.74      0.74      0.74      2100
weighted avg       0.74      0.74      0.74      2100

[[843 285]
 [263 709]]


# Multi-Label Classification

In [None]:
from sklearn.metrics import hamming_loss, average_precision_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import autosklearn.classification
import pandas as pd
import numpy as np
import gdown

# Random seed passed as `random_state` to both train_test_split calls below, so
# the train / test / validation partitions are the same on every run.
SEED = 42

In [None]:
# Fetch the ToLD-BR CSV from Google Drive into the working directory. The call is
# the last expression of the cell, so its return value is displayed as the output.
dataset_url = "https://drive.google.com/uc?id=1refxcQXi-5bDUmmZxH6ZNvRCgSMl-bFe"
output = "ToLD-BR.csv"
gdown.download(url=dataset_url, output=output, quiet=False)

Downloading...
From: https://drive.google.com/uc?id=1refxcQXi-5bDUmmZxH6ZNvRCgSMl-bFe
To: /content/ToLD-BR.csv
2.43MB [00:00, 174MB/s]


'ToLD-BR.csv'

In [None]:
data = pd.read_csv("ToLD-BR.csv")

# Binarize every column after the first (the label columns): any non-zero value
# becomes 1, zero stays 0. NaN also maps to 1, exactly as `int(bool(nan))` would.
# The columns are *replaced* via `data[cols] = ...` rather than written through
# `.iloc[:, 1:] = ...`: the positional form may write into the existing columns
# and keep their original dtype on newer pandas versions, whereas this form
# always yields integer columns. The comparison is vectorized instead of looping
# over every cell in Python.
label_columns = data.columns[1:]
data[label_columns] = data[label_columns].ne(0).astype(int)

# 90% train; the remaining 10% is halved into test and validation (5% each).
# The intermediate split gets its own name instead of temporarily reusing `test`.
train, holdout = train_test_split(data, train_size=0.9, random_state=SEED)
test, validation = train_test_split(holdout, train_size=0.5, random_state=SEED)

In [None]:
# Peek at the training label matrix: every column after the first, as a NumPy array.
train.iloc[:, 1:].to_numpy()

array([[0, 0, 1, 0, 0, 0],
       [0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       ...,
       [0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0]])

In [None]:
# Learn the bag-of-words vocabulary from the training texts only; validation and
# test texts are then encoded against that same vocabulary.
bow = CountVectorizer().fit(train["text"])

# Features come from the "text" column, targets from every column after the first
# (one indicator column per label).
X_train, y_train = bow.transform(train["text"]), np.array(train.iloc[:, 1:])
X_validation, y_validation = bow.transform(validation["text"]), np.array(validation.iloc[:, 1:])
X_test, y_test = bow.transform(test["text"]), np.array(test.iloc[:, 1:])

In [None]:
# Default-configured AutoML search on the multi-label indicator targets.
# The class name previously read `AutoSklearnClassifiervb`, which does not exist
# in `autosklearn.classification` and raises AttributeError on a fresh run.
automl = autosklearn.classification.AutoSklearnClassifier()
# The validation split is supplied through fit()'s X_test/y_test arguments; the
# real test split is reserved for the evaluation cells below. fit() is the last
# expression, so the fitted estimator's repr is shown as the cell output.
automl.fit(X=X_train, y=y_train, X_test=X_validation, y_test=y_validation)





AutoSklearnClassifier(dask_client=None,
                      delete_output_folder_after_terminate=True,
                      delete_tmp_folder_after_terminate=True,
                      disable_evaluator_output=False,
                      ensemble_memory_limit=1024, ensemble_nbest=50,
                      ensemble_size=50, exclude_estimators=None,
                      exclude_preprocessors=None, get_smac_object_callback=None,
                      include_estimators=None, include_preprocessors=None,
                      initial_configurations_via_metalearning=25,
                      logging_config=None, max_models_on_disc=50,
                      metadata_directory=None, metric=None,
                      ml_memory_limit=3072, n_jobs=None, output_folder=None,
                      per_run_time_limit=360, resampling_strategy='holdout',
                      resampling_strategy_arguments=None, seed=1,
                      smac_scenario_args=None, time_left_for_this_task=3600,


In [None]:
# Text summary of the finished search: metric, best validation score and how many
# candidate runs succeeded, crashed, or hit the time/memory limits.
search_summary = automl.sprint_statistics()
print(search_summary)

auto-sklearn results:
  Dataset name: 03181b8df02a171bb1a71ec0414d475a
  Metric: f1_macro
  Best validation score: 0.331135
  Number of target algorithm runs: 26
  Number of successful target algorithm runs: 12
  Number of crashed target algorithm runs: 2
  Number of target algorithms that exceeded the time limit: 6
  Number of target algorithms that exceeded the memory limit: 6



In [None]:
# Predict the label indicators for the bag-of-words test features.
y_pred = automl.predict(X=X_test)

In [None]:
# Fraction of individual label assignments that are wrong on the test split.
hamming_loss(y_true=y_test, y_pred=y_pred)

0.0838095238095238

In [None]:
# Average precision on the test split, computed from the predictions in `y_pred`.
# NOTE(review): `y_pred` comes from `automl.predict`, i.e. hard label decisions,
# while this metric is designed for continuous scores — consider scoring the
# output of `automl.predict_proba(X_test)` instead; confirm before changing, as
# the reported number will differ.
average_precision_score(y_true=y_test, y_score=y_pred)

0.2018712626773922