In [None]:
# A requirements.txt file is included for installing the complete environment on a
# local machine. However, running this notebook on Google Colab is recommended: the
# environment is ready much faster and Colab grants free access to a GPU. When running
# on Colab, skip this cell and install only the packages listed in the Colab-specific
# install cell further below.

# Install the environment from the requirements.txt file. `xargs -n 1` invokes
# `pip install` once per whitespace-separated entry of the file instead of handing
# the whole file to a single pip run.
# NOTE(review): presumably done so one failing package does not abort the rest - confirm.
!cat requirements.txt | xargs -n 1 pip install

Collecting aiohttp==3.8.1
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 4.1 MB/s 
Collecting async-timeout<5.0,>=4.0.0a3
  Downloading async_timeout-4.0.1-py3-none-any.whl (5.7 kB)
Collecting asynctest==0.13.0
  Downloading asynctest-0.13.0-py3-none-any.whl (26 kB)
Collecting multidict<7.0,>=4.5
  Downloading multidict-5.2.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (160 kB)
[K     |████████████████████████████████| 160 kB 54.1 MB/s 
[?25hCollecting aiosignal>=1.1.2
  Downloading aiosignal-1.2.0-py3-none-any.whl (8.2 kB)
Collecting frozenlist>=1.1.1
  Downloading frozenlist-1.2.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (192 kB)
[K     |████████████████████████████████| 192 kB 69.5 MB/s 
Collecting yarl<2.0,>=1.0
  Downloading yarl-1.7.2-

In [None]:
!pip install -r /content/requirements.txt

In [None]:
# these packages alone should work if using google colab
!pip install torch torchvision==0.11.1
!pip install transformers==4.13.0
!pip install seqeval==1.2.2
!pip install tensorboardx==2.7.0
!pip install simpletransformers==0.63.3

In [2]:
# Imports
import pandas as pd
import numpy as np
import gc
import requests
import os
from simpletransformers.classification import ClassificationModel
from simpletransformers.classification import ClassificationArgs
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score, recall_score, precision_score, precision_recall_curve, auc
import torch
import logging

# Report which device PyTorch can use, followed by the installed PyTorch version
if torch.cuda.is_available():
    print("Cuda available")
else:
    print("CPU")
print("PyTorch version: ", torch.__version__)

Cuda available
PyTorch version:  1.10.0+cu111


In [3]:
# Mount Google Drive to reach the training data when running on Google Colab.
# Alternatively, upload the training data directly to session storage.
#
# The google.colab package is only importable inside Colab, so the import is guarded:
# anywhere else the mount is skipped with a message instead of aborting a
# "Restart & Run All". IN_COLAB records which branch was taken.
try:
    from google.colab import drive
except ImportError:
    IN_COLAB = False
    print("google.colab not available - skipping Google Drive mount")
else:
    IN_COLAB = True
    drive.mount('/content/drive')

Mounted at /content/drive


## Model Training

In [None]:
# Fine-tune an ELECTRA classifier on the training tweets and evaluate it on a
# held-out 20% split while training.

# Single seed shared by the train/test split and the simpletransformers run
SEED = 907

# --- training arguments ---
model_args = ClassificationArgs()
model_args.evaluate_during_training = True
model_args.evaluate_during_training_silent = False
model_args.evaluate_during_training_steps = 10
model_args.learning_rate = 1.3276125338114695e-05
model_args.manual_seed = SEED
model_args.max_seq_length = 256
model_args.no_cache = False
model_args.no_save = False
model_args.num_train_epochs = 5
model_args.overwrite_output_dir = True
model_args.reprocess_input_data = True
model_args.train_batch_size = 16
model_args.eval_batch_size = 16
model_args.train_custom_parameters_only = False
model_args.save_steps = 40

# --- data import ---
url = '/content/drive/MyDrive/Data/training_tweets.csv' # file path to training tweets
# Malformed CSV rows are dropped with a warning. `error_bad_lines` was deprecated in
# pandas 1.3 and removed in 2.0 in favour of `on_bad_lines`; older pandas rejects the
# new keyword with a TypeError, in which case the legacy spelling is used.
try:
    df = pd.read_csv(url, on_bad_lines='warn')
except TypeError:
    df = pd.read_csv(url, error_bad_lines=False)

# --- train/test split (80/20) ---
train_df = df.sample(frac = 0.8, random_state = SEED)
test_df = df.drop(train_df.index).reset_index(drop=True)
train_df = train_df.reset_index(drop = True)

# convert strings to lower
train_df['text'] = train_df['text'].str.lower()
test_df['text'] = test_df['text'].str.lower()

# NOTE(review): the later eval_model output warns "Dataframe headers not specified.
# Falling back to using column 0 as text and column 1 as labels." - confirm the CSV's
# first two columns really are the text and the label, in that order.

# --- train the model ---
model = ClassificationModel(
        "electra",
        "google/electra-base-discriminator",
        # Use the GPU when one is present, otherwise run on CPU rather than
        # requesting CUDA on a machine that has none.
        use_cuda=torch.cuda.is_available(),
        args=model_args
    )

model.train_model(train_df, eval_df = test_df)

In [7]:
# Score the trained model on the held-out split. eval_model hands back three values;
# the first (`result`) is the metrics dictionary displayed in the following cell.
evaluation = model.eval_model(test_df)
result, model_outputs, wrong_predictions = evaluation

  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


  0%|          | 0/397 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/25 [00:00<?, ?it/s]

In [12]:
# Display the metrics dictionary returned by eval_model
# (confusion counts tp/tn/fp/fn, mcc, auroc, auprc and eval_loss).
result

{'auprc': 0.8309032736873481,
 'auroc': 0.9012078687684224,
 'eval_loss': 0.39487834930419924,
 'fn': 38,
 'fp': 18,
 'mcc': 0.6276786265993519,
 'tn': 271,
 'tp': 70}

In [None]:
# Recursively compress the ./outputs/best_model directory into ./outputs/best.zip
!zip -r ./outputs/best.zip ./outputs/best_model

  adding: outputs/best_model/ (stored 0%)
  adding: outputs/best_model/pytorch_model.bin (deflated 7%)
  adding: outputs/best_model/vocab.txt (deflated 53%)
  adding: outputs/best_model/optimizer.pt (deflated 23%)
  adding: outputs/best_model/eval_results.txt (deflated 26%)
  adding: outputs/best_model/scheduler.pt (deflated 49%)
  adding: outputs/best_model/special_tokens_map.json (deflated 40%)
  adding: outputs/best_model/tokenizer_config.json (deflated 39%)
  adding: outputs/best_model/training_args.bin (deflated 49%)
  adding: outputs/best_model/config.json (deflated 51%)
  adding: outputs/best_model/model_args.json (deflated 62%)
  adding: outputs/best_model/tokenizer.json (deflated 59%)
