In [None]:
!pip install transformers
!pip install datasets

In [None]:
import random
from IPython.display import display, HTML
import numpy as np
import pandas as pd

from datasets import load_dataset, DatasetDict, Dataset, ClassLabel
from datasets import load_metric

from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments
from transformers import Trainer

In [None]:
# Normalise the GitHub comments CSV so that its header row is exactly `labels,text`.
# `header=0` drops whatever header row the file currently has and `names` supplies
# the replacement column names.
df = pd.read_csv('/content/drive/MyDrive/2IMP40/github-comments.csv', names=['labels', 'text'], header=0)
# Label encoding kept for reference only (not executed); presumably it was applied
# once when the file still held string labels -- TODO confirm.
# df['labels'] = df['labels'].replace(['neutral','negative','positive'],[0,1,2])
# The file is rewritten in place on purpose: a later cell loads this same path with
# `load_dataset('csv', ...)` and tokenises its "text" column.
df.to_csv('/content/drive/MyDrive/2IMP40/github-comments.csv', index=False)
# Preview only the first rows instead of rendering the whole frame.
df.head()

Unnamed: 0,labels,text
0,0,No. I still see the wrong twins. * https://gi...
1,0,"Reverted."""
2,0,You can leave a queue while in queue ? (before...
3,2,"Didn't look at SpellTargetRestrictions XD"""
4,0,Not sure about what kind of line lengths the p...
...,...,...
7117,1,"Yeah, I'm capable of working around it. A set..."
7118,2,"Looks fine to me! On Jul 8, 2013, at 10:12 PM..."
7119,2,Formula updated to address this (would be nice...
7120,2,"I trust you"""


In [None]:
# Per-dataset settings: file location, number of target classes, and the names of
# the text and label columns in that file.
# NOTE(review): only the `github` column names were checked against this notebook
# (the CSV header is rewritten to `labels,text` in the cell above); the other
# entries are carried over unchanged -- verify before switching `dataset_name`.
dataset_infos = {'api': {'path': '/content/drive/MyDrive/2IMP40/api-review.xls', 'num_labels': 3, 'sentence_key': 'text', 'label_key': 'label'},
                 'app': {'path': '/content/drive/MyDrive/2IMP40/app-review.csv', 'num_labels': 3, 'sentence_key': 'text', 'label_key': 'label'},
                 'code-review': {'path': '/content/drive/MyDrive/2IMP40/code-review.xlsx', 'num_labels': 3, 'sentence_key': 'text', 'label_key': 'label'},
                 'github': {'path': '/content/drive/MyDrive/2IMP40/github-comments.csv', 'num_labels': 3, 'sentence_key': 'text', 'label_key': 'labels'},
                 'jira': {'path': '/content/drive/MyDrive/2IMP40/jira-issues.csv', 'num_labels': 3, 'sentence_key': 'text', 'label_key': 'label'},
                 'stackoverflow': {'path': '/content/drive/MyDrive/2IMP40/stackoverflow.csv', 'num_labels': 3, 'sentence_key': 'text', 'label_key': 'label'}
                 }

# Directory passed as `cache_dir` when loading the tokenizer/model; also the parent
# of the Trainer's checkpoint directory. Relative to the current working directory.
cache_dir = 'drive/MyDrive/2IMP40/cache'

# Pretrained checkpoint to fine-tune; `model_name` is its last path segment and is
# only meant for building human-readable directory names.
base_model_checkpoint = 'albert-base-v2'
model_name = base_model_checkpoint.split("/")[-1]
metric_name = 'accuracy'

# Dataset selected for this run and the settings derived from its entry above.
dataset_name = 'github'
num_labels = dataset_infos[dataset_name]['num_labels']
sentence_key = dataset_infos[dataset_name]['sentence_key']
label_key = dataset_infos[dataset_name]['label_key']

# Training hyper-parameters.
batch_size = 8
num_train_epochs = 2
task = f'{dataset_name}_task'

# Where the Trainer writes checkpoints, and where the final model is saved.
model_output_dir = f'{cache_dir}/{model_name}-finetuned-{task}'
model_save_dir = f'drive/MyDrive/2IMP40/{task}'

In [None]:
def show_random_elements(dataset, num_examples=10):
    """Display a random sample of distinct rows from `dataset` as an HTML table.

    Args:
        dataset: Object that supports `len()`, indexing with a list of row
            positions, and exposes a `features` mapping of column name to
            feature type.
        num_examples: Number of distinct rows to display.

    Raises:
        AssertionError: If `num_examples` exceeds the number of rows.
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."

    # Draw distinct row positions in a single call instead of re-drawing on
    # collisions, which degrades as num_examples approaches len(dataset).
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, ClassLabel):
            # Show class names rather than their integer ids.
            df[column] = df[column].transform(lambda i: typ.names[i])
    display(HTML(df.to_html()))

# Load the selected dataset's file through the `csv` loader, then preview ten
# random rows of the resulting `train` split.
csv_path = dataset_infos[dataset_name]['path']
dataset = load_dataset('csv', data_files=csv_path)
show_random_elements(dataset['train'], num_examples=10)

Using custom data configuration default-471b2b33f4b138f3


Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-471b2b33f4b138f3/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e...


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-471b2b33f4b138f3/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,labels,text
0,0,"Don't need to interpolate 'prefix' here (`brew audit` will catch that)."""
1,2,"Well done, thanks for not being stingy on the mc-dev update :)"""
2,0,"not really..."""
3,0,"we are talking about a config/ directory under a bundle. How many config files do you here? One for the routing? One for your services? One for the doctrine metadata (and perhaps of few more if you use one file per entity)? That's not a lot of them."""
4,0,"Note: """"CRYPTO_THREADID and associated functions were introduced in OpenSSL 1.0.0 to replace (actually, deprecate) the previous CRYPTO_set_id_callback(), CRYPTO_get_id_callback(), and CRYPTO_thread_id() functions which assumed thread IDs to always be repres"""
5,2,"@trustin true enough... :)"""
6,0,"and arnova, i'd rather see this non-critical issue around for a little longer than adding a hack that nobody will ever clean up later..."""
7,1,"yes, this is at least a source. Remains to figure if this is a good one or not :("""
8,2,"Is 5.0 coming in February now instead of 4.2, or still planning 5.0 for March 22nd? (I'm just excited for the release!)"""
9,1,"I am observing races here. After a flushBuffer CurrentVideo/audio.dts is NOPTS_VALUE -> InputState is updated -> dts and time_offset sent to players are invalid. We have a update timeout in UpdatePlayState and here :("""


In [None]:
def tokenize_function(examples):
    """Tokenize the "text" entries of a batch with the module-level `tokenizer`.

    The tokenizer is called with truncation enabled and `max_length` padding;
    whatever it returns is passed straight back to the caller.
    """
    batch_texts = examples["text"]
    return tokenizer(batch_texts, truncation=True, padding="max_length")

# 90% train, 10% validation.
# The split gets its own name so that `dataset` is left untouched: re-running this
# cell then always splits the full data rather than an already-reduced train split.
# A fixed seed makes the split reproducible across runs.
split_datasets = dataset['train'].train_test_split(test_size=0.1, seed=42)

# Load the tokenizer from the full checkpoint identifier (not just its last path
# segment) so namespaced checkpoints such as "org/model" resolve correctly.
tokenizer = AutoTokenizer.from_pretrained(base_model_checkpoint, use_fast=True, cache_dir=cache_dir)

# Tokenize both splits in batches; `tokenize_function` reads the "text" column.
tokenized_datasets = split_datasets.map(tokenize_function, batched=True)
train_dataset = tokenized_datasets['train']
eval_dataset = tokenized_datasets['test']

  0%|          | 0/7 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
def compute_metrics(eval_pred):
    """Score a `(logits, labels)` pair with the module-level `metric`.

    The predicted class of each row is the index of its largest logit along
    the last axis; the result of `metric.compute` is returned unchanged.
    """
    raw_scores, gold_labels = eval_pred
    predicted_classes = np.argmax(raw_scores, axis=-1)
    scores = metric.compute(references=gold_labels, predictions=predicted_classes)
    return scores


# Sequence-classification model initialised from the pretrained checkpoint with a
# classification head sized for `num_labels` classes.
model = AutoModelForSequenceClassification.from_pretrained(base_model_checkpoint, num_labels=num_labels, cache_dir=cache_dir)

# Load the metric named in the configuration cell so that it stays in sync with
# `metric_for_best_model` below; `compute_metrics` reads this global.
metric = load_metric(metric_name)

training_args = TrainingArguments(
    output_dir=model_output_dir,
    # Evaluate and checkpoint once per epoch; the two strategies must match for
    # best-model tracking to work.
    evaluation_strategy="epoch",
    save_strategy='epoch',
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_train_epochs,
    weight_decay=0.01,
    # Reload the best checkpoint (by `metric_for_best_model`) when training ends,
    # so the model evaluated and saved afterwards is the best one, not merely the
    # last one. With this set to False, `metric_for_best_model` had no effect.
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertForSequenceClassification: ['predictions.dense.weight', 'predictions.bias', 'predictions.decoder.bias', 'predictions.dense.bias', 'predictions.decoder.weight', 'predictions.LayerNorm.weight', 'predictions.LayerNorm.bias']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You sho

In [None]:
# Fine-tune the model. Evaluation and checkpointing are configured to run once
# per epoch (`evaluation_strategy` / `save_strategy` in the TrainingArguments above).
trainer.train()

The following columns in the training set  don't have a corresponding argument in `AlbertForSequenceClassification.forward` and have been ignored: text.
***** Running training *****
  Num examples = 6409
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 1604


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6477,0.605956,0.870968
2,0.3753,0.426194,0.907433


The following columns in the evaluation set  don't have a corresponding argument in `AlbertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 713
  Batch size = 8
Saving model checkpoint to drive/MyDrive/2IMP40/cache/albert-base-v2-finetuned-github_task/checkpoint-802
Configuration saved in drive/MyDrive/2IMP40/cache/albert-base-v2-finetuned-github_task/checkpoint-802/config.json
Model weights saved in drive/MyDrive/2IMP40/cache/albert-base-v2-finetuned-github_task/checkpoint-802/pytorch_model.bin
tokenizer config file saved in drive/MyDrive/2IMP40/cache/albert-base-v2-finetuned-github_task/checkpoint-802/tokenizer_config.json
Special tokens file saved in drive/MyDrive/2IMP40/cache/albert-base-v2-finetuned-github_task/checkpoint-802/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `AlbertForSequenceClassification.forward` and have been ignored: text.
***** Running Eva

TrainOutput(global_step=1604, training_loss=0.474164879530148, metrics={'train_runtime': 2504.3477, 'train_samples_per_second': 5.118, 'train_steps_per_second': 0.64, 'total_flos': 306356149192704.0, 'train_loss': 0.474164879530148, 'epoch': 2.0})

In [None]:
### EVALUATE THE TRAINED MODEL
# Runs the trainer's current model on `eval_dataset` and displays the metrics it
# returns. Nothing is saved by this cell.
trainer.evaluate()

The following columns in the evaluation set  don't have a corresponding argument in `AlbertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 713
  Batch size = 8


{'epoch': 2.0,
 'eval_accuracy': 0.9074333800841514,
 'eval_loss': 0.4261937737464905,
 'eval_runtime': 53.4914,
 'eval_samples_per_second': 13.329,
 'eval_steps_per_second': 1.683}

In [None]:
from os import path, listdir, makedirs

### SAVE THE TRAINED MODEL
# exist_ok=True keeps this cell re-runnable: without it, makedirs raises
# FileExistsError as soon as the target directory exists from a previous run.
makedirs(model_save_dir, exist_ok=True)
model.save_pretrained(model_save_dir)

Configuration saved in drive/MyDrive/github_task/config.json
Model weights saved in drive/MyDrive/github_task/pytorch_model.bin
