In [1]:
# for data
#!wget -nc https://www.dropbox.com/s/lkd0eklmi64m9xm/AirlineTweets.csv?dl=0

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import torch
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix

In [3]:
df = pd.read_csv(f'data/AirlineTweets.csv')
df.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [4]:
df= df[['airline_sentiment', 'text']]
df.head()

Unnamed: 0,airline_sentiment,text
0,neutral,@VirginAmerica What @dhepburn said.
1,positive,@VirginAmerica plus you've added commercials t...
2,neutral,@VirginAmerica I didn't today... Must mean I n...
3,negative,@VirginAmerica it's really aggressive to blast...
4,negative,@VirginAmerica and it's a really big bad thing...


In [5]:
# convert target variable into numerical
target_map = { 'positive':1, 'negative':0,'neutral':2}
df['target']= df['airline_sentiment'].map( target_map)
df.head()

Unnamed: 0,airline_sentiment,text,target
0,neutral,@VirginAmerica What @dhepburn said.,2
1,positive,@VirginAmerica plus you've added commercials t...,1
2,neutral,@VirginAmerica I didn't today... Must mean I n...,2
3,negative,@VirginAmerica it's really aggressive to blast...,0
4,negative,@VirginAmerica and it's a really big bad thing...,0


In [6]:
from datasets import Dataset

In [21]:
raw_dataset= Dataset.from_dict({
    'text': [e for e in df['text']],
    'label': [e for e in df['target']], # The data-set must have columns with names "text" and "labels". 
})
raw_dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 14640
})

In [22]:
split = raw_dataset.train_test_split(test_size=0.3, seed=42)
split

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 10248
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 4392
    })
})

# Tokenization

In [23]:
from transformers import AutoTokenizer
model_name='bert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [24]:
def tokenize_fn(data):
    '''
    Truncation will trucate sequence that are longer than max_lenght which specify by the model.
    Args:
        data : dataframe
    return:
        tokenization function with truncation
    '''
    return tokenizer(data['text'], truncation= True)


In [25]:
tokenize_data = split.map(tokenize_fn, 
                          batched= True # tokenization fn should be applied to batched of data
                          )

Map:   0%|          | 0/10248 [00:00<?, ? examples/s]

Map:   0%|          | 0/4392 [00:00<?, ? examples/s]

In [26]:
tokenize_data

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 10248
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 4392
    })
})

## select pre-trained model
* **AutoModelForSequenceClassfication** - use to predict the class of sequence classfication model (eg. sentence)
* **Trainer** - Train a model using torch.nn.Module 
* **TrainingArguments** - Defines the argument use to configure a training run

In [27]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments 


In [28]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, 
                                           num_labels=3 # number of classification in target variable
                                           )

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:
training_args = TrainingArguments(output_dir='training_dir', # save model and training related files
                                  evaluation_strategy='epoch',
                                  save_strategy= 'epoch', # model will save every epochs
                                  num_train_epochs=3, # training epoch
                                  per_device_train_batch_size=16, # batch size for traning 
                                  per_device_eval_batch_size=64, # batch size for evaludation
                                  )



## Evaluation Metrics

In [30]:
def compute_metrics(logits_and_labels):
    logits, labels = logits_and_labels
    predictions = np.argmax(logits, axis=-1)
    acc = np.mean(predictions == labels)
    f1 = f1_score(labels, predictions, average = 'micro')
    return {'accuracy': acc, 'f1_score': f1}

## Trainer

In [31]:
trainer = Trainer( model=model, # model to be trained
                  args=training_args, # instance of TrainingArguments
                  train_dataset=tokenize_data['train'], # training dataset after tokenized
                    eval_dataset=tokenize_data['test'],  # test dataset after tokenized
                  tokenizer=tokenizer,
                  compute_metrics=compute_metrics # evaluation matrics
                  )



In [32]:
trainer.train()

  0%|          | 0/1923 [00:00<?, ?it/s]

{'loss': 0.535, 'grad_norm': 5.519519805908203, 'learning_rate': 3.699947997919917e-05, 'epoch': 0.78}


  0%|          | 0/69 [00:00<?, ?it/s]

{'eval_loss': 0.4311637282371521, 'eval_accuracy': 0.8331056466302368, 'eval_f1_score': 0.8331056466302368, 'eval_runtime': 15.4398, 'eval_samples_per_second': 284.46, 'eval_steps_per_second': 4.469, 'epoch': 1.0}
{'loss': 0.3342, 'grad_norm': 9.324735641479492, 'learning_rate': 2.399895995839834e-05, 'epoch': 1.56}


  0%|          | 0/69 [00:00<?, ?it/s]

{'eval_loss': 0.4849131405353546, 'eval_accuracy': 0.8385701275045537, 'eval_f1_score': 0.8385701275045537, 'eval_runtime': 14.2544, 'eval_samples_per_second': 308.115, 'eval_steps_per_second': 4.841, 'epoch': 2.0}
{'loss': 0.2298, 'grad_norm': 3.333150625228882, 'learning_rate': 1.0998439937597505e-05, 'epoch': 2.34}


  0%|          | 0/69 [00:00<?, ?it/s]

{'eval_loss': 0.6494488716125488, 'eval_accuracy': 0.8419854280510018, 'eval_f1_score': 0.8419854280510017, 'eval_runtime': 14.5025, 'eval_samples_per_second': 302.844, 'eval_steps_per_second': 4.758, 'epoch': 3.0}
{'train_runtime': 536.4073, 'train_samples_per_second': 57.315, 'train_steps_per_second': 3.585, 'train_loss': 0.3165016947967958, 'epoch': 3.0}


TrainOutput(global_step=1923, training_loss=0.3165016947967958, metrics={'train_runtime': 536.4073, 'train_samples_per_second': 57.315, 'train_steps_per_second': 3.585, 'total_flos': 771719876490528.0, 'train_loss': 0.3165016947967958, 'epoch': 3.0})

In [33]:
! ls training_dir

[34mcheckpoint-1282[m[m [34mcheckpoint-1923[m[m [34mcheckpoint-641[m[m


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


# Evaluate Model
* Using Pipeline

In [35]:
from transformers import pipeline