In [1]:
'''
The BART model is designed to address some of the limitations of previous transformer-based models, 
such as the inability to handle bidirectional input and the lack of a pre-training method for 
sequence-to-sequence models. BART uses a combination of bidirectional and autoregressive training to 
achieve better performance on a range of NLP tasks.

'''

'\nThe BART model is designed to address some of the limitations of previous transformer-based models, \nsuch as the inability to handle bidirectional input and the lack of a pre-training method for \nsequence-to-sequence models. BART uses a combination of bidirectional and autoregressive training to \nachieve better performance on a range of NLP tasks.\n\n'

In [2]:
import tensorflow as tf
from transformers import BartTokenizer, TFBartForSequenceClassification
from sklearn.metrics import confusion_matrix, classification_report
import numpy as np

In [3]:
# Load the tokenizer and the pre-trained BART model
# NOTE(review): do_lower_case does not appear to be a BartTokenizer option and
# looks like a leftover from a BERT example -- confirm it has no effect before
# relying on the input text being lower-cased.
tokenizer = BartTokenizer.from_pretrained('facebook/bart-base',do_lower_case=True)      # NOTE: original author flagged a possible loading problem at this call
# num_labels = 5 requests a five-way classification head; from_pt=True converts
# the PyTorch checkpoint into the TF model. The classification head is not part
# of the checkpoint and is newly initialized (see the warning printed below),
# so the model has to be fine-tuned before its predictions mean anything.
model = TFBartForSequenceClassification.from_pretrained('facebook/bart-base', num_labels = 5,from_pt=True)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBartForSequenceClassification: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight']
- This IS expected if you are initializing TFBartForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBartForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFBartForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classification_head.dense.weight', 'classification_head.dense.bias', 'classification_head.out_proj.weight', 'classification_head.out_proj.bias']
You should probably TRAIN this model on a

In [4]:
# Load and preprocess the data
import pandas as pd

def load_data(split_name='train', columns=['text', 'stars'], folder='data'):
    '''
        Load one split of the dataset from "<folder>/<split_name>.csv".

        "split_name" may be set as 'train', 'valid' or 'test' to load the corresponding dataset.

        You may also specify the column names to load any columns in the .csv data file.
        Among many, "text" can be used as model input, and "stars" column is the labels (sentiment).
        If you like, you are free to use columns other than "text" for prediction.

        Returns a DataFrame restricted to "columns" when every requested column
        exists in the file. If any requested column is missing (e.g. the test
        split has no "stars" column), a message is printed and the full,
        unfiltered DataFrame is returned instead.

        Raises whatever pandas.read_csv raises (e.g. FileNotFoundError) when the
        file itself cannot be read; that is not treated as a "missing column"
        case and no fallback is attempted.
    '''
    print(f"select [{', '.join(map(str, columns))}] columns from the {split_name} split")
    # Read the file exactly once. Errors from reading propagate unchanged so a
    # wrong path is not misreported as a column problem.
    df = pd.read_csv(f'{folder}/{split_name}.csv')
    try:
        selected = df.loc[:, columns]
    except KeyError:
        # Only a missing column triggers the fallback; anything else
        # (KeyboardInterrupt, MemoryError, ...) is left to propagate.
        print(f"Failed loading specified columns... Returning all columns from the {split_name} split")
        return df
    print("Success")
    return selected

In [5]:
# Request the same two columns from every split. The test set labels (the
# 'stars' column) are not available, so for 'test' load_data falls back to
# returning all columns instead.
train_df, valid_df, test_df = (
    load_data(split, columns=['text', 'stars'], folder='data')
    for split in ('train', 'valid', 'test')
)

select [text, stars] columns from the train split
Success
select [text, stars] columns from the valid split
Success
select [text, stars] columns from the test split
Failed loading specified columns... Returning all columns from the test split


In [6]:
# Prepare the data.
# As an example, we only use the text data.
# Shuffle with a fixed seed: later cells keep only the first 10,000 shuffled
# training rows, so an unseeded shuffle would train on a different subset on
# every run and make results impossible to reproduce.
# Note: this cell reassigns train_df / valid_df, so run it once per kernel
# session (re-running applies the same permutation a second time).
SHUFFLE_SEED = 42
train_df = train_df.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
valid_df = valid_df.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

# Model inputs and labels for training.
x_train = train_df['text']
y_train = train_df['stars']

# Model inputs and labels for validation.
x_valid = valid_df['text']
val_labels = valid_df['stars']

# The test split has no labels, only text.
x_test = test_df['text']

In [7]:
# Work on the first 10,000 training reviews only.
train_text = train_df.text.values
train_text = train_text[:10000]
# Reviews longer than 126 characters are shortened to their first 64 plus
# their last 62 characters; shorter reviews are kept as they are.
# NOTE(review): the cut is measured in characters, not tokens, while the
# encoder later pads to 128 token ids -- confirm this is intended.
train_text_cut = [
    review[:64] + review[-62:] if len(review) > 126 else review
    for review in train_text
]

In [8]:
# All validation reviews are used (no subsetting).
val_text = valid_df.text.values
# val_text = val_text[:800]
# Reviews longer than 126 characters are shortened to their first 64 plus
# their last 62 characters; shorter reviews are kept as they are.
# NOTE(review): the cut is measured in characters, not tokens, while the
# encoder later pads to 128 token ids -- confirm this is intended.
val_text_cut = [
    review[:64] + review[-62:] if len(review) > 126 else review
    for review in val_text
]

In [9]:
def encode_sentences(sentences, maxlen=128):
    """Tokenize sentences into a fixed-width matrix of BART token ids.

    Args:
        sentences: iterable of strings to encode.
        maxlen: number of token ids per row (special tokens included).
            Defaults to 128.

    Returns:
        An int32 array of shape (len(sentences), maxlen). Each row holds the
        token ids of one sentence, right-padded with the tokenizer's pad id.
    """
    # Let the tokenizer truncate over-long inputs: it keeps both the leading
    # <s> and the trailing </s>. Leaving truncation to pad_sequences would cut
    # from the front by default and silently drop the <s> token.
    input_ids = [
        tokenizer.encode(sent, add_special_tokens=True,
                         truncation=True, max_length=maxlen)
        for sent in sentences
    ]
    # Pad with the tokenizer's real pad id. For facebook/bart-base id 0 is the
    # <s> start token (<pad> is id 1), so padding with 0 would append a run of
    # start-of-sequence tokens to every short review.
    return tf.keras.preprocessing.sequence.pad_sequences(
        input_ids,
        value=tokenizer.pad_token_id,
        padding='post',
        maxlen=maxlen,
    )

In [10]:
# Encode the shortened training and validation texts (in that order) into
# padded token-id matrices.
train_input_ids, val_input_ids = map(
    encode_sentences, (train_text_cut, val_text_cut)
)

In [11]:
# Labels for the same first 10,000 rows that were kept as training text.
train_labels = y_train.iloc[:10000]

In [13]:
# Fine-tune the BART model
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
# from_logits=True: the loss receives raw, un-normalized scores.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

model.compile(optimizer=optimizer, 
              loss=loss, 
              metrics=[metric])

# The labels are shifted down by one before training (presumably star ratings
# 1-5 mapped to class indices 0-4 -- confirm against the data).
# NOTE(review): the original romanized-Chinese remark on this call appears to
# say that a second epoch looked unstable / possibly overfitting, although the
# metric still seemed to be rising -- confirm with the author.
model.fit(train_input_ids, train_labels-1, 
          epochs=1, batch_size=8)  # single epoch; see the note above

  98/1500 [>.............................] - ETA: 7:18:14 - loss: 1.5365 - accuracy: 0.3865

In [None]:
# Run the model over the encoded validation inputs. The next cell reads
# `.logits` from the result and replaces y_pre with predicted class indices.
y_pre = model.predict(val_input_ids)

In [None]:
# Turn the raw model output into predicted class indices (argmax over the
# class axis). The guard makes this cell safe to re-run: after the first run
# y_pre already holds the class indices and no longer has a `.logits`
# attribute, so an unconditional re-run would raise AttributeError.
if hasattr(y_pre, 'logits'):
    y_pre = tf.argmax(y_pre.logits, axis=1)

In [None]:
# Evaluate the model
# The validation labels are shifted down by one to line up with the predicted
# class indices. The report and the confusion matrix are printed with the same
# blank-line spacing as three separate print() calls would produce.
print(
    classification_report(val_labels - 1, y_pre),
    "\n\n",
    confusion_matrix(val_labels - 1, y_pre),
    sep="\n",
)
#print('accuracy', np.mean(val_labels-1 == y_pre))

In [None]:
# Make predictions
#test_predictions = model.predict(test_input_ids)
#test_ratings = tf.argmax(test_predictions, axis=1)