In [1]:
!pip install transformers > /dev/null
!pip install datasets > /dev/null

In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import numpy as np
import pandas as pd
from datasets import load_dataset, load_metric, load_dataset
from sklearn.model_selection import train_test_split
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding
from transformers import EarlyStoppingCallback
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()

In [4]:
# Read the balanced ASTD file (two columns: text, polarity) and print a
# structural summary of the resulting frame.
astd_csv_path = "data/astd-b/ASTD-balanced-not-linked.csv"
df = pd.read_csv(astd_csv_path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1330 entries, 0 to 1329
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   text      1330 non-null   object
 1   polarity  1330 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 20.9+ KB


In [5]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,text,polarity
0,اهنئ الدكتور احمد جمال الدين القيادي بحزب مصر ...,1
1,امير عيد هو اللي فعلا يتقال عليه ستريكر صريح ك...,1
2,الصداقه تزرع الحياه ازهارا مي زياده,1
3,دضياء رشوان ان الدكتور عبد المنعم ابو الفتوح ا...,1
4,الى زملائي المحامين الراغبين في الانضمام لمباد...,1


In [6]:
# Display the frame itself; the rendered output elides the middle rows.
df

Unnamed: 0,text,polarity
0,اهنئ الدكتور احمد جمال الدين القيادي بحزب مصر ...,1
1,امير عيد هو اللي فعلا يتقال عليه ستريكر صريح ك...,1
2,الصداقه تزرع الحياه ازهارا مي زياده,1
3,دضياء رشوان ان الدكتور عبد المنعم ابو الفتوح ا...,1
4,الى زملائي المحامين الراغبين في الانضمام لمباد...,1
...,...,...
1325,العب بولنج ببلاش فقط بمولد سيدك السيسي جانب من...,-1
1326,قال الرئيس الفلسطيني محمود عباس ان ما حدث ل ال...,-1
1327,عندما يكون طموح انسان ان يبوس البياده او يجلس ...,-1
1328,بالفيديو توفيق عكاشه هيفا وليه ناشفه,-1


In [7]:
# Rename the target column to 'label', the name every later cell uses.
# Rebinding to the returned frame (instead of inplace=True) avoids mutating the
# frame object that earlier cells displayed and keeps the step chainable.
df = df.rename(columns={'polarity': 'label'})

In [8]:
# Share of each label value, expressed as a fraction of all rows.
label_share = df['label'].value_counts(normalize=True)
label_share

 1    0.5
-1    0.5
Name: label, dtype: float64

In [9]:
def change_label(label):
    """Map a polarity value to a binary class id.

    -1 becomes 0 and 1 stays 1. A value of 0 is returned unchanged so that
    applying the function to an already-converted column is a no-op; the
    previous catch-all ``else`` turned every 0 into 1, which collapsed both
    classes into 1 if the cell was executed twice.

    Args:
        label: a polarity value (-1 or 1) or an already-converted class id
            (0 or 1).

    Returns:
        int: 0 or 1.

    Raises:
        ValueError: if ``label`` is anything else (including NaN), instead of
            silently labelling it as class 1.
    """
    if label == -1 or label == 0:
        return 0
    if label == 1:
        return 1
    raise ValueError(f"unexpected polarity value: {label!r}")

# Re-encode every value of the label column through change_label, then preview
# the first rows of the updated frame.
encoded_labels = df['label'].apply(change_label)
df['label'] = encoded_labels
df.head(n=5)

Unnamed: 0,text,label
0,اهنئ الدكتور احمد جمال الدين القيادي بحزب مصر ...,1
1,امير عيد هو اللي فعلا يتقال عليه ستريكر صريح ك...,1
2,الصداقه تزرع الحياه ازهارا مي زياده,1
3,دضياء رشوان ان الدكتور عبد المنعم ابو الفتوح ا...,1
4,الى زملائي المحامين الراغبين في الانضمام لمباد...,1


In [10]:
# Distinct values left in the label column after re-encoding.
distinct_labels = df['label'].unique()
distinct_labels

array([1, 0])

In [11]:
X, y = df['text'], df['label']

# Both splits share the same hold-out fraction and seed.
split_options = dict(test_size=0.2, random_state=43)

# First split: hold out the test set, stratified on the label.
train_tmp_X, test_X, train_tmp_y, test_y = train_test_split(
    X, y, stratify=y, **split_options
)

# Second split: carve the validation set out of the remaining training data,
# again stratified on the label.
train_X, val_X, train_y, val_y = train_test_split(
    train_tmp_X, train_tmp_y, stratify=train_tmp_y, **split_options
)

In [12]:
# Number of texts in the train / validation / test splits.
tuple(map(len, (train_X, val_X, test_X)))

(851, 213, 266)

In [13]:
# Number of labels in the train / validation / test splits (should match the texts).
tuple(map(len, (train_y, val_y, test_y)))

(851, 213, 266)

In [14]:
# Pair each split's texts with its labels in a two-column frame.
df_train = pd.DataFrame({'text': train_X, 'label': train_y})
df_val = pd.DataFrame({'text': val_X, 'label': val_y})
df_test = pd.DataFrame({'text': test_X, 'label': test_y})

# Write the three frames as UTF-8 CSV files without the pandas index,
# in train -> val -> test order.
split_outputs = (
    (df_train, "data/astd-b/train.csv"),
    (df_val, "data/astd-b/val.csv"),
    (df_test, "data/astd-b/test.csv"),
)
for split_frame, csv_path in split_outputs:
    split_frame.to_csv(csv_path, encoding='utf-8', index=False)

In [15]:
# (rows, columns) of the held-out test frame.
n_test_rows, n_test_cols = df_test.shape
(n_test_rows, n_test_cols)

(266, 2)

In [16]:
# Share of each class in the test split; stratification should keep it balanced.
test_label_share = df_test['label'].value_counts(normalize=True)
test_label_share

0    0.5
1    0.5
Name: label, dtype: float64

In [17]:
# Reload the three CSV files written above as one DatasetDict keyed by split name.
split_files = {
    'train': 'data/astd-b/train.csv',
    'val': 'data/astd-b/val.csv',
    'test': 'data/astd-b/test.csv',
}
ds = load_dataset('csv', data_files=split_files)
ds

Using custom data configuration default-03f00e2de2a92740


Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-03f00e2de2a92740/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e...


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-03f00e2de2a92740/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 851
    })
    val: Dataset({
        features: ['text', 'label'],
        num_rows: 213
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 266
    })
})

# Testing the fine-tuned model

In [18]:
# Hub identifier of the checkpoint under test; the model and its tokenizer are
# both loaded from it.
model_path = "mofawzy/BERT-ASTD"

# Sequence-classification model configured for two output labels.
model_testing = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels=2,
)
tokenizer_testing = AutoTokenizer.from_pretrained(model_path)

Downloading:   0%|          | 0.00/788 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.25G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/381 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/326k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/818k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [20]:
def tokenize(examples):
    """Tokenize the ``text`` field of ``examples`` with ``tokenizer_testing``.

    Truncation is enabled with a maximum length of 140; no padding is
    requested here. The tokenizer's output is returned unchanged.
    """
    return tokenizer_testing(examples['text'], max_length=140, truncation=True)

# Run the tokenizer over every split, feeding it batches of rows at a time.
tokenized_ds = ds.map(function=tokenize, batched=True)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [21]:
# Wrap the loaded model and tokenizer in a Trainer with no extra arguments;
# it is used below only to run prediction on the test split.
test_trainer = Trainer(
    tokenizer=tokenizer_testing,
    model=model_testing,
)

In [22]:
# Keep only the model outputs for the test split. `predict` returns a
# (predictions, label_ids, metrics) named tuple; reading `.predictions`
# avoids unpacking the unused fields into `_`, which in IPython would
# overwrite the interactive "last output" variable for the rest of the session.
raw_pred = test_trainer.predict(tokenized_ds['test']).predictions

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Prediction *****
  Num examples = 266
  Batch size = 8


In [23]:
# Collapse each row of raw scores to the index of its largest entry,
# i.e. the predicted class id.
prediction_scores = raw_pred
y_pred = np.argmax(prediction_scores, axis=1)

In [24]:
# Number of predictions and the type of their container.
(len(y_pred), type(y_pred))

(266, numpy.ndarray)

In [25]:
# Number of reference labels and the type of their container; the count must
# match the number of predictions.
(len(test_y), type(test_y))

(266, pandas.core.series.Series)

In [26]:
# Per-class precision / recall / F1 of the predictions against the held-out
# labels, printed with four decimal places.
report_text = classification_report(test_y, y_pred, digits=4)
print(report_text)

              precision    recall  f1-score   support

           0     0.9328    0.9398    0.9363       133
           1     0.9394    0.9323    0.9358       133

    accuracy                         0.9361       266
   macro avg     0.9361    0.9361    0.9361       266
weighted avg     0.9361    0.9361    0.9361       266

