In [1]:
!pip install transformers > /dev/null
!pip install datasets > /dev/null

In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import numpy as np
import pandas as pd
from datasets import load_dataset, load_metric, load_dataset
from sklearn.model_selection import train_test_split
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding
from transformers import EarlyStoppingCallback
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()

In [3]:
# Read the raw AJGT sheet into a DataFrame and print its column/dtype summary.
df = pd.read_csv(filepath_or_buffer="data/ajgt/AJGT.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1800 entries, 0 to 1799
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   ID          1800 non-null   int64  
 1   Feed        1799 non-null   object 
 2   Sentiment   1800 non-null   object 
 3   Unnamed: 3  0 non-null      float64
 4   Unnamed: 4  0 non-null      float64
dtypes: float64(2), int64(1), object(2)
memory usage: 70.4+ KB


In [4]:
# Drop the empty "Unnamed: 3" column by name rather than by position: after one
# run position 3 holds a different column, so a positional drop would remove
# the wrong data on re-execution. errors="ignore" makes the cell a no-op then.
df.drop(columns=["Unnamed: 3"], errors="ignore", inplace=True)

In [5]:
# Drop the empty "Unnamed: 4" column by name rather than by position, so that
# re-running this cell neither raises (no column left at position 3) nor
# removes an unintended column. errors="ignore" makes repeated runs a no-op.
df.drop(columns=["Unnamed: 4"], errors="ignore", inplace=True)
df.head()

Unnamed: 0,ID,Feed,Sentiment
0,1,اربد فيها جامعات اكثر من عمان ... وفيها قد عم...,Positive
1,2,الحلو انكم بتحكوا على اساس انو الاردن ما فيه ...,Negative
2,3,كله رائع بجد ربنا يكرمك,Positive
3,4,لسانك قذر يا قمامه,Negative
4,5,,Negative


In [6]:
# Remove the row-identifier column by name. A positional drop of column 0 is
# not idempotent: on a second run it would silently delete the text column.
df.drop(columns=["ID"], errors="ignore", inplace=True)
df.head()

Unnamed: 0,Feed,Sentiment
0,اربد فيها جامعات اكثر من عمان ... وفيها قد عم...,Positive
1,الحلو انكم بتحكوا على اساس انو الاردن ما فيه ...,Negative
2,كله رائع بجد ربنا يكرمك,Positive
3,لسانك قذر يا قمامه,Negative
4,,Negative


In [7]:
# Count missing values per column (sum of the boolean null mask down the rows).
df.isna().sum(axis=0)

Feed         1
Sentiment    0
dtype: int64

In [8]:
# Discard every row containing at least one missing value, then re-count the
# nulls per column to confirm none remain.
df.dropna(axis=0, how='any', inplace=True)
df.isna().sum()

Feed         0
Sentiment    0
dtype: int64

In [9]:
# Rename the columns to 'text' / 'label' and preview the result.
df.rename(
    columns={
        'Feed': 'text',
        'Sentiment': 'label',
    },
    inplace=True,
)
df.head()

Unnamed: 0,text,label
0,اربد فيها جامعات اكثر من عمان ... وفيها قد عم...,Positive
1,الحلو انكم بتحكوا على اساس انو الاردن ما فيه ...,Negative
2,كله رائع بجد ربنا يكرمك,Positive
3,لسانك قذر يا قمامه,Negative
5,ابشرك فيه تحسن ولله الحمد باذن الله يرجع قريبا,Positive


In [10]:
# Relative frequency of each sentiment class (fractions rather than counts).
df.loc[:, 'label'].value_counts(normalize=True)

Positive    0.500278
Negative    0.499722
Name: label, dtype: float64

In [11]:
def change_label(label):
    """Encode a sentiment label as an integer class id.

    Parameters
    ----------
    label : str or int
        Either the string 'Positive' / 'Negative', or an already-encoded
        1 / 0.

    Returns
    -------
    int
        1 for 'Positive' (or 1), 0 for 'Negative' (or 0).

    Raises
    ------
    ValueError
        If ``label`` is none of the values above. Unknown values are rejected
        instead of being silently counted as the negative class.
    """
    # 0 and 1 map to themselves so that re-running the encoding step on an
    # already-encoded column is a no-op instead of collapsing every row to 0.
    encoding = {'Positive': 1, 'Negative': 0, 1: 1, 0: 0}
    try:
        return encoding[label]
    except (KeyError, TypeError):
        # TypeError covers unhashable inputs such as lists.
        raise ValueError(f"Unexpected sentiment label: {label!r}") from None

# Replace each sentiment in the 'label' column with the value returned by
# change_label, then preview the frame.
df['label'] = df['label'].apply(func=change_label)
df.head()

Unnamed: 0,text,label
0,اربد فيها جامعات اكثر من عمان ... وفيها قد عم...,1
1,الحلو انكم بتحكوا على اساس انو الاردن ما فيه ...,0
2,كله رائع بجد ربنا يكرمك,1
3,لسانك قذر يا قمامه,0
5,ابشرك فيه تحسن ولله الحمد باذن الله يرجع قريبا,1


In [12]:
# List the distinct values present in the label column.
df.loc[:, 'label'].unique()

array([1, 0])

In [13]:
# Relative frequency of each label value, most frequent first.
df['label'].value_counts(normalize=True, sort=True, ascending=False)

1    0.500278
0    0.499722
Name: label, dtype: float64

In [14]:
# Inputs are the 'text' column; targets are the 'label' column.
X, y = df['text'], df['label']

# First split: hold out 10% of all rows as the test set, stratified by label.
train_tmp_X, test_X, train_tmp_y, test_y = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=43,
    stratify=y,
)

# Second split: take 10% of the remaining rows as the validation set,
# again stratified by label and using the same seed.
train_X, val_X, train_y, val_y = train_test_split(
    train_tmp_X,
    train_tmp_y,
    test_size=0.1,
    random_state=43,
    stratify=train_tmp_y,
)

In [15]:
# Number of texts in the train / validation / test splits, in that order.
tuple(map(len, (train_X, val_X, test_X)))

(1457, 162, 180)

In [16]:
# Number of labels in the train / validation / test splits, in that order.
tuple(map(len, (train_y, val_y, test_y)))

(1457, 162, 180)

In [17]:
# Pair each split's texts with its labels.
frame_train = {'text': train_X, 'label': train_y}
frame_val = {'text': val_X, 'label': val_y}
frame_test = {'text': test_X, 'label': test_y}

# Materialise each split as a two-column DataFrame.
df_train = pd.DataFrame(frame_train)
df_val = pd.DataFrame(frame_val)
df_test = pd.DataFrame(frame_test)

# Persist the splits as UTF-8 CSV files, omitting the index column.
df_train.to_csv("data/ajgt/train.csv", index=False, encoding='utf-8')
df_val.to_csv("data/ajgt/val.csv", index=False, encoding='utf-8')
df_test.to_csv("data/ajgt/test.csv", index=False, encoding='utf-8')

In [18]:
# (rows, columns) of the test split.
(len(df_test.index), len(df_test.columns))

(180, 2)

In [19]:
# Relative frequency of each label value within the test split.
df_test.loc[:, 'label'].value_counts(normalize=True)

1    0.5
0    0.5
Name: label, dtype: float64

In [20]:
# Load the three CSV files with the 'csv' builder, one named split per file,
# and display the resulting object.
ds = load_dataset(
    'csv',
    data_files={
        'train': 'data/ajgt/train.csv',
        'val': 'data/ajgt/val.csv',
        'test': 'data/ajgt/test.csv',
    },
)
ds

Using custom data configuration default-a03513aee0f30279


Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-a03513aee0f30279/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e...


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-a03513aee0f30279/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 1457
    })
    val: Dataset({
        features: ['text', 'label'],
        num_rows: 162
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 180
    })
})

In [21]:
# Name of the pretrained checkpoint to evaluate.
model_name = "mofawzy/bert-ajgt"

# Sequence-classification model configured for two output labels.
model_testing = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
)

# Tokenizer loaded from the same checkpoint name as the model.
tokenizer_testing = AutoTokenizer.from_pretrained(model_name)


Downloading:   0%|          | 0.00/749 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/516M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/443 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/733k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.68M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [22]:
def tokenize(examples):
    """Tokenize the 'text' field of ``examples`` with ``tokenizer_testing``.

    Truncation is enabled with a maximum length of 128; no padding argument
    is passed. The tokenizer's output is returned as-is.
    """
    return tokenizer_testing(examples['text'], max_length=128, truncation=True)

# Apply `tokenize` to every split in batched mode.
tokenized_ds = ds.map(function=tokenize, batched=True)

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [23]:
# Display the tokenized dataset to inspect the features of each split.
tokenized_ds

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1457
    })
    val: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 162
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 180
    })
})

In [26]:
# Wrap the pretrained model in a Trainer so it can run prediction on the test split.
# NOTE(review): the checkpoint name suggests it may already have been
# fine-tuned on AJGT; if so, this random split could overlap its training
# data -- confirm before reporting these scores as held-out performance.
test_trainer = Trainer(model=model_testing, tokenizer=tokenizer_testing)

# Read the raw model outputs through the named field instead of unpacking
# the result into `_`, which would shadow IPython's last-output variable.
raw_pred = test_trainer.predict(tokenized_ds['test']).predictions
y_pred = np.argmax(raw_pred, axis=1)

# Take the ground truth from the same dataset that was fed to the model, so
# labels and predictions are aligned row-for-row even if the in-memory split
# and the reloaded CSV ever diverge.
y_true = list(tokenized_ds['test']['label'])
print(classification_report(y_true, y_pred, digits=4))

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Prediction *****
  Num examples = 180
  Batch size = 8


              precision    recall  f1-score   support

           0     0.9462    0.9778    0.9617        90
           1     0.9770    0.9444    0.9605        90

    accuracy                         0.9611       180
   macro avg     0.9616    0.9611    0.9611       180
weighted avg     0.9616    0.9611    0.9611       180

