In [1]:
from google.colab import drive
# Attach Google Drive so the dataset under /content/drive is readable.
drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [21]:
# No standalone `bert` install: nothing in this notebook imports a `bert`
# package; the BERT tokenizer and models are all loaded through `transformers`.

In [None]:
!pip install transformers

In [2]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import transformers
from transformers import AutoModel, BertTokenizerFast

In [None]:
from transformers import AutoTokenizer, BertModel 
# Both the tokenizer and the bare encoder come from the same Hub checkpoint.
checkpoint = 'HooshvareLab/bert-fa-zwnj-base'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
bert = BertModel.from_pretrained(checkpoint)

In [4]:
# Read the tab-separated comments file; rows that cannot be parsed are skipped.
df = pd.read_csv(
    '/content/drive/MyDrive/Quize/snappfood.csv',
    sep='\t',
    on_bad_lines='skip',
)
# Keep only the columns whose header does not start with "Unnamed".
unnamed_mask = df.columns.str.contains('^Unnamed')
df = df.loc[:, ~unnamed_mask]
df.head()

Unnamed: 0,comment,label,label_id
0,واقعا حیف وقت که بنویسم سرویس دهیتون شده افتضاح,SAD,1.0
1,قرار بود ۱ ساعته برسه ولی نیم ساعت زودتر از مو...,HAPPY,0.0
2,قیمت این مدل اصلا با کیفیتش سازگاری نداره، فقط...,SAD,1.0
3,عالللی بود همه چه درست و به اندازه و کیفیت خوب...,HAPPY,0.0
4,شیرینی وانیلی فقط یک مدل بود.,HAPPY,0.0


In [5]:
# Remove every row that has a missing value in any column.
df = df.dropna(axis=0, how='any')

In [6]:
# The labels are read as floats (1.0 / 0.0); store them as integers.
df['label_id'] = df['label_id'].astype(int)

In [7]:
# Preview the first rows after cleaning.
df.head(n=5)

Unnamed: 0,comment,label,label_id
0,واقعا حیف وقت که بنویسم سرویس دهیتون شده افتضاح,SAD,1
1,قرار بود ۱ ساعته برسه ولی نیم ساعت زودتر از مو...,HAPPY,0
2,قیمت این مدل اصلا با کیفیتش سازگاری نداره، فقط...,SAD,1
3,عالللی بود همه چه درست و به اندازه و کیفیت خوب...,HAPPY,0
4,شیرینی وانیلی فقط یک مدل بود.,HAPPY,0


In [8]:
# Features are the raw comment texts, targets are the integer label ids.
X, Y = df['comment'], df['label_id']
# Hold out 33% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    random_state=42,
    test_size=0.33,
)

In [9]:
# Turn each pandas Series from the split into a plain Python list.
X_train, y_train, X_test, y_test = (
    part.tolist() for part in (X_train, y_train, X_test, y_test)
)

In [10]:
# Same tokenizer settings for both splits: truncate at 70 tokens and pad.
encode_kwargs = dict(truncation=True, padding=True, max_length=70)
train_encodings = tokenizer(X_train, **encode_kwargs)
valid_encodings = tokenizer(X_test, **encode_kwargs)

In [11]:
class DATALOADER(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenizer encodings with their labels.

    Args:
        encodings: Mapping from field name to a per-example sequence of
            values; anything exposing ``.items()`` whose values can be
            indexed by example position.
        labels: Sequence of labels, one per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Build the tensor dict for example ``idx``.

        Every encoding field becomes a tensor, and the label is added under
        ``"labels"`` as a one-element tensor.
        """
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample["labels"] = torch.tensor([self.labels[idx]])
        return sample

# Wrap the tokenized splits and their labels as torch datasets.
train_dataset = DATALOADER(encodings=train_encodings, labels=y_train)
valid_dataset = DATALOADER(encodings=valid_encodings, labels=y_test)

In [None]:
from transformers import AutoModelForSequenceClassification

# Use the GPU when one is available; otherwise fall back to the CPU so this
# cell does not crash on a runtime without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Two output classes, matching the binary label_id column (0 / 1).
model = AutoModelForSequenceClassification.from_pretrained("HooshvareLab/bert-fa-zwnj-base", num_labels=2).to(device)

In [13]:
from sklearn.metrics import accuracy_score

def compute_metrics(pred):
    """Compute accuracy for one evaluation pass.

    Args:
        pred: Object exposing ``predictions`` (per-class scores; the argmax
            over the last axis is taken as the predicted class) and
            ``label_ids`` (the reference labels).

    Returns:
        dict with a single ``'accuracy'`` entry.
    """
    predicted_classes = pred.predictions.argmax(-1)
    return {'accuracy': accuracy_score(pred.label_ids, predicted_classes)}

In [35]:
!pip uninstall -y transformers accelerate
!pip install transformers accelerate

Installing collected packages: transformers, accelerate
Successfully installed accelerate-0.19.0 transformers-4.29.2


In [14]:


from transformers import TrainingArguments
training_args = TrainingArguments(
    # Output locations for checkpoints and logs.
    output_dir='./results',
    logging_dir='./logs',
    # Optimisation schedule: a single pass over the training set.
    num_train_epochs=1,
    warmup_steps=700,
    weight_decay=0.01,
    # Batch sizes per device for training and evaluation.
    per_device_train_batch_size=8,
    per_device_eval_batch_size=20,
    # Log and checkpoint every 400 steps, evaluating on a step schedule, and
    # reload the best checkpoint once training finishes.
    evaluation_strategy="steps",
    logging_steps=400,
    save_steps=400,
    load_best_model_at_end=True,
)
     


In [15]:
from transformers import  Trainer
# Bundle the model, its training configuration, both datasets and the
# accuracy callback into a single Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
)

In [16]:
# Fine-tune the model; the returned summary is shown as the cell output.
train_output = trainer.train()
train_output



Step,Training Loss,Validation Loss,Accuracy
400,0.5148,0.391917,0.831698
800,0.4424,0.456786,0.828252
1200,0.467,0.450934,0.842339
1600,0.4212,0.407582,0.833093
2000,0.4308,0.4455,0.834969
2400,0.4285,0.42732,0.842601
2800,0.4243,0.41602,0.844346
3200,0.3981,0.413343,0.846788
3600,0.4263,0.440465,0.836801
4000,0.4162,0.385873,0.843212


TrainOutput(global_step=5819, training_loss=0.41799698623313714, metrics={'train_runtime': 2147.0184, 'train_samples_per_second': 21.682, 'train_steps_per_second': 2.71, 'total_flos': 1674542561844600.0, 'train_loss': 0.41799698623313714, 'epoch': 1.0})

In [17]:
# Score the trained model on the evaluation dataset given to the Trainer.
eval_metrics = trainer.evaluate()
eval_metrics

{'eval_loss': 0.3563826382160187,
 'eval_accuracy': 0.8573858432552662,
 'eval_runtime': 92.6363,
 'eval_samples_per_second': 247.516,
 'eval_steps_per_second': 12.382,
 'epoch': 1.0}

In [21]:
def get_prediction(text):
    """Predict the sentiment label of one or more comments.

    Args:
        text: A single comment (``str``) or a list/tuple of comments.

    Returns:
        ``'HAPPY'`` or ``'SAD'`` when ``text`` is a single string; otherwise a
        list with one such label per input comment, in input order.
    """
    # Position in this list equals the label id used in the data
    # (0 -> HAPPY, 1 -> SAD).
    target_names = ['HAPPY', 'SAD']
    # Send the inputs to whatever device the model lives on rather than
    # assuming a CUDA device exists.
    inputs = tokenizer(
        text, padding=True, truncation=True, max_length=70, return_tensors="pt"
    ).to(model.device)
    model.eval()  # inference mode: dropout off
    with torch.no_grad():  # no autograd graph is needed for prediction
        logits = model(**inputs)[0]
    # Take the argmax per row; a flattened argmax over the whole batch would
    # give a wrong (possibly out-of-range) index for multi-comment input.
    # Softmax is monotonic, so the argmax of the logits is the same class.
    class_ids = logits.argmax(dim=1).tolist()
    labels = [target_names[i] for i in class_ids]
    return labels[0] if isinstance(text, str) else labels

In [22]:
# Sanity check on a negative review.
get_prediction(text='غذای فوق العاده بد طعمی بود')

'SAD'

In [23]:
# Sanity check on a positive review.
get_prediction(text='همه چیز به موقع و با کیفیت بود')

'HAPPY'