In [None]:
!pip install simpletransformers transformers
!pip install transformers --upgrade
!pip install datasets
!pip install Pillow

In [2]:
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense # this will help us add layers to the neural network 
from sklearn.metrics import accuracy_score
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

from datasets import load_dataset
from datasets.utils.file_utils import get_datasets_user_agent
from concurrent.futures import ThreadPoolExecutor
from functools import partial
import numpy as np
import random
import requests
import PIL
from PIL import Image
import io
from io import BytesIO
import urllib
import urllib.request
import os
from simpletransformers.seq2seq import Seq2SeqModel
import pandas as pd

In [3]:
#import the labels and captions into the dataframe
label_file ='/content/urdu_labels.txt'
caption_file = '/content/urdu_captions.txt'

captions_df = pd.read_csv(caption_file, sep='\t', header=None, names=['caption'])
labels_df = pd.read_csv(label_file, sep='\t', header=None, names=['label'])

df = captions_df.join(labels_df)

In [13]:
#split the data into training and validation
from sklearn.model_selection import train_test_split
train_df, val_df = train_test_split(df, test_size=0.2)


# Preprocess the data
train_data = {"input_text": [], "target_text": []}
for i in range(len(df)):
    train_data["input_text"].append(df.iloc[i]['label'])
    train_data["target_text"].append(df.iloc[i]['caption'])

val_data = {"input_text": [], "target_text": []}
for i in range(len(val_df)):
    val_data["input_text"].append(val_df.iloc[i]['label'])
    val_data["target_text"].append(val_df.iloc[i]['caption'])

#initialize the model
model = Seq2SeqModel(
    encoder_decoder_type="bart",
    encoder_decoder_name="facebook/bart-base",
    use_cuda=True,
)

In [14]:
#training BART architecture with the given urdu labels and captions
model.train_model(train_data=train_data, eval_data=val_data)

  0%|          | 0/9511 [00:00<?, ?it/s]

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/1189 [00:00<?, ?it/s]



(1189, 1.1501820750402942)

In [29]:
from transformers import BartTokenizer, BartForConditionalGeneration

tokenizer = BartTokenizer.from_pretrained("facebook/bart-large")
model = BartForConditionalGeneration.from_pretrained("facebook/bart-large")

urdu_test = train_data['input_text'][500]
input_ids = tokenizer.encode(urdu_test, add_special_tokens=True, return_tensors="pt")

outputs = model.generate(input_ids, num_beams=4, max_length=50, early_stopping=True)
decoded_preds = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(decoded_preds)



Downloading pytorch_model.bin:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

درخت کی چھوٹی پتی سیاہ سپروس بالسم فر


In [7]:
print(train_data[0])

{'input_text': 'کرسمس ٹری کرسمس ڈیکوریشن فونٹ ٹیکسٹ گرافک ڈیزائن کی مثال اندرونی ڈیزائن ٹری کرسمس کی شام زیور فر پلانٹ پائن پائن فیملی گرافکس', 'target_text': 'سیاہ پس منظر پر کرسمس ٹری۔'}
