In [94]:
# ================================
# Standard Library Imports
# ================================
import os
import sys
import transformers
import tensorflow as tf
from transformers import AutoTokenizer
from transformers import TFAutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
from transformers import AdamWeightDecay
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM
import pandas as pd
import numpy as np
from datasets import Dataset

In [103]:
# We used the MarianMT model trained for English-to-French translation.
# "Helsinki-NLP/opus-mt-en-fr" is the model repository name.
model_checkpoint = "Helsinki-NLP/opus-mt-en-fr"
# Load the tokenizer associated with the model checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)



In [104]:
# ================================
# Step 1: Load the dataset
# ================================
df = pd.read_csv("/kaggle/input/en-fr-translation-dataset/en-fr.csv", nrows=100000)
# ================================
# Step 2: Preview the dataset
# ================================
df.head(20)

Unnamed: 0,en,fr
0,Changing Lives | Changing Society | How It Wor...,Il a transformé notre vie | Il a transformé la...
1,Site map,Plan du site
2,Feedback,Rétroaction
3,Credits,Crédits
4,Français,English
5,What is light ?,Qu’est-ce que la lumière?
6,The white light spectrum Codes in the light Th...,La découverte du spectre de la lumière blanche...
7,The sky of the first inhabitants A contemporar...,Le ciel des premiers habitants La vision conte...
8,Cartoon,Bande dessinée
9,Links,Liens


In [105]:
# ================================
# Step 3: Dataset Information
# ================================
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   en      99998 non-null   object
 1   fr      100000 non-null  object
dtypes: object(2)
memory usage: 1.5+ MB


In [106]:
#Check for missing values
df.isnull().sum()

en    2
fr    0
dtype: int64

In [107]:
#Remove the missing datapoints
df = df.dropna()
df.isnull().sum()

en    0
fr    0
dtype: int64

In [108]:
# Convert pandas DataFrame to Dataset
dataset = Dataset.from_pandas(df)
# Convert to "translation" column format
dataset = dataset.map(lambda x: {"translation": {"en": x["en"], "fr": x["fr"]}})

Map:   0%|          | 0/99998 [00:00<?, ? examples/s]

In [109]:
# First split 70:30
split_1 = dataset.train_test_split(test_size=0.30, seed=42)
train_dataset = split_1["train"]
temp_dataset = split_1["test"]
# Split temp into 15:15
split_2 = temp_dataset.train_test_split(test_size=0.50, seed=42)
val_dataset = split_2["train"]
test_dataset = split_2["test"]
# Put into DatasetDict
dataset_dict = {
    "train": train_dataset,
    "validation": val_dataset,
    "test": test_dataset
}

In [110]:
max_input_length = 128
max_target_length = 128

source_lang = "en"
target_lang = "fr"

def preprocess_function_direct(examples):
    inputs = examples["en"]
    targets = examples["fr"]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [111]:
tokenized_dataset = {}
for split in dataset_dict:
    tokenized_dataset[split] = dataset_dict[split].map(
        preprocess_function,
        batched=True,
        remove_columns=dataset_dict[split].column_names
    )

Map:   0%|          | 0/69998 [00:00<?, ? examples/s]



Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

***Loading the pretrained model***

In [112]:
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

All model checkpoint layers were used when initializing TFMarianMTModel.

All the layers of TFMarianMTModel were initialized from the model checkpoint at Helsinki-NLP/opus-mt-en-fr.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMarianMTModel for predictions without further training.


***Stating the Model parameters***

In [113]:
batch_size = 16
learning_rate = 2e-5
weight_decay = 0.01
num_train_epochs = 1

In [114]:
# ================================
# Data Collator Setup
# ================================
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf")
# Generation-specific data collator
generation_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf", pad_to_multiple_of=128)

***Convert Hugging Face Dataset splits into TensorFlow-dataset***

In [115]:
train_dataset = model.prepare_tf_dataset(
    tokenized_dataset["train"],
    batch_size=batch_size,
    shuffle=True,
    collate_fn=data_collator,
)

val_dataset = model.prepare_tf_dataset(
    tokenized_dataset["validation"],
    batch_size=batch_size,
    shuffle=False,
    collate_fn=data_collator,
)

test_dataset = model.prepare_tf_dataset(
    tokenized_dataset["test"],
    batch_size=batch_size,
    shuffle=False,
    collate_fn=data_collator,
)

***Initialize the AdamW optimizer with a given learning rate and weight decay, then compiles the model for training***

In [116]:
optimizer = AdamWeightDecay(learning_rate=learning_rate, weight_decay_rate=weight_decay)
model.compile(optimizer=optimizer)

***Train the model on the training dataset for one epoch while evaluating its performance on the validation dataset.***

In [117]:
#Fit the model
model.fit(train_dataset, validation_data=val_dataset, epochs=1)



<tf_keras.src.callbacks.History at 0x799208c32c50>

In [118]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)



In [120]:
input_text = """Abstract: 
Diabetes Mellitus is a chronic disease that occurs when blood glucose levels rise above normal limits. In recent years, machine learning and deep learning techniques have been applied to predict diabetes and its complications. However, researchers face two main challenges when building type 2 diabetes predictive models. First, there is high variability in techniques used across studies, making it difficult to identify the best approach. Second, there is limited transparency about the features included in the models, which reduces interpretability. 

This systematic review aimed to address these challenges. The review followed PRISMA guidelines, enriched with the methodology from Keele and Durham Universities. A total of 90 studies were included. From each study, we extracted the type of model, complementary techniques, dataset, and reported performance metrics. 

Eighteen different types of models were compared. Tree-based algorithms showed the best performance. Deep Neural Networks were less effective, despite their strength in handling large and unstructured data. Data balancing and feature selection techniques improved efficiency. Models trained on well-prepared datasets achieved near-perfect accuracy."""

out = model.generate(**tokenizer([input_text], return_tensors='np'), max_length=128)
print(out)
with tokenizer.as_target_tokenizer():
    print(tokenizer.decode(out[0], skip_special_tokens=True))

tf.Tensor(
[[59513 11794    78    60 17695    43    38  3144 15310    44    95   742
   1150    19   813     5 44446 10695    16  3777 14998     3   672  1940
    655     2    14     6  5376  8810    11    16  1065    20     6  5376
  16692    91    90 14258    27 36768    19 17695    11   163 13947     3
   2373     2    16  4858   474  6895   203  2433  4115  1766     6   489
  22457    13  3693     5 38964    22 17695     5   542   992 16255     2
     92   167    15    38   768 32087    13  1065  3834    31   363    16
   1674     2    66    44  3865  2650    14     6  7910     5     8  2453
   2378     3 15127     2    92   167    15    38  4805  5098    39  1042
     13  2747     9 19649    31    16  3693     2    66    44  5975     8
   1467    20     6  5089     3  2916  1349     0]], shape=(1, 128), dtype=int32)
Résumé : Le diabète est une maladie chronique qui se produit lorsque le taux de glycémie dépasse les limites normales. Ces dernières années, l'apprentissage automatiqu

In [122]:
#Chatgpt translation
"""Abstract: 
Diabetes is a chronic disease that occurs when blood glucose levels exceed normal limits. 
In recent years, machine learning and deep learning techniques have been applied to predict diabetes and its complications. 
However, researchers face two major challenges when developing type 2 diabetes prediction models. 
First, there is great variability in the techniques used across studies, making it difficult to identify the best approach. 
Second, there is limited transparency regarding the features included in the models, which reduces interpretability."""

'Abstract: \nDiabetes is a chronic disease that occurs when blood glucose levels exceed normal limits. \nIn recent years, machine learning and deep learning techniques have been applied to predict diabetes and its complications. \nHowever, researchers face two major challenges when developing type 2 diabetes prediction models. \nFirst, there is great variability in the techniques used across studies, making it difficult to identify the best approach. \nSecond, there is limited transparency regarding the features included in the models, which reduces interpretability.'