In [2]:
import torch  
import torch.nn as nn  
import torch.optim as optim  
import math  
  
# Setting up the device for GPU usage  
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  
  
# Transformer model parameters  
d_model = 512  # The number of expected features in the encoder/decoder inputs  
nhead = 8  # The number of heads in the multiheadattention models  
num_encoder_layers = 3  # The number of sub-encoder-layers in the encoder  
num_decoder_layers = 3  # The number of sub-decoder-layers in the decoder  
dim_feedforward = 2048  # The dimension of the feedforward network model  
dropout = 0.1  # The dropout value  
  
# Sample tokenizers (these should be replaced with the actual tokenizers for your languages)  
src_language = 'en'  
tgt_language = 'fr'  
  
# Replace these with the actual vocabulary sizes for your source and target languages  
src_vocab_size = 10000  
tgt_vocab_size = 10000  
  
class TransformerModel(nn.Module):  
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model, nhead, num_encoder_layers, num_decoder_layers, dim_feedforward, dropout):  
        super(TransformerModel, self).__init__()  
        self.transformer = nn.Transformer(d_model=d_model, nhead=nhead, num_encoder_layers=num_encoder_layers,  
                                          num_decoder_layers=num_decoder_layers, dim_feedforward=dim_feedforward, dropout=dropout)  
        self.src_tok_emb = nn.Embedding(src_vocab_size, d_model)  
        self.tgt_tok_emb = nn.Embedding(tgt_vocab_size, d_model)  
        self.positional_encoding = PositionalEncoding(d_model, dropout)  
        self.generator = nn.Linear(d_model, tgt_vocab_size)  
  
    def forward(self, src, tgt, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, memory_key_padding_mask):  
        src_emb = self.positional_encoding(self.src_tok_emb(src))  
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(tgt))  
        outs = self.transformer(src_emb, tgt_emb, src_mask, tgt_mask, None, src_padding_mask, tgt_padding_mask, memory_key_padding_mask)  
        return self.generator(outs)  
  
class PositionalEncoding(nn.Module):  
    def __init__(self, d_model, dropout=0.1, max_len=5000):  
        super(PositionalEncoding, self).__init__()  
        self.dropout = nn.Dropout(p=dropout)  
  
        position = torch.arange(max_len).unsqueeze(1)  
        div_term = torch.exp(torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model))  
        pe = torch.zeros(max_len, 1, d_model)  
        pe[:, 0, 0::2] = torch.sin(position * div_term)  
        pe[:, 0, 1::2] = torch.cos(position * div_term)  
        self.register_buffer('pe', pe)  
  
    def forward(self, x):  
        x = x + self.pe[:x.size(0)]  
        return self.dropout(x)  
  
# Instantiate the model  
transformer_model = TransformerModel(src_vocab_size, tgt_vocab_size, d_model, nhead, num_encoder_layers, num_decoder_layers, dim_feedforward, dropout).to(device)  
  
# Example input batch  
src = torch.rand((10, 32)).long().to(device)  # (source sequence length, batch size)  
tgt = torch.rand((20, 32)).long().to(device)  # (target sequence length, batch size)  
  
# Masks and padding  
src_mask = transformer_model.transformer.generate_square_subsequent_mask(src.size(0)).to(device)  
tgt_mask = transformer_model.transformer.generate_square_subsequent_mask(tgt.size(0)).to(device)  
src_padding_mask = (src == 0).transpose(0, 1).to(device)  
tgt_padding_mask = (tgt == 0).transpose(0, 1).to(device)  
memory_key_padding_mask = src_padding_mask  
  
# Forward pass  
outputs = transformer_model(src, tgt, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, memory_key_padding_mask)  
  
print(outputs.shape)  # (target sequence length, batch size, target vocabulary size)  




torch.Size([20, 32, 10000])


In [3]:
%pip install transformers  


Collecting transformersNote: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 24.0
[notice] To update, run: c:\Users\Legion 5 Pro 007\Documents\Github\MachineTranslationUsingTransformers\MachineTranslationUsingTransformers\.venv\Scripts\python.exe -m pip install --upgrade pip



  Obtaining dependency information for transformers from https://files.pythonhosted.org/packages/e2/52/02271ef16713abea41bab736dfc2dbee75e5e3512cf7441e233976211ba5/transformers-4.39.2-py3-none-any.whl.metadata
  Downloading transformers-4.39.2-py3-none-any.whl.metadata (134 kB)
     ---------------------------------------- 0.0/134.8 kB ? eta -:--:--
     --- ------------------------------------ 10.2/134.8 kB ? eta -:--:--
     ----------------------- --------------- 81.9/134.8 kB 1.2 MB/s eta 0:00:01
     -------------------------------------- 134.8/134.8 kB 1.6 MB/s eta 0:00:00
Collecting huggingface-hub<1.0,>=0.19.3 (from transformers)
  Obtaining dependency information for huggingface-hub<1.0,>=0.19.3 from https://files.pythonhosted.org/packages/05/c0/779afbad8e75565c09ffa24a88b5dd7e293c92b74eb09df6435fc58ac986/huggingface_hub-0.22.2-py3-none-any.whl.metadata
  Downloading huggingface_hub-0.22.2-py3-none-any.whl.metadata (12 kB)
Collecting tokenizers<0.19,>=0.14 (from transformers)

In [6]:
from transformers import pipeline  
  
# Initialize the translation pipeline  
translator = pipeline("translation_en_to_fr")  
  
# Function to translate English to French  
def translate_to_french(text):  
    translation = translator(text)  
    return translation[0]['translation_text']  
  
# User input  
english_text = input("Enter text in English to translate to French: ")  
# english_text = "how are you"


# Translate and print the result  
french_translation = translate_to_french(english_text)  
print(f"french translation: {french_translation}")  


No model was supplied, defaulted to google-t5/t5-base and revision 686f1db (https://huggingface.co/google-t5/t5-base).
Using a pipeline without specifying a model name and revision in production is not recommended.
For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on google-t5/t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


french translation: Comment le temps est-il aujourd'hui


In [7]:
%pip install transformers sentencepiece  


Collecting sentencepiece
  Obtaining dependency information for sentencepiece from https://files.pythonhosted.org/packages/a2/f6/587c62fd21fc988555b85351f50bbde43a51524caafd63bc69240ded14fd/sentencepiece-0.2.0-cp311-cp311-win_amd64.whl.metadata
  Downloading sentencepiece-0.2.0-cp311-cp311-win_amd64.whl.metadata (8.3 kB)
Downloading sentencepiece-0.2.0-cp311-cp311-win_amd64.whl (991 kB)
   ---------------------------------------- 0.0/991.5 kB ? eta -:--:--
   - -------------------------------------- 30.7/991.5 kB 1.3 MB/s eta 0:00:01
   ---------- ----------------------------- 256.0/991.5 kB 3.9 MB/s eta 0:00:01
   ------------------------------- -------- 778.2/991.5 kB 7.0 MB/s eta 0:00:01
   ---------------------------------------- 991.5/991.5 kB 7.9 MB/s eta 0:00:00
Installing collected packages: sentencepiece
Successfully installed sentencepiece-0.2.0



[notice] A new release of pip is available: 23.2.1 -> 24.0
[notice] To update, run: c:\Users\Legion 5 Pro 007\Documents\Github\MachineTranslationUsingTransformers\MachineTranslationUsingTransformers\.venv\Scripts\python.exe -m pip install --upgrade pip


In [2]:
from transformers import MarianMTModel, MarianTokenizer  
  
def translate_text(text, target_language):  
    # Define the model repository path  
    model_name = f'Helsinki-NLP/opus-mt-en-{target_language}'  
      
    # Load the tokenizer and model  
    tokenizer = MarianTokenizer.from_pretrained(model_name)  
    model = MarianMTModel.from_pretrained(model_name)  
  
    # Tokenize the text  
    translated = model.generate(**tokenizer(text, return_tensors="pt", padding=True))  
  
    # Decode the tokens to string  
    translation = tokenizer.decode(translated[0], skip_special_tokens=True)  
  
    return translation  
  
# Example usage:  
if __name__ == "__main__":  
    # User input  
    english_text = input("Enter English text to translate: ")  
    target_language = input("Enter target language code (e.g., 'fr' for French): ")  
  
    # Translate the text  
    translated_text = translate_text(english_text, target_language)  
  
    # Output the translation  
    print(f"Translated text ({target_language}): {translated_text}")  

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


source.spm:   0%|          | 0.00/768k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/797k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.27M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.33k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/298M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

Translated text (de): wie ist dein Name?


In [3]:
#Summerization

from transformers import pipeline  
  
# Load the summarization pipeline  
summarizer = pipeline("summarization")  
  
def get_summary(text):  
    # Use the model to generate a summary  
    summary = summarizer(text, max_length=130, min_length=30, do_sample=False)  
    return summary[0]['summary_text']  
  
# Get input from the user  
user_input = input("Please type the English text you want to summarize:\n")  
  
# Check if the text is too short to summarize  
if len(user_input.split()) < 56:  
    print("This text is too short to summarize, please provide more content.")  
else:  
    # Call the get_summary function  
    summary = get_summary(user_input)  
    print("\nSummary:\n", summary)  

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


pytorch_model.bin:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]


Summary:
  The sanitize function can be extended to perform additional sanitization tasks, depending on what kind of input you are expecting . For example, escaping HTML is necessary when inserting data into an HTML template . For database queries, using parameterized queries or the ORM's built-in methods is generally sufficient to prevent injection attacks .
