In [None]:
# Shell escape: print the GPU status table before any model is loaded.
!nvidia-smi

In [None]:
import os
# Show the working directory: output files later in this notebook are written with relative paths.
print(os.getcwd())
# Expose only GPU 0 to CUDA. This runs before torch is first imported (next cell).
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [None]:
import torch
# Pick the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

In [None]:
import re
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import BertTokenizerFast
import os
from tqdm import tqdm

# 1. Define the punctuation map: punctuation character -> label name.
# Insertion order matters: it fixes each label's integer id in label_to_id below.
punctuation_map = {
    ',': 'COMMA',
    '.': 'PERIOD',
    '?': 'QUESTION',
    '!': 'EXCLAMATION',
    ';': 'SEMICOLON',
    ':': 'COLON',
    '-': 'HYPHEN',
    # The three non-ASCII marks are written as escapes so the keys stay single
    # characters regardless of how this file is decoded.
    '\u2013': 'EN_DASH',
    '\u2014': 'EM_DASH',
    '(': 'LEFT_PAREN',
    ')': 'RIGHT_PAREN',
    '[': 'LEFT_BRACKET',
    ']': 'RIGHT_BRACKET',
    '{': 'LEFT_BRACE',
    '}': 'RIGHT_BRACE',
    '"': 'DOUBLE_QUOTE',
    "'": 'SINGLE_QUOTE',
    '\u2026': 'ELLIPSIS',
    '/': 'SLASH',
    '\\': 'BACKSLASH',
    '@': 'AT_SYMBOL',
    '#': 'HASH',
    '$': 'DOLLAR',
    '%': 'PERCENT',
    '&': 'AMPERSAND',
    '*': 'ASTERISK',
    '+': 'PLUS',
    '=': 'EQUALS',
    '<': 'LESS_THAN',
    '>': 'GREATER_THAN',
    '|': 'PIPE',
    '^': 'CARET',
    '`': 'BACKTICK',
    '~': 'TILDE'
}

# Automatically create label_list from punctuation_map ("O" = no punctuation, id 0).
label_list = ["O"] + list(punctuation_map.values())
label_to_id = {l: i for i, l in enumerate(label_list)}

print("Label list:", label_list)

In [None]:
# Inverse lookup: label name -> punctuation character.
punctuation_reverse_map = {label: mark for mark, label in punctuation_map.items()}
# The "O" label means "no punctuation", so it maps to the empty string.
punctuation_reverse_map.update({"O": ""})

In [None]:
from datasets import load_dataset
from evaluate import load
# Load the evaluation dataset; later cells read columns from its "test" split.
raw_datasets = load_dataset("thenlpresearcher/test_data_marathi")
# NOTE(review): `metric` is not referenced again in the visible cells (BLEU is re-loaded
# as `bleu` in the metrics cells) — confirm whether it is still needed.
metric = load("sacrebleu")

In [None]:
import torch

def restore_punctuation(text: str, model, tokenizer, label_list, device, punctuation_reverse_map) -> str:
    """
    Restore punctuation in `text` using a token-classification model.

    The text is split into words / single punctuation marks, tokenized with
    `is_split_into_words=True`, and each sub-token's predicted label is mapped
    to a punctuation string through `punctuation_reverse_map`. Sub-tokens are
    re-joined using `word_ids()` to decide where word boundaries (spaces) go.

    Args:
        text: Input sentence.
        model: Token-classification model; called as `model(**encoding)` and
            expected to expose `.logits`.
        tokenizer: Fast tokenizer whose encoding provides `word_ids()`.
        label_list: Label names indexed by predicted class id.
        device: Device the encoded inputs are moved to.
        punctuation_reverse_map: Mapping label name -> punctuation string.

    Returns:
        The re-punctuated sentence with its first character upper-cased, or ""
        when the input contains no tokens.
    """
    # 1. Split into words and individual punctuation marks, then tokenize.
    words = re.findall(r"\w+|[^\w\s]", text.strip())
    if not words:
        # Nothing to tokenize (empty / whitespace-only input).
        return ""

    encoded_input = tokenizer(
        words,
        is_split_into_words=True,
        return_tensors="pt",
        padding=True,
        truncation=True
    ).to(device)

    # 2. Word ids map each sub-token to its source word (None for special tokens).
    #    The first and last positions are dropped everywhere below.
    #    presumably these are the tokenizer's start/end special tokens — TODO confirm for the tokenizer in use
    word_ids_list = encoded_input.word_ids()[1:-1]
    tokens = tokenizer.convert_ids_to_tokens(encoded_input['input_ids'][0])[1:-1]

    with torch.no_grad():
        logits = model(**encoded_input).logits

    # Batch size is 1, so take row 0 explicitly instead of squeezing.
    p_ids = torch.argmax(logits, dim=-1)[0].tolist()[1:-1]

    final_output = []
    n_tokens = len(tokens)

    # A while-loop is required here: the branch below that consumes an existing
    # punctuation token must be able to advance the index by two.
    i = 0
    while i < n_tokens:
        raw_token = tokens[i]
        punct = punctuation_reverse_map[label_list[p_ids[i]]]

        # Remove only the "##" continuation prefix; a literal "#" token is kept.
        if raw_token.startswith("##") and len(raw_token) > 2:
            t = raw_token[2:]
        else:
            t = raw_token

        # Token is itself the predicted punctuation mark: nothing to emit.
        if t == punct:
            i += 1
            continue

        has_next = i < n_tokens - 1

        # The predicted mark is already the next token: emit it once and skip
        # that token so it is not duplicated.
        if has_next and punct == tokens[i + 1]:
            final_output.extend([t, punct])
            if i + 2 < n_tokens and word_ids_list[i + 1] != word_ids_list[i + 2]:
                final_output.append(" ")
            i += 2
            continue

        same_word_next = has_next and word_ids_list[i] == word_ids_list[i + 1]
        ends_word = has_next and word_ids_list[i] != word_ids_list[i + 1]

        # Add the inter-word space after the (possibly empty) punctuation.
        if punct != " " and ends_word:
            punct = punct + " "
        if punct == " " and same_word_next:
            punct = ""

        # Inside a word, defer punctuation while the next sub-token agrees.
        if same_word_next and p_ids[i] == p_ids[i + 1]:
            final_output.append(t)
        else:
            final_output.extend([t, punct])
        i += 1

    # 3. Final cleanup
    result = "".join(final_output).strip()

    if result:
        # Capitalize the first letter
        return result[0].upper() + result[1:]
    return ""

In [None]:
# from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer

# tokenizer = AutoTokenizer.from_pretrained("thenlpresearcher/mpnet_token_cls_model")
# model = AutoModelForTokenClassification.from_pretrained("thenlpresearcher/mpnet_token_cls_model").to(device)

# sentence = "i am going to school but i forgot my bag"
# print(restore_punctuation(sentence, model, tokenizer, label_list, device, punctuation_reverse_map))

In [None]:
from transformers import pipeline
# Load the fine-tuned T5 punctuation checkpoint as a text2text-generation pipeline.
# NOTE(review): an earlier note here warned that this "might accidentally default to a
# translation task"; the task is passed explicitly below — confirm that warning is stale.
punctuator_pipeline = pipeline("text2text-generation", model="thenlpresearcher/iitb-t5-finetuned-punctuation")

# Smoke test on one unpunctuated sentence; the cell displays the generated string.
text = "the morning sky stretched over the city like a quiet sheet of pale blue while people hurried through the streets"
punctuator_pipeline(text,
                   max_length=128)[0]['generated_text']

# Sample raw pipeline output (before the [0]['generated_text'] indexing applied above):
# [{'generated_text': 'the morning sky stretched over the city like a quiet sheet of pale blue while people hurried through the streets.'}]

In [None]:
def restore_punctuation_t5(text, max_length=128):
    """
    Punctuate `text` with the module-level `punctuator_pipeline`.

    Args:
        text: Sentence to punctuate.
        max_length: Generation length limit forwarded to the pipeline.
            Defaults to 128, the value previously hard-coded here.

    Returns:
        The `generated_text` of the pipeline's first result.
    """
    return punctuator_pipeline(text, max_length=max_length)[0]['generated_text']

In [None]:
# Inspect the first test example to see which fields the dataset provides.
raw_datasets["test"][0]

In [None]:
# predicted_sentences = []

# for text in raw_datasets["test"]['sent_written']:
#     pred = restore_punctuation(text.strip('.').strip('?'), model, tokenizer, label_list, device, punctuation_reverse_map)
#     predicted_sentences.append(pred)
#     print(text)
#     print(pred)
#     print('---')

In [None]:
# Punctuate every "sent_written" test sentence with the T5 model, echoing each
# input / prediction pair as it is produced.
predicted_sentences = []

for text in raw_datasets["test"]['sent_written']:
    pred = restore_punctuation_t5(text)
    predicted_sentences += [pred]
    print(text, pred, '---', sep='\n')

In [None]:
# Spot-check the first punctuated sentence.
predicted_sentences[0]

In [None]:
import pandas as pd

# Assemble predictions next to their source / reference sentences in one step.
# Column order: model prediction, written source, intended ("meant") sentence.
df = pd.DataFrame({
    "prediction": predicted_sentences,
    "src": raw_datasets["test"]["sent_written"],
    "gt": raw_datasets["test"]["sent_meant"],
})

# Persist the table (without the index column).
output_file = "approach1_eng_to_eng_t5_outputs_mar_data.csv"
df.to_csv(output_file, index=False)

print("Saved:", output_file)


In [None]:
import torch

def batch_translate(input_sentences, src_lang, tgt_lang, model, tokenizer, ip, device, batch_size=8, max_length=256):
    """
    Translate sentences chunk by chunk with a seq2seq model such as IndicTrans.

    Args:
        input_sentences (list of str): Source sentences to translate.
        src_lang (str): Source language code, e.g., "eng_Latn".
        tgt_lang (str): Target language code, e.g., "mar_Deva".
        model: Hugging Face seq2seq model (moved to `device`, put in eval mode).
        tokenizer: Tokenizer matching `model`.
        ip: Processor exposing `preprocess_batch` / `postprocess_batch`.
        device: Device the model and inputs are placed on.
        batch_size (int): Number of sentences per generation call.
        max_length (int): Maximum length of each generated sequence.

    Returns:
        list of str: Post-processed translations, in input order.
    """
    model.to(device)
    model.eval()

    translations = []
    for start in range(0, len(input_sentences), batch_size):
        # Slice out the current chunk and let the processor prepare it.
        chunk = ip.preprocess_batch(
            input_sentences[start : start + batch_size],
            src_lang=src_lang,
            tgt_lang=tgt_lang,
        )

        encoded = tokenizer(
            chunk,
            truncation=True,
            padding="longest",
            return_tensors="pt",
            return_attention_mask=True,
        )

        # Place every input tensor on the target device.
        inputs = {}
        for name, tensor in encoded.items():
            inputs[name] = tensor.to(device)
        del encoded

        # Beam-search generation without gradient tracking.
        with torch.no_grad():
            generated_tokens = model.generate(
                **inputs,
                use_cache=True,
                min_length=5,  # ensure some minimum length
                max_length=max_length,
                num_beams=5,
                num_return_sequences=1,
                early_stopping=True,
                decoder_start_token_id=model.config.decoder_start_token_id
            )

        decoded_texts = tokenizer.batch_decode(
            generated_tokens,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True,
        )

        # Processor-side post-processing of the decoded strings.
        translations.extend(ip.postprocess_batch(decoded_texts, lang=tgt_lang))

        # Drop references to this chunk's tensors and release cached GPU memory.
        del inputs, generated_tokens
        torch.cuda.empty_cache()

    return translations

In [None]:
def initialize_model_and_tokenizer(ckpt_dir, quantization=None, device=None):
    """
    Load a seq2seq checkpoint and its tokenizer for inference.

    Args:
        ckpt_dir: Checkpoint path or Hub id passed to `from_pretrained`.
        quantization: "4-bit" or "8-bit" to load with a bitsandbytes config;
            any other value loads the model without quantization.
        device: Device for the unquantized model. When None, the notebook-level
            `DEVICE` global is used if it exists, otherwise "cuda" when
            available and "cpu" if not. Ignored for quantized loads.

    Returns:
        (tokenizer, model) with the model in eval mode.
    """
    from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, BitsAndBytesConfig
    import torch

    # Quantization setup
    if quantization == "4-bit":
        qconfig = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
    elif quantization == "8-bit":
        # Double-quant and compute-dtype are 4-bit-only options; the former
        # bnb_8bit_* keyword arguments are not BitsAndBytesConfig parameters.
        qconfig = BitsAndBytesConfig(load_in_8bit=True)
    else:
        qconfig = None

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(ckpt_dir, trust_remote_code=True)

    # Load model
    model = AutoModelForSeq2SeqLM.from_pretrained(
        ckpt_dir,
        trust_remote_code=True,
        low_cpu_mem_usage=True,
        quantization_config=qconfig,
    )

    # Only the unquantized model is moved explicitly; on CUDA it is also
    # converted to half precision.
    if qconfig is None:
        if device is None:
            # Keep honouring the notebook's DEVICE global when it is defined,
            # but do not fail with NameError when it is not.
            device = globals().get("DEVICE") or ("cuda" if torch.cuda.is_available() else "cpu")
        model = model.to(device)
        # str() so that "cuda", "cuda:0" and torch.device objects all match.
        if str(device).startswith("cuda"):
            model.half()

    # Inference only: switch to eval mode.
    model.eval()

    return tokenizer, model

In [None]:
import torch

def batch_translate(input_sentences, src_lang, tgt_lang, model, tokenizer, ip, device, batch_size=8, max_length=256):
    """
    Run seq2seq translation (e.g. IndicTrans) over `input_sentences` in batches.

    Args:
        input_sentences (list of str): Source sentences to translate.
        src_lang (str): Source language code, e.g., "eng_Latn".
        tgt_lang (str): Target language code, e.g., "mar_Deva".
        model: Hugging Face seq2seq model.
        tokenizer: Corresponding tokenizer.
        ip: Processor providing `preprocess_batch` and `postprocess_batch`.
        device: Device used for the model and the input tensors.
        batch_size (int): Batch size for generation.
        max_length (int): Maximum length of generated sequence.

    Returns:
        list of str: Translated sentences.
    """
    model.to(device)
    model.eval()
    results = []

    for lo in range(0, len(input_sentences), batch_size):
        hi = lo + batch_size
        prepared = ip.preprocess_batch(input_sentences[lo:hi], src_lang=src_lang, tgt_lang=tgt_lang)

        # Tokenize, then move each tensor to the requested device.
        tokenizer_kwargs = dict(
            truncation=True,
            padding="longest",
            return_tensors="pt",
            return_attention_mask=True,
        )
        inputs = dict(
            (key, value.to(device))
            for key, value in tokenizer(prepared, **tokenizer_kwargs).items()
        )

        # Generation settings are rebuilt per batch, exactly as the call needs them.
        generate_kwargs = dict(
            use_cache=True,
            min_length=5,  # ensure some minimum length
            max_length=max_length,
            num_beams=5,
            num_return_sequences=1,
            early_stopping=True,
            decoder_start_token_id=model.config.decoder_start_token_id,
        )
        with torch.no_grad():
            generated_tokens = model.generate(**inputs, **generate_kwargs)

        decoded_texts = tokenizer.batch_decode(
            generated_tokens,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True,
        )

        # Let the processor post-process the decoded strings for the target language.
        results.extend(ip.postprocess_batch(decoded_texts, lang=tgt_lang))

        # Free GPU memory
        del inputs, generated_tokens
        torch.cuda.empty_cache()

    return results

In [None]:
import torch
# Hub id of the IndicTrans2 English-to-Indic checkpoint used for translation.
en_indic_ckpt_dir = "ai4bharat/indictrans2-en-indic-dist-200M"

# initialize_model_and_tokenizer reads DEVICE as a global, so it must be defined before the call below.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# None = load without a bitsandbytes config; the loader also recognises "4-bit" and "8-bit".
quantization = None
en_indic_tokenizer, en_indic_model = initialize_model_and_tokenizer(en_indic_ckpt_dir, quantization)

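The model and tokenizer loaded above are used for inference only. The next cells bind them to the shorter names `model` and `tokenizer` and set up the IndicTrans pre/post-processor; no `Seq2SeqTrainer` or fine-tuning is involved in this notebook: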

In [None]:
# Alias the translation model as `model`.
# NOTE(review): initialize_model_and_tokenizer already moves the unquantized model to
# DEVICE, so this .to() looks redundant — confirm.
model = en_indic_model.to(DEVICE)

In [None]:
# Alias the translation tokenizer as `tokenizer`.
tokenizer = en_indic_tokenizer

In [None]:
# Display the test split summary.
raw_datasets['test']

In [None]:
from IndicTransToolkit.processor import IndicProcessor

In [None]:
# Processor passed to batch_translate as `ip`; its preprocess_batch / postprocess_batch
# methods are called around tokenization and decoding.
ip = IndicProcessor(inference=True)

In [None]:
# Number of sentences handed to batch_translate per call in the translation loops below.
BATCH_SIZE = 4
src_lang, tgt_lang = "eng_Latn", "mar_Deva"
# Same expression as in the model-loading cell; re-evaluated here.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# -------------------- LOAD DATA --------------------
# Sources for the first run are the T5-punctuated sentences; the three ref_* lists are
# the reference columns of the test split used for scoring.
src_sentences = predicted_sentences
ref_gt     = raw_datasets['test']["gt_marathi"]
ref_gem    = raw_datasets['test']["gemini_out"]
ref_cfilt  = raw_datasets['test']["cfilt_out"]

In [None]:
# Sanity check: sources and all three reference lists should have the same length.
print(*map(len, (src_sentences, ref_gt, ref_gem, ref_cfilt)), sep="\n")

In [None]:
import torch

def batch_translate(input_sentences, src_lang, tgt_lang, model, tokenizer, ip, device, batch_size=8, max_length=256, verbose=False):
    """
    Translate a batch of sentences using a seq2seq model like IndicTrans with safety checks.

    The returned list always has one entry per input sentence: when a batch
    cannot be preprocessed or generation fails, an empty string is emitted for
    each of its sentences so that callers can keep translations aligned with
    their references.

    Args:
        input_sentences (list of str): Source sentences to translate.
        src_lang (str): Source language code, e.g., "eng_Latn".
        tgt_lang (str): Target language code, e.g., "mar_Deva".
        model: Hugging Face seq2seq model.
        tokenizer: Corresponding tokenizer.
        ip: Preprocessing object (IndicProcessor).
        device: torch device ("cuda" or "cpu").
        batch_size (int): Batch size for generation.
        max_length (int): Maximum length of generated sequence.
        verbose (bool): When True, print a sample of the preprocessed and
            decoded text for every batch. Warnings and errors are always printed.

    Returns:
        translations (list of str): Translated sentences ("" for failed ones).
    """
    model.to(device)
    model.eval()
    translations = []

    # Safe access: fall back to None (library default) when an id is not defined.
    decoder_start_token_id = getattr(model.config, "decoder_start_token_id", None)
    pad_token_id = getattr(tokenizer, "pad_token_id", None)
    eos_token_id = getattr(tokenizer, "eos_token_id", None)

    if decoder_start_token_id is None:
        print("[Warning] decoder_start_token_id is None. Using default generation behavior.")

    for i in range(0, len(input_sentences), batch_size):
        batch = input_sentences[i : i + batch_size]

        # Preprocess the batch
        batch_preprocessed = ip.preprocess_batch(batch, src_lang=src_lang, tgt_lang=tgt_lang)
        if not isinstance(batch_preprocessed, list) or len(batch_preprocessed) == 0:
            print(f"[Warning] Preprocessed batch is empty at index {i}. Skipping...")
            # Placeholders keep output positions aligned with the input.
            translations += [""] * len(batch)
            continue

        if verbose:
            print(f"[Debug] Preprocessed batch sample: {batch_preprocessed[:2]}")

        # Tokenize
        inputs = tokenizer(
            batch_preprocessed,
            truncation=True,
            padding="longest",
            return_tensors="pt",
            return_attention_mask=True,
        )

        # Move tensors to the correct device
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Generate translations with safety parameters
        generated_tokens = None
        with torch.no_grad():
            try:
                generated_tokens = model.generate(
                    **inputs,
                    use_cache=True,
                    min_length=5,
                    max_length=max_length,
                    num_beams=5,
                    num_return_sequences=1,
                    early_stopping=True,
                    decoder_start_token_id=decoder_start_token_id,
                    pad_token_id=pad_token_id,
                    eos_token_id=eos_token_id
                )
            except Exception as e:
                print(f"[Error] Generation failed for batch starting at index {i}: {e}")

        if generated_tokens is None:
            # Best-effort: keep going, but do not shift later translations.
            translations += [""] * len(batch)
            del inputs
            torch.cuda.empty_cache()
            continue

        # Decode generated tokens
        decoded_texts = tokenizer.batch_decode(
            generated_tokens,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True,
        )

        if verbose:
            print(f"[Debug] Decoded sample: {decoded_texts[:2]}")

        # Postprocess translations; fall back to the raw decoded text on failure.
        try:
            postprocessed = ip.postprocess_batch(decoded_texts, lang=tgt_lang)
            translations += postprocessed
        except Exception as e:
            print(f"[Error] Postprocessing failed for batch starting at index {i}: {e}")
            translations += decoded_texts  # fallback

        # Free GPU memory
        del inputs, generated_tokens
        torch.cuda.empty_cache()

    return translations

In [None]:
src_lang = "eng_Latn"
tgt_lang = "mar_Deva"

# Language-tag pair, target first then source, separated by one space.
prefix = " ".join((tgt_lang, src_lang))

def remove_prefix(text):
    """Return `text` without a leading `prefix` (if present), stripped of surrounding whitespace."""
    remainder = text[len(prefix):] if text.startswith(prefix) else text
    return remainder.strip()

In [None]:
from evaluate import load
from tqdm import tqdm
import pandas as pd

# -------------------- TRANSLATION --------------------
# Parallel lists: row k of each list belongs to the same sentence.
valid_src = []
valid_pred = []
valid_gt = []
valid_gem = []
valid_cfilt = []

# Translate in batches
all_translations = []
for i in tqdm(range(0, len(src_sentences), BATCH_SIZE)):
    batch = src_sentences[i:i+BATCH_SIZE]
    translations = batch_translate(
        batch,
        src_lang,
        tgt_lang,
        en_indic_model,
        en_indic_tokenizer,
        ip,
        device=DEVICE
    )

    # Guard first: extending a list with None raises TypeError, so this check
    # has to run before the result is used.
    if translations is None:
        print(f"[SKIPPED] Batch {i}: Returned None")
        continue

    all_translations.extend(translations)

    # A short result cannot be paired with its references; skipping the batch
    # keeps the valid_* lists the same length and row-aligned.
    if len(translations) != len(batch):
        print(f"[SKIPPED] Batch {i}: got {len(translations)} translations for {len(batch)} sentences")
        continue

    cleaned = [remove_prefix(t) for t in translations]

    valid_src.extend(batch)
    valid_pred.extend(cleaned)
    valid_gt.extend(ref_gt[i:i + len(batch)])
    valid_gem.extend(ref_gem[i:i + len(batch)])
    valid_cfilt.extend(ref_cfilt[i:i + len(batch)])

print(f"\nSuccessful translations: {len(valid_pred)} / {len(src_sentences)}")

In [None]:
# Run label: used as the file-name prefix for the outputs CSV and metrics file written by the next cell.
mode = "approach1_t5"

In [None]:
import pandas as pd
from evaluate import load

# -------------------- SAVE OUTPUTS --------------------
results_df = pd.DataFrame({
    "src": valid_src,
    "prediction": valid_pred,
    "gt": valid_gt,
    "gemini": valid_gem,
    "cfilt": valid_cfilt
})

outputs_path = f"{mode}_outputs.csv"
results_df.to_csv(outputs_path, index=False)
print(f"✔ Saved predictions to {outputs_path}")

# -------------------- METRICS --------------------
bleu = load("sacrebleu")
chrf = load("chrf")

def compute_scores(preds, ref1, ref2, ref3):
    """
    Compute corpus BLEU and chrF++ using all three references for each sentence.

    Args:
        preds: Predicted sentences.
        ref1, ref2, ref3: Reference lists, each parallel to `preds`.

    Returns:
        (bleu_score, chrf_score) — the "score" field of each metric result.
    """
    references = [[r1, r2, r3] for r1, r2, r3 in zip(ref1, ref2, ref3)]  # sacrebleu format
    bleu_score = bleu.compute(predictions=preds, references=references)["score"]
    # word_order=2 is what makes this chrF++; the metric's default (0) is plain chrF.
    chrf_score = chrf.compute(predictions=preds, references=references, word_order=2)["score"]
    return bleu_score, chrf_score

bleu_score, chrf_score = compute_scores(valid_pred, valid_gt, valid_gem, valid_cfilt)

# BLEU against each reference set on its own, to see which one the output is closest to.
all_scores = {
    "GT":    bleu.compute(predictions=valid_pred, references=[[r] for r in valid_gt])["score"],
    "Gemini": bleu.compute(predictions=valid_pred, references=[[r] for r in valid_gem])["score"],
    "CFILT":  bleu.compute(predictions=valid_pred, references=[[r] for r in valid_cfilt])["score"]
}

best_ref = max(all_scores, key=all_scores.get)

print("\n===== FINAL METRICS =====")
print(f"All references combined → BLEU: {bleu_score:.2f}, chrF++: {chrf_score:.2f}")
print(f"GT Marathi → BLEU: {all_scores['GT']:.2f}")
print(f"Gemini    → BLEU: {all_scores['Gemini']:.2f}")
print(f"CFILT     → BLEU: {all_scores['CFILT']:.2f}")
print(f"\n🎯 BEST REFERENCE = {best_ref} (by highest BLEU)")

# -------------------- SAVE METRICS --------------------
metrics_path = f"{mode}_indictrans2_eval_metrics.txt"
# Explicit UTF-8: the report contains non-ASCII characters.
with open(metrics_path, "w", encoding="utf-8") as f:
    f.write(f"All references combined → BLEU {bleu_score:.2f}, chrF++ {chrf_score:.2f}\n")
    f.write(f"GT    BLEU {all_scores['GT']:.2f}\n")
    f.write(f"Gem   BLEU {all_scores['Gemini']:.2f}\n")
    f.write(f"CFILT BLEU {all_scores['CFILT']:.2f}\n")
    f.write(f"\nBEST REFERENCE = {best_ref}\n")

# Report the path that was actually written.
print(f"Metrics written to {metrics_path}")

In [None]:
# Spot-check the first translated sentence.
results_df['prediction'][0]

In [None]:
# Second run: translate the dataset's "sent_meant" sentences instead of the T5 predictions.
# This rebinds src_sentences, so earlier cells that read it will see this value if re-run.
src_sentences = raw_datasets['test']["sent_meant"]

In [None]:
from evaluate import load
from tqdm import tqdm
import pandas as pd

# -------------------- TRANSLATION --------------------
# Parallel lists: row k of each list belongs to the same sentence.
valid_src = []
valid_pred = []
valid_gt = []
valid_gem = []
valid_cfilt = []

# Translate in batches
all_translations = []
for i in tqdm(range(0, len(src_sentences), BATCH_SIZE)):
    batch = src_sentences[i:i+BATCH_SIZE]
    translations = batch_translate(
        batch,
        src_lang,
        tgt_lang,
        en_indic_model,
        en_indic_tokenizer,
        ip,
        device=DEVICE
    )

    # Guard first: extending a list with None raises TypeError, so this check
    # has to run before the result is used.
    if translations is None:
        print(f"[SKIPPED] Batch {i}: Returned None")
        continue

    all_translations.extend(translations)

    # A short result cannot be paired with its references; skipping the batch
    # keeps the valid_* lists the same length and row-aligned.
    if len(translations) != len(batch):
        print(f"[SKIPPED] Batch {i}: got {len(translations)} translations for {len(batch)} sentences")
        continue

    cleaned = [remove_prefix(t) for t in translations]

    valid_src.extend(batch)
    valid_pred.extend(cleaned)
    valid_gt.extend(ref_gt[i:i + len(batch)])
    valid_gem.extend(ref_gem[i:i + len(batch)])
    valid_cfilt.extend(ref_cfilt[i:i + len(batch)])

print(f"\nSuccessful translations: {len(valid_pred)} / {len(src_sentences)}")

In [None]:
# Run label for the second pass: file-name prefix for the outputs CSV and metrics file written by the next cell.
mode = "sent_meant"

In [None]:
import pandas as pd
from evaluate import load

# -------------------- SAVE OUTPUTS --------------------
results_df = pd.DataFrame({
    "src": valid_src,
    "prediction": valid_pred,
    "gt": valid_gt,
    "gemini": valid_gem,
    "cfilt": valid_cfilt
})

outputs_path = f"{mode}_outputs.csv"
results_df.to_csv(outputs_path, index=False)
print(f"✔ Saved predictions to {outputs_path}")

# -------------------- METRICS --------------------
bleu = load("sacrebleu")
chrf = load("chrf")

def compute_scores(preds, ref1, ref2, ref3):
    """
    Compute corpus BLEU and chrF++ using all three references for each sentence.

    Args:
        preds: Predicted sentences.
        ref1, ref2, ref3: Reference lists, each parallel to `preds`.

    Returns:
        (bleu_score, chrf_score) — the "score" field of each metric result.
    """
    references = [[r1, r2, r3] for r1, r2, r3 in zip(ref1, ref2, ref3)]  # sacrebleu format
    bleu_score = bleu.compute(predictions=preds, references=references)["score"]
    # word_order=2 is what makes this chrF++; the metric's default (0) is plain chrF.
    chrf_score = chrf.compute(predictions=preds, references=references, word_order=2)["score"]
    return bleu_score, chrf_score

bleu_score, chrf_score = compute_scores(valid_pred, valid_gt, valid_gem, valid_cfilt)

# BLEU against each reference set on its own, to see which one the output is closest to.
all_scores = {
    "GT":    bleu.compute(predictions=valid_pred, references=[[r] for r in valid_gt])["score"],
    "Gemini": bleu.compute(predictions=valid_pred, references=[[r] for r in valid_gem])["score"],
    "CFILT":  bleu.compute(predictions=valid_pred, references=[[r] for r in valid_cfilt])["score"]
}

best_ref = max(all_scores, key=all_scores.get)

print("\n===== FINAL METRICS =====")
print(f"All references combined → BLEU: {bleu_score:.2f}, chrF++: {chrf_score:.2f}")
print(f"GT Marathi → BLEU: {all_scores['GT']:.2f}")
print(f"Gemini    → BLEU: {all_scores['Gemini']:.2f}")
print(f"CFILT     → BLEU: {all_scores['CFILT']:.2f}")
print(f"\n🎯 BEST REFERENCE = {best_ref} (by highest BLEU)")

# -------------------- SAVE METRICS --------------------
metrics_path = f"{mode}_indictrans2_eval_metrics.txt"
# Explicit UTF-8: the report contains non-ASCII characters.
with open(metrics_path, "w", encoding="utf-8") as f:
    f.write(f"All references combined → BLEU {bleu_score:.2f}, chrF++ {chrf_score:.2f}\n")
    f.write(f"GT    BLEU {all_scores['GT']:.2f}\n")
    f.write(f"Gem   BLEU {all_scores['Gemini']:.2f}\n")
    f.write(f"CFILT BLEU {all_scores['CFILT']:.2f}\n")
    f.write(f"\nBEST REFERENCE = {best_ref}\n")

# Report the path that was actually written.
print(f"Metrics written to {metrics_path}")

In [None]:
# Compare a few T5-punctuated sentences (indices 7-12) ...
predicted_sentences[7:13]

In [None]:
# ... against the same positions of the current src_sentences (the "sent_meant" column at this point).
src_sentences[7:13]