In [2]:
# # Arabic Text Auto-correction with BERT Fine-tuning



In [3]:
# ## 1. Setup and Installation

# First, let's install the necessary libraries:


In [4]:
!pip install transformers datasets torch evaluate pandas numpy tqdm


Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting

In [5]:
# ## 2. Mount Google Drive and Load Dataset

# Connect to Google Drive to access the dataset:


In [6]:
from google.colab import drive
drive.mount('/content/drive')

# Set the path to your dataset
dataset_path = '/content/drive/MyDrive/NTI Project/Dataset_corrected_incorrect.csv'  # Update this path


Mounted at /content/drive


In [7]:
## 3. Load and Explore the Dataset


In [8]:
import pandas as pd

# Load the dataset
df = pd.read_csv(dataset_path)

# Display basic information
print(f"Dataset shape: {df.shape}")
df.head()


Dataset shape: (30574, 2)


Unnamed: 0,correct_words,incorrect_words
0,بين أستوديوهات ورزازات وصحراء مرزوكة وآثار ولي...,بين أستوديوهات ورزازات وصحراء مرزوكة وآثار ولي...
1,قررت النجمة الأمريكية أوبرا وينفري ألا يقتصر ع...,قررت النجمٌة الأمريكية أوبرا وينفري ألا يقتصر ...
2,أخبارنا المغربية الوزاني تصوير الشملالي ألهب ا...,أخبارنا ئلمغربية إلوزاني ًتصوير الشملالي ألهب ...
3,اخبارنا المغربية قال ابراهيم الراشدي محامي سعد...,اخبارنا المغربية قال ابراهيم الراشدي محامي شعد...
4,تزال صناعة الجلود في المغرب تتبع الطريقة التقل...,تزال صنءعة الجلود في اشلمغرب تتعب الطريقة التق...


In [9]:
# ## 4. Preprocess the Dataset



In [10]:
from datasets import Dataset

# Using the dataset
train_data = df.sample(frac=0.8, random_state=42)
test_data = df.drop(train_data.index)

# Convert to Hugging Face datasets
train_dataset = Dataset.from_pandas(train_data)
test_dataset = Dataset.from_pandas(test_data)

print(f"Training samples: {len(train_dataset)}")
print(f"Testing samples: {len(test_dataset)}")


Training samples: 24459
Testing samples: 6115


In [11]:
# ## 5. Load the Pre-trained BERT Model




In [13]:
from transformers import AutoTokenizer, AutoModelForMaskedLM

# Load the tokenizer and model
model_name = "twitter/twhin-bert-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForMaskedLM.from_pretrained(model_name)

# Check if the tokenizer has Arabic vocabulary
arabic_sample = "مرحبا بالعالم"
tokens = tokenizer.tokenize(arabic_sample)
print(f"Tokenized Arabic sample: {tokens}")


tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/634 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.25G [00:00<?, ?B/s]

Tokenized Arabic sample: ['▁م', 'رحب', 'ا', '▁بال', 'عالم']


In [None]:
# ## 6. Prepare Training Data for Masked Language Model




In [14]:
import random
import torch
import numpy as np

def mask_tokens(inputs, tokenizer, mlm_probability=0.15):
    """Prepare masked tokens for masked language modeling prediction."""
    labels = inputs.clone()

    # We sample a few tokens in each sequence for masked-LM training
    probability_matrix = torch.full(labels.shape, mlm_probability)
    special_tokens_mask = [
        tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
    ]
    probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
    masked_indices = torch.bernoulli(probability_matrix).bool()

    # We only compute loss on masked tokens
    labels[~masked_indices] = -100

    # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
    indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
    inputs[indices_replaced] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)

    # 10% of the time, we replace masked input tokens with random word
    indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
    random_words = torch.randint(len(tokenizer), labels.shape, dtype=torch.long)
    inputs[indices_random] = random_words[indices_random]

    # The rest of the time (10% of the time) we keep the masked input tokens unchanged
    return inputs, labels

def tokenize_function(examples):
    # Using 'correct_words' as the column with correct Arabic text
    return tokenizer(examples["correct_words"], padding="max_length", truncation=True, max_length=128)

# Tokenize the datasets
tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_test = test_dataset.map(tokenize_function, batched=True)

# Set the format for PyTorch
tokenized_train.set_format("torch", columns=["input_ids", "attention_mask"])
tokenized_test.set_format("torch", columns=["input_ids", "attention_mask"])


Map:   0%|          | 0/24459 [00:00<?, ? examples/s]

Map:   0%|          | 0/6115 [00:00<?, ? examples/s]

In [None]:
## 7. Create DataCollator for Masked Language Modeling


In [15]:
from transformers import DataCollatorForLanguageModeling

# Create a data collator for masked language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15
)


In [None]:
## 8. Fine-tune the Model


In [16]:
from transformers import Trainer, TrainingArguments

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Make sure eval_strategy and save_strategy match
    eval_strategy="steps",  # or "epoch" if you prefer
    save_strategy="steps",  # must match eval_strategy
    eval_steps=1000,         # how often to evaluate
    save_steps=1000,         # how often to save (must match eval_steps)
    logging_dir="./logs",
    logging_steps=200,
    save_total_limit=4,
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    greater_is_better=False,
    fp16=True,  # Use mixed precision training to save memory
)

# Create Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    data_collator=data_collator,
)

# Train the model
trainer.train()



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mhousam3a[0m ([33mhousam3a-helwan-universty[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


  return forward_call(*args, **kwargs)


Step,Training Loss,Validation Loss
1000,2.1547,1.990164
2000,1.8746,1.775375
3000,1.7943,1.663319


  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
There were missing keys in the checkpoint model loaded: ['cls.predictions.decoder.weight', 'cls.predictions.decoder.bias'].


TrainOutput(global_step=3058, training_loss=2.077185499192532, metrics={'train_runtime': 3539.0795, 'train_samples_per_second': 6.911, 'train_steps_per_second': 0.864, 'total_flos': 5703223431909888.0, 'train_loss': 2.077185499192532, 'epoch': 1.0})

In [17]:
# Cell 1: Save the model
model_save_path = "./arabic-autocorrect-model"
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)
print(f"Model saved to {model_save_path}")

# Save to Google Drive
drive_save_path = "/content/drive/MyDrive/NTI Project/arabic-autocorrect-model"
!mkdir -p {drive_save_path}
!cp -r {model_save_path}/* {drive_save_path}/
print(f"Model saved to Google Drive at {drive_save_path}")

Model saved to ./arabic-autocorrect-model
Model saved to Google Drive at /content/drive/MyDrive/NTI Project/arabic-autocorrect-model


In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [18]:
# Cell 2: Install NLTK and Gradio and download all required data
!pip install nltk gradio
import nltk

# Download all necessary NLTK data
nltk.download('punkt')
nltk.download('perluniprops')
nltk.download('nonbreaking_prefixes')

from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package perluniprops to /root/nltk_data...
[nltk_data]   Unzipping misc/perluniprops.zip.
[nltk_data] Downloading package nonbreaking_prefixes to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping corpora/nonbreaking_prefixes.zip.


In [None]:
# Cell 3: Define autocorrection function with device handling and chunking for long texts
import re
from tqdm import tqdm

def autocorrect_text(text, model, tokenizer, max_corrections=5, max_length=512, use_jaccard=True):
    """Autocorrect Arabic text by identifying and fixing potential errors."""
    # Get the device that the model is on
    device = next(model.parameters()).device

    # Tokenize the text into words
    words = re.findall(r'\w+|[^\w\s]', text)
    corrections_made = 0
    corrections_details = []

    # Process words in chunks if the text is too long
    chunk_size = 50  # Process 50 words at a time
    for chunk_start in range(0, len(words), chunk_size):
        chunk_end = min(chunk_start + chunk_size, len(words))
        chunk_words = words[chunk_start:chunk_end]

        # Process each word in the chunk
        for i, word in enumerate(chunk_words):
            if len(word) <= 1 or not re.match(r'^[\u0600-\u06FF]+$', word):  # Skip non-Arabic or very short words
                continue

            # First try BERT model prediction
            # Get the context (words around the current word)
            context_start = max(0, i - 10)  # Take up to 10 words before
            context_end = min(len(chunk_words), i + 10)  # Take up to 10 words after
            context = chunk_words[context_start:i] + [tokenizer.mask_token] + chunk_words[i+1:context_end]

            # Create a masked version of the text with limited context
            masked_text = ' '.join(context)

            # Tokenize the masked text
            inputs = tokenizer(masked_text, return_tensors="pt", truncation=True, max_length=max_length)

            # Move inputs to the same device as the model
            inputs = {k: v.to(device) for k, v in inputs.items()}

            # Get the position of the mask token
            mask_token_index = torch.where(inputs["input_ids"][0] == tokenizer.mask_token_id)[0]
            if len(mask_token_index) == 0:
                continue

            # Get model predictions
            with torch.no_grad():
                outputs = model(**inputs)
                predictions = outputs.logits

            # Get the predicted token
            mask_token_index = mask_token_index[0].item()
            predicted_token_id = predictions[0, mask_token_index].argmax(dim=0).item()
            predicted_token = tokenizer.decode([predicted_token_id])
            
            # Check if prediction is valid Arabic
            if predicted_token != word and re.match(r'^[\u0600-\u06FF]+$', predicted_token):
                chunk_words[i] = predicted_token
                corrections_made += 1
                corrections_details.append(f"BERT: '{word}' → '{predicted_token}'")
                
            # If BERT didn't make a correction and Jaccard similarity is enabled, try using similarity dictionary
            elif use_jaccard and 'similarity_dict' in globals() and word in similarity_dict:
                # Get similar words from the dictionary
                similar_words = similarity_dict.get(word, [])
                if similar_words:
                    # Use the most similar word as a correction
                    most_similar_word, similarity = similar_words[0]
                    if similarity > 0.8:  # Only use if similarity is very high
                        chunk_words[i] = most_similar_word
                        corrections_made += 1
                        corrections_details.append(f"Jaccard: '{word}' → '{most_similar_word}' (similarity: {similarity:.2f})")
            
            # Stop if we've reached the maximum number of corrections
            if corrections_made >= max_corrections:
                break

        # Update the original words list with corrected words
        words[chunk_start:chunk_end] = chunk_words

        # Stop if we've reached the maximum number of corrections
        if corrections_made >= max_corrections:
            break

    corrected_text = ' '.join(words)
    return corrected_text, corrections_made, corrections_details

In [20]:
# Cell 4: Define evaluation metrics with robust tokenization
def calculate_bleu_score(references, candidates):
    """Calculate BLEU score between reference and candidate texts."""
    smoothie = SmoothingFunction().method1

    bleu_scores = []
    for ref, cand in zip(references, candidates):
        try:
            # Simple whitespace tokenization for Arabic
            ref_tokens = ref.split()
            cand_tokens = cand.split()

            # Calculate BLEU score for this pair
            score = sentence_bleu([ref_tokens], cand_tokens, smoothing_function=smoothie)
            bleu_scores.append(score)
        except Exception as e:
            print(f"Error calculating BLEU: {str(e)}")
            bleu_scores.append(0.0)

    # Return average BLEU score
    return sum(bleu_scores) / len(bleu_scores) if bleu_scores else 0.0

def calculate_character_accuracy(references, candidates):
    """Calculate character-level accuracy between reference and candidate texts."""
    total_chars = 0
    correct_chars = 0

    for ref, cand in zip(references, candidates):
        try:
            # Calculate Levenshtein distance
            distance = nltk.edit_distance(ref, cand)

            # Calculate character accuracy
            total_chars += len(ref)
            correct_chars += len(ref) - distance
        except Exception as e:
            print(f"Error calculating character accuracy: {str(e)}")
            # Skip this pair
            continue

    # Return character-level accuracy
    return correct_chars / total_chars if total_chars > 0 else 0.0

In [21]:
# Cell 5: Evaluate the model on test data with smaller batch size
print("Evaluating model on test data...")
# Use a smaller number of samples if running into memory issues
num_eval_samples = 50  # Reduced from 100
test_samples = test_dataset.select(range(min(num_eval_samples, len(test_dataset))))

# Get original and reference texts
original_texts = test_samples["incorrect_words"]
reference_texts = test_samples["correct_words"]
corrected_texts = []

# Apply correction to each text
for text in tqdm(original_texts):
    try:
        corrected_text, _ = autocorrect_text(text, model, tokenizer)
        corrected_texts.append(corrected_text)
    except Exception as e:
        print(f"Error processing text: {str(e)}")
        # Use original text as fallback
        corrected_texts.append(text)

Evaluating model on test data...


  return forward_call(*args, **kwargs)
100%|██████████| 50/50 [00:30<00:00,  1.66it/s]


In [22]:
# Cell 6: Calculate and print evaluation metrics with fallback methods
# Define a simple BLEU score calculator as fallback
def simple_bleu_score(references, candidates):
    """Simple BLEU score calculation without NLTK dependencies."""
    correct_count = 0
    total_count = 0

    for ref, cand in zip(references, candidates):
        # Simple whitespace tokenization
        ref_tokens = ref.split()
        cand_tokens = cand.split()

        # Count matching tokens
        for token in cand_tokens:
            if token in ref_tokens:
                correct_count += 1
            total_count += 1

    # Calculate precision
    return correct_count / total_count if total_count > 0 else 0.0

# Try to calculate BLEU scores using NLTK
try:
    original_bleu = calculate_bleu_score(reference_texts, original_texts)
    corrected_bleu = calculate_bleu_score(reference_texts, corrected_texts)
except Exception as e:
    print(f"Error with NLTK BLEU calculation: {str(e)}")
    print("Using simple BLEU score calculation instead")
    original_bleu = simple_bleu_score(reference_texts, original_texts)
    corrected_bleu = simple_bleu_score(reference_texts, corrected_texts)

bleu_improvement = corrected_bleu - original_bleu

# Calculate character-level accuracy
try:
    original_char_acc = calculate_character_accuracy(reference_texts, original_texts)
    corrected_char_acc = calculate_character_accuracy(reference_texts, corrected_texts)
except Exception as e:
    print(f"Error with character accuracy calculation: {str(e)}")
    # Fallback to simple character match rate
    original_char_acc = sum(sum(1 for a, b in zip(ref, orig) if a == b) / len(ref)
                           for ref, orig in zip(reference_texts, original_texts)) / len(reference_texts)
    corrected_char_acc = sum(sum(1 for a, b in zip(ref, corr) if a == b) / len(ref)
                            for ref, corr in zip(reference_texts, corrected_texts)) / len(reference_texts)

char_acc_improvement = corrected_char_acc - original_char_acc

# Print results
print("\nEvaluation Results:")
print(f"Original BLEU Score: {original_bleu:.4f}")
print(f"Corrected BLEU Score: {corrected_bleu:.4f}")
print(f"BLEU Score Improvement: {bleu_improvement:.4f}")
print(f"Original Character-level Accuracy: {original_char_acc:.4f}")
print(f"Corrected Character-level Accuracy: {corrected_char_acc:.4f}")
print(f"Character-level Accuracy Improvement: {char_acc_improvement:.4f}")


Evaluation Results:
Original BLEU Score: 0.4781
Corrected BLEU Score: 0.4260
BLEU Score Improvement: -0.0521
Original Character-level Accuracy: 0.9487
Corrected Character-level Accuracy: 0.9179
Character-level Accuracy Improvement: -0.0308


In [23]:
# Cell 7: Save evaluation results with directory creation
import pickle
import os

# Create the directory if it doesn't exist
drive_save_path = "/content/drive/MyDrive/NTI Project/arabic-autocorrect-model"
os.makedirs(drive_save_path, exist_ok=True)
print(f"Created directory: {drive_save_path}")

# Create a dictionary with evaluation results
evaluation_results = {
    "original_bleu": original_bleu,
    "corrected_bleu": corrected_bleu,
    "bleu_improvement": bleu_improvement,
    "original_char_acc": original_char_acc,
    "corrected_char_acc": corrected_char_acc,
    "char_acc_improvement": char_acc_improvement
}

# Save the results
try:
    with open(os.path.join(drive_save_path, "evaluation_results.pkl"), "wb") as f:
        pickle.dump(evaluation_results, f)
    print("Evaluation results saved to Google Drive")
except Exception as e:
    print(f"Error saving to Google Drive: {str(e)}")

    # Fallback to local storage
    local_save_path = "./evaluation_results.pkl"
    with open(local_save_path, "wb") as f:
        pickle.dump(evaluation_results, f)
    print(f"Evaluation results saved locally to {local_save_path}")

Created directory: /content/drive/MyDrive/NTI Project/arabic-autocorrect-model
Evaluation results saved to Google Drive


In [None]:
# Cell 8: Create Gradio interface with Jaccard similarity support
import gradio as gr

def gradio_autocorrect(input_text, use_jaccard=True):
    """Function for Gradio interface"""
    corrected_text, num_corrections, corrections_details = autocorrect_text(
        input_text, model, tokenizer, use_jaccard=use_jaccard
    )

    if num_corrections > 0:
        correction_info = f"Made {num_corrections} correction(s):\n" + "\n".join(corrections_details)
        return corrected_text, correction_info
    else:
        return corrected_text, "No corrections needed"

def get_similar_words(word):
    """Interface function to get similar words for a given word"""
    if not re.match(r'^[\u0600-\u06FF]+$', word):
        return "Please enter an Arabic word"
    
    if 'similarity_dict' not in globals():
        return "Similarity dictionary not loaded"
    
    similar_words = similarity_dict.get(word, [])
    if not similar_words:
        return "No similar words found"
    
    result = "Similar words:\n"
    for similar_word, similarity in similar_words:
        result += f"{similar_word} (similarity: {similarity:.2f})\n"
    
    return result

# Create the Gradio interface
with gr.Blocks(title="Arabic Text Autocorrection with Jaccard Similarity") as demo:
    gr.Markdown("# Arabic Text Autocorrection with Jaccard Similarity")
    gr.Markdown(f"""
    ### Model Evaluation Results:
    - BLEU Score Improvement: {bleu_improvement:.4f}
    - Character-level Accuracy Improvement: {char_acc_improvement:.4f}
    """)
    
    with gr.Tabs():
        with gr.TabItem("Text Correction"):
            with gr.Row():
                with gr.Column():
                    input_text = gr.Textbox(label="Input Text (Arabic)", placeholder="أدخل النص العربي هنا...", lines=5)
                    use_jaccard = gr.Checkbox(label="Use Jaccard Similarity", value=True)
                    submit_btn = gr.Button("Correct Text")

                with gr.Column():
                    output_text = gr.Textbox(label="Corrected Text", lines=5)
                    correction_info = gr.Textbox(label="Correction Info", lines=5)

            # Add examples
            examples = [
                ["مرحبا بالعلم"],  # Hello to science (should be "world" - العالم)
                ["انا احب القراه"],  # I love reading (misspelled - should be القراءة)
                ["هذا كتب جميل"],  # This is a beautiful book (grammar error - should be كتاب)
                ["المدرسه كبيره"],  # The school is big (missing dots on ة - should be المدرسة كبيرة)
                ["سافرت الى القاهره امس"]  # I traveled to Cairo yesterday (missing dots - القاهرة)
            ]

            gr.Examples(examples, input_text)

            # Set up the function call
            submit_btn.click(fn=gradio_autocorrect, inputs=[input_text, use_jaccard], outputs=[output_text, correction_info])
            
        with gr.TabItem("Word Similarity Lookup"):
            with gr.Row():
                with gr.Column():
                    word_input = gr.Textbox(label="Enter Arabic Word", placeholder="أدخل كلمة عربية هنا...")
                    word_submit_btn = gr.Button("Find Similar Words")
                
                with gr.Column():
                    similar_words_output = gr.Textbox(label="Similar Words", lines=5)
            
            # Set up the function call
            word_submit_btn.click(fn=get_similar_words, inputs=word_input, outputs=similar_words_output)

# Launch the interface
demo.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://44e235ad7a0c3a38b2.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [None]:
# Create Jaccard Similarity Reference Dictionary

print("Creating Jaccard similarity reference dictionary...")
import re
from tqdm import tqdm
import pickle
import numpy as np

# Function to calculate Jaccard similarity between two words
def jaccard_similarity(word1, word2):
    """Calculate Jaccard similarity between two words (character-level)"""
    set1 = set(word1)
    set2 = set(word2)
    
    # Calculate Jaccard similarity
    intersection = len(set1.intersection(set2))
    union = len(set1.union(set2))
    
    return intersection / union if union > 0 else 0

# Extract all unique Arabic words from the dataset
all_words = set()
for text in tqdm(df['correct_words']):
    # Extract Arabic words
    words = re.findall(r'[\u0600-\u06FF]+', text)
    all_words.update(words)

print(f"Found {len(all_words)} unique Arabic words")

# Create similarity dictionary (for words that are commonly confused)
# This is a computationally intensive process, so we'll limit it
# We'll create a dictionary of similar words for each word in the dataset
similarity_threshold = 0.7  # Words with similarity above this threshold are considered similar
max_words_to_process = 5000  # Limit the number of words to process

# Get the most common words from the dataset
word_freq = {}
for text in df['correct_words']:
    words = re.findall(r'[\u0600-\u06FF]+', text)
    for word in words:
        word_freq[word] = word_freq.get(word, 0) + 1

# Sort words by frequency
common_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)
common_words = [word for word, _ in common_words[:max_words_to_process]]

print(f"Building similarity dictionary for {len(common_words)} most common words...")

# Create similarity dictionary
similarity_dict = {}
for i, word1 in enumerate(tqdm(common_words)):
    similar_words = []
    # Compare with other words
    for word2 in common_words:
        if word1 != word2:
            # Calculate similarity only if words have similar length
            # This optimization speeds up the process significantly
            if abs(len(word1) - len(word2)) <= 2:
                similarity = jaccard_similarity(word1, word2)
                if similarity > similarity_threshold:
                    similar_words.append((word2, similarity))
    
    # Sort by similarity (highest first) and keep top 5
    similar_words.sort(key=lambda x: x[1], reverse=True)
    similarity_dict[word1] = similar_words[:5]

# Save the similarity dictionary
similarity_dict_path = "./arabic-autocorrect-model/similarity_dict.pkl"
with open(similarity_dict_path, "wb") as f:
    pickle.dump(similarity_dict, f)

print(f"Similarity dictionary saved to {similarity_dict_path}")

# Save to Google Drive as well
drive_similarity_dict_path = "/content/drive/MyDrive/NTI Project/arabic-autocorrect-model/similarity_dict.pkl"
!cp {similarity_dict_path} {drive_similarity_dict_path}
print(f"Similarity dictionary saved to Google Drive at {drive_similarity_dict_path}")


In [None]:
# Download the saved model with all dependencies to local machine

# First, compress the model folder and its dependencies
!zip -r arabic_autocorrect_model.zip ./arabic-autocorrect-model

# Install required packages for the model to run locally
with open("requirements.txt", "w") as f:
    f.write("""transformers==4.54.1
torch>=2.0.0
numpy>=2.0.0
pandas>=2.2.0
nltk>=3.9.0
gradio>=5.0.0
tqdm>=4.0.0
pickle5; python_version < '3.8'
""")

# Create a simple README file with instructions
with open("README.md", "w") as f:
    f.write("""# Arabic Autocorrect BERT Model with Jaccard Similarity

This package contains a fine-tuned BERT model for Arabic text autocorrection, enhanced with Jaccard similarity for word comparison.

## Setup Instructions

1. Install the required dependencies:
   ```
   pip install -r requirements.txt
   ```

2. Extract the model files:
   ```
   unzip arabic_autocorrect_model.zip
   ```

3. Run the Gradio interface:
   ```python
   python run_arabic_autocorrect.py
   ```

## Model Information
- Base model: twitter/twhin-bert-large
- Fine-tuned on Arabic text correction dataset
- Uses masked language modeling for text correction
- Enhanced with Jaccard similarity for word comparison

## Features
- BERT-based text correction
- Jaccard similarity for finding similar words
- Interactive Gradio interface with two modes:
  - Text correction mode
  - Word similarity lookup mode
""")

# Create a simple script to run the model locally
with open("run_arabic_autocorrect.py", "w") as f:
    f.write("""
import gradio as gr
import torch
import re
import pickle
from transformers import AutoModelForMaskedLM, AutoTokenizer
import nltk
import os

# Download NLTK data
nltk.download('punkt')

# Load the model and tokenizer
model_path = "./arabic-autocorrect-model"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForMaskedLM.from_pretrained(model_path)

# Load the similarity dictionary
similarity_dict_path = os.path.join(model_path, "similarity_dict.pkl")
if os.path.exists(similarity_dict_path):
    with open(similarity_dict_path, "rb") as f:
        similarity_dict = pickle.load(f)
    print(f"Loaded similarity dictionary with {len(similarity_dict)} words")
else:
    print(f"Warning: Similarity dictionary not found at {similarity_dict_path}")
    similarity_dict = {}

def jaccard_similarity(word1, word2):
    """Calculate Jaccard similarity between two words (character-level)"""
    set1 = set(word1)
    set2 = set(word2)
    
    # Calculate Jaccard similarity
    intersection = len(set1.intersection(set2))
    union = len(set1.union(set2))
    
    return intersection / union if union > 0 else 0

def get_similar_words(word):
    """Get similar words from the similarity dictionary or calculate on the fly"""
    if word in similarity_dict:
        return similarity_dict[word]
    
    # If word not in dictionary, calculate similarity with common words
    # (This is a fallback and might be slow)
    similar_words = []
    for dict_word in list(similarity_dict.keys())[:1000]:  # Limit to first 1000 words for speed
        if abs(len(word) - len(dict_word)) <= 2:  # Only compare words of similar length
            similarity = jaccard_similarity(word, dict_word)
            if similarity > 0.7:
                similar_words.append((dict_word, similarity))
    
    # Sort by similarity and return top 5
    similar_words.sort(key=lambda x: x[1], reverse=True)
    return similar_words[:5]

def autocorrect_text(text, model, tokenizer, max_corrections=5, max_length=512, use_jaccard=True):
    """Autocorrect Arabic text by identifying and fixing potential errors."""
    # Get the device that the model is on
    device = next(model.parameters()).device

    # Tokenize the text into words
    words = re.findall(r'\\w+|[^\\w\\s]', text)
    corrections_made = 0
    corrections_details = []

    # Process words in chunks if the text is too long
    chunk_size = 50  # Process 50 words at a time
    for chunk_start in range(0, len(words), chunk_size):
        chunk_end = min(chunk_start + chunk_size, len(words))
        chunk_words = words[chunk_start:chunk_end]

        # Process each word in the chunk
        for i, word in enumerate(chunk_words):
            if len(word) <= 1 or not re.match(r'^[\\u0600-\\u06FF]+$', word):  # Skip non-Arabic or very short words
                continue

            # First try BERT model prediction
            # Get the context (words around the current word)
            context_start = max(0, i - 10)  # Take up to 10 words before
            context_end = min(len(chunk_words), i + 10)  # Take up to 10 words after
            context = chunk_words[context_start:i] + [tokenizer.mask_token] + chunk_words[i+1:context_end]

            # Create a masked version of the text with limited context
            masked_text = ' '.join(context)

            # Tokenize the masked text
            inputs = tokenizer(masked_text, return_tensors="pt", truncation=True, max_length=max_length)

            # Move inputs to the same device as the model
            inputs = {k: v.to(device) for k, v in inputs.items()}

            # Get the position of the mask token
            mask_token_index = torch.where(inputs["input_ids"][0] == tokenizer.mask_token_id)[0]
            if len(mask_token_index) == 0:
                continue

            # Get model predictions
            with torch.no_grad():
                outputs = model(**inputs)
                predictions = outputs.logits

            # Get the predicted token
            mask_token_index = mask_token_index[0].item()
            predicted_token_id = predictions[0, mask_token_index].argmax(dim=0).item()
            predicted_token = tokenizer.decode([predicted_token_id])
            
            # Check if prediction is valid Arabic
            if predicted_token != word and re.match(r'^[\\u0600-\\u06FF]+$', predicted_token):
                chunk_words[i] = predicted_token
                corrections_made += 1
                corrections_details.append(f"BERT: '{word}' → '{predicted_token}'")
                
            # If BERT didn't make a correction and Jaccard similarity is enabled, try using similarity dictionary
            elif use_jaccard and similarity_dict:
                # Get similar words from the dictionary
                similar_words = get_similar_words(word)
                if similar_words:
                    # Use the most similar word as a correction
                    most_similar_word, similarity = similar_words[0]
                    if similarity > 0.8:  # Only use if similarity is very high
                        chunk_words[i] = most_similar_word
                        corrections_made += 1
                        corrections_details.append(f"Jaccard: '{word}' → '{most_similar_word}' (similarity: {similarity:.2f})")
            
            # Stop if we've reached the maximum number of corrections
            if corrections_made >= max_corrections:
                break

        # Update the original words list with corrected words
        words[chunk_start:chunk_end] = chunk_words

        # Stop if we've reached the maximum number of corrections
        if corrections_made >= max_corrections:
            break

    corrected_text = ' '.join(words)
    return corrected_text, corrections_made, corrections_details

def gradio_autocorrect(input_text, use_jaccard=True):
    """Function for Gradio interface"""
    corrected_text, num_corrections, corrections_details = autocorrect_text(
        input_text, model, tokenizer, use_jaccard=use_jaccard
    )

    if num_corrections > 0:
        correction_info = f"Made {num_corrections} correction(s):\\n" + "\\n".join(corrections_details)
        return corrected_text, correction_info
    else:
        return corrected_text, "No corrections needed"

def get_similar_words_interface(word):
    """Interface function to get similar words for a given word"""
    if not re.match(r'^[\\u0600-\\u06FF]+$', word):
        return "Please enter an Arabic word"
    
    similar_words = get_similar_words(word)
    if not similar_words:
        return "No similar words found"
    
    result = "Similar words:\\n"
    for similar_word, similarity in similar_words:
        result += f"{similar_word} (similarity: {similarity:.2f})\\n"
    
    return result

# Create the Gradio interface
with gr.Blocks(title="Arabic Text Autocorrection with Jaccard Similarity") as demo:
    gr.Markdown("# Arabic Text Autocorrection")
    gr.Markdown("This model uses BERT and Jaccard similarity to correct Arabic text.")
    
    with gr.Tabs():
        with gr.TabItem("Text Correction"):
            with gr.Row():
                with gr.Column():
                    input_text = gr.Textbox(label="Input Text (Arabic)", placeholder="أدخل النص العربي هنا...", lines=5)
                    use_jaccard = gr.Checkbox(label="Use Jaccard Similarity", value=True)
                    submit_btn = gr.Button("Correct Text")

                with gr.Column():
                    output_text = gr.Textbox(label="Corrected Text", lines=5)
                    correction_info = gr.Textbox(label="Correction Info", lines=5)

            # Add examples
            examples = [
                ["مرحبا بالعلم"],  # Hello to science (should be "world" - العالم)
                ["انا احب القراه"],  # I love reading (misspelled - should be القراءة)
                ["هذا كتب جميل"],  # This is a beautiful book (grammar error - should be كتاب)
                ["المدرسه كبيره"],  # The school is big (missing dots on ة - should be المدرسة كبيرة)
                ["سافرت الى القاهره امس"]  # I traveled to Cairo yesterday (missing dots - القاهرة)
            ]

            gr.Examples(examples, input_text)

            # Set up the function call
            submit_btn.click(fn=gradio_autocorrect, inputs=[input_text, use_jaccard], outputs=[output_text, correction_info])
            
        with gr.TabItem("Word Similarity"):
            with gr.Row():
                with gr.Column():
                    word_input = gr.Textbox(label="Enter Arabic Word", placeholder="أدخل كلمة عربية هنا...")
                    word_submit_btn = gr.Button("Find Similar Words")
                
                with gr.Column():
                    similar_words_output = gr.Textbox(label="Similar Words", lines=5)
            
            # Set up the function call
            word_submit_btn.click(fn=get_similar_words_interface, inputs=word_input, outputs=similar_words_output)

# Launch the interface
if __name__ == "__main__":
    demo.launch()
""")

# Create a zip file with all the necessary files for local use
!zip -r arabic_autocorrect_package.zip arabic_autocorrect_model.zip requirements.txt README.md run_arabic_autocorrect.py

print("Model package with Jaccard similarity created successfully!")
print("To download to your local machine:")
print("1. Click on the folder icon in the left sidebar")
print("2. Find 'arabic_autocorrect_package.zip' and download it")
print("3. Extract the zip file on your local machine")
print("4. Follow the instructions in the README.md file")

# Optional: Create a direct download link using Google Drive
from google.colab import files
files.download('arabic_autocorrect_package.zip')
