<a href="https://colab.research.google.com/github/Lamishij/insurance/blob/main/Lamis_and_Ghada_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install camel-tools
!pip install camel-tools --upgrade



In [4]:
import numpy as np
import pandas as pd
import re
import nltk
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem.isri import ISRIStemmer
from io import StringIO
import os

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

# Initialize Arabic stopwords and stemmer
arabic_stopwords = set(stopwords.words('arabic'))
stemmer = ISRIStemmer()

def normalize_arabic(text):
    """Normalize Arabic text by replacing common variations of letters."""
    text = text.replace("أ", "ا").replace("إ", "ا").replace("آ", "ا")
    text = text.replace("ى", "ي").replace("ة", "ه")
    return text

def lemmatize_arabic(word):
    """Basic lemmatization function for Arabic words."""
    # This is a placeholder for a more sophisticated lemmatization approach.
    # For now, we will just return the word as is.
    return word

def preprocess_arabic(text):
    """Preprocess Arabic text by normalizing, removing punctuation, tokenizing, and removing stopwords."""
    text = normalize_arabic(text)
    text = re.sub(r'[^\u0600-\u06FF\s]', '', text)  # Keep only Arabic characters
    words = text.split()  # Using split() for better Arabic handling
    words = [word for word in words if word not in arabic_stopwords]  # Remove stopwords but keep full words
    # Apply stemming and lemmatization
    processed_words = [(lemmatize_arabic(stemmer.stem(word)), stemmer.stem(word)) for word in words]
    return processed_words

def compute_tfidf_matrix(docs):
    """Compute and return the TF-IDF matrix for Arabic documents with improved word retention."""
    preprocessed_docs = [preprocess_arabic(doc) for doc in docs]
    print("Preprocessed Documents:", preprocessed_docs)  # Debugging output

    # Flatten the list of processed words for TF-IDF
    flat_docs = [' '.join([word[0] for word in doc]) for doc in preprocessed_docs]  # Using lemmatized words
    vectorizer = TfidfVectorizer(min_df=1, token_pattern=r'[^\s]+')  # Ensure all words are considered
    tfidf_matrix = vectorizer.fit_transform(flat_docs)

    print("Vocabulary:", vectorizer.get_feature_names_out())  # Debugging output

    return pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out()), preprocessed_docs

from google.colab import files

def load_file():
    """Upload and read a text file in Google Colab, trying different encodings."""
    uploaded = files.upload()  # Opens a file upload dialog
    file_name = list(uploaded.keys())[0]

    for encoding in ['utf-8', 'windows-1256', 'ISO-8859-6']:
        try:
            with open(file_name, "r", encoding=encoding) as file:
                docs = file.readlines()
            break  # If successful, exit the loop
        except UnicodeDecodeError:
            print(f"Failed to decode with {encoding}, trying next encoding...")

    return [doc.strip() for doc in docs]

# Load Arabic documents from file
docs = load_file()
if docs:
    tfidf_matrix, processed_docs = compute_tfidf_matrix(docs)
    print("TF-IDF Matrix:")
    print(tfidf_matrix)

    # Save the lemmatized and stemmed words to a file
    with open('processed_words.txt', 'w', encoding='utf-8') as f:
        for doc in processed_docs:
            for word, stem in doc:
                f.write(f"Lemmatized: {word}, Stemmed: {stem}\n")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Saving arabic.txt to arabic.txt
Preprocessed Documents: [[('حلل', 'حلل'), ('خطب', 'خطب'), ('نقد', 'نقد'), ('هجي', 'هجي'), ('بحث', 'بحث'), ('درس', 'درس'), ('خدم', 'خدم'), ('لغه', 'لغه'), ('خلق', 'خلق'), ('حفظ', 'حفظ'), ('علي', 'علي'), ('سلط', 'سلط'), ('هيم', 'هيم'), ('عدم', 'عدم'), ('ساا', 'ساا'), ('جمع', 'جمع')], [], [('قصر', 'قصر'), ('علي', 'علي'), ('حلل', 'حلل'), ('نصص', 'نصص'), ('حسب،', 'حسب،'), ('درس', 'درس'), ('ايض', 'ايض'), ('سيق', 'سيق'), ('جمع', 'جمع'), ('سيس', 'سيس'), ('نتج', 'نتج'), ('نصص', 'نصص'), ('سهل', 'سهل')], [], [('يتب', 'يتب'), ('حللو', 'حللو'), ('خطب', 'خطب'), ('نقد', 'نقد'), ('وقف', 'وقف'), ('نقد', 'نقد'), ('قضا', 'قضا'), ('يسع', 'يسع'), ('الي', 'الي'), ('كشف', 'كشف'), ('قام', 'قام')]]
Vocabulary: ['الي' 'ايض' 'بحث' 'جمع' 'حسب،' 'حفظ' 'حلل' 'حللو' 'خدم' 'خطب' 'خلق' 'درس'
 'ساا' 'سلط' 'سهل' 'سيس' 'سيق' 'عدم' 'علي' 'قام' 'قصر' 'قضا' 'كشف' 'لغه'
 'نتج' 'نصص' 'نقد' 'هجي' 'هيم' 'وقف' 'يتب' 'يسع']
TF-IDF Matrix:
        الي       ايض       بحث       جمع      حسب،       حف