<a href="https://colab.research.google.com/github/Lamishij/insurance/blob/main/Lamis_and_Ghada_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import re
import nltk
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem.isri import ISRIStemmer
from io import StringIO
import os

# Fetch the NLTK data packages used below: the punkt tokenizer models
# (including the separate 'punkt_tab' tables) and the stopword corpus.
for _resource in ('punkt', 'stopwords', 'punkt_tab'):
    nltk.download(_resource)

# Shared Arabic stemmer and stopword lookup used by the preprocessing step.
stemmer = ISRIStemmer()
arabic_stopwords = set(stopwords.words('arabic'))

# Matches every character to discard before tokenizing:
#   * anything outside the Arabic Unicode block that is not whitespace, and
#   * punctuation that lives *inside* the Arabic block (comma, date separator,
#     semicolon, triple dot, question mark, percent sign, decimal and thousands
#     separators, five-pointed star, full stop), which a plain range check
#     on U+0600-U+06FF would otherwise let through.
_ARABIC_NOISE_RE = re.compile(
    r'[^\u0600-\u06FF\s]|[\u060C\u060D\u061B\u061E\u061F\u066A-\u066D\u06D4]'
)


def preprocess_arabic(text):
    """Clean one Arabic document and return it as a space-joined string of stems.

    Steps, in order:
      1. Replace every non-Arabic character and every Arabic punctuation mark
         with a space.
      2. Tokenize the result with ``nltk.word_tokenize``.
      3. Drop tokens found in ``arabic_stopwords`` (checked on the raw token,
         i.e. before stemming).
      4. Stem the remaining tokens with the ISRI stemmer.

    Args:
        text: The raw document as a string.

    Returns:
        The stemmed tokens joined by single spaces (an empty string when
        nothing survives the filtering).
    """
    # Substitute a space rather than deleting, so that words separated only by
    # punctuation (e.g. "word1,word2") are not fused into a single token.
    text = _ARABIC_NOISE_RE.sub(' ', text)
    words = nltk.word_tokenize(text)
    words = [stemmer.stem(word) for word in words if word not in arabic_stopwords]
    return ' '.join(words)

def compute_tfidf_matrix(docs):
    """Build a TF-IDF table for a collection of Arabic documents.

    Each document is cleaned with ``preprocess_arabic`` and the cleaned texts
    are fitted with a default ``TfidfVectorizer``. The result is a DataFrame
    with one row per input document and one column per vocabulary term.
    """
    cleaned = list(map(preprocess_arabic, docs))
    vectorizer = TfidfVectorizer()
    weights = vectorizer.fit_transform(cleaned)
    terms = vectorizer.get_feature_names_out()
    return pd.DataFrame(weights.toarray(), columns=terms)

from google.colab import files

def load_file():
    """Upload a text file in Google Colab and return its lines as documents.

    The first uploaded file is read from disk, trying 'utf-8', then
    'windows-1256', then 'ISO-8859-6'. The first encoding that decodes the
    whole file wins.

    Returns:
        A list with one whitespace-stripped string per line of the file.
        Blank lines are kept as empty strings. An empty list is returned when
        the upload dialog yields no file.

    Raises:
        ValueError: If none of the candidate encodings can decode the file.
    """
    uploaded = files.upload()  # Opens a file upload dialog
    if not uploaded:
        # Nothing to read; return an empty result the caller can test for
        # instead of failing with IndexError on the first-key lookup.
        print("No file was uploaded.")
        return []
    file_name = next(iter(uploaded))

    encodings = ('utf-8', 'windows-1256', 'ISO-8859-6')
    for encoding in encodings:
        try:
            with open(file_name, "r", encoding=encoding) as file:
                # A decode error raised part-way through the file aborts the
                # comprehension, so a partially decoded result is never returned.
                return [line.strip() for line in file]
        except UnicodeDecodeError:
            print(f"Failed to decode with {encoding}, trying next encoding...")

    # Every candidate failed: report it explicitly rather than falling through
    # to a return that references a variable that was never assigned.
    raise ValueError(
        f"Could not decode {file_name!r} with any of: {', '.join(encodings)}"
    )

# Ask for the corpus file and, when it yields any documents, display the
# TF-IDF matrix computed from them.
docs = load_file()
if docs:
    tfidf_matrix = compute_tfidf_matrix(docs)
    print("TF-IDF Matrix:", tfidf_matrix, sep="\n")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Saving arabic.txt to arabic.txt
TF-IDF Matrix:
        ايض       بحث       بنى       جمع       حسب       حفظ       حلل  \
0  0.000000  0.274674  0.000000  0.221605  0.000000  0.274674  0.221605   
1  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
2  0.277856  0.000000  0.000000  0.224172  0.277856  0.000000  0.224172   
3  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
4  0.000000  0.000000  0.312278  0.000000  0.000000  0.000000  0.000000   

       حللو       خدم       خطب  ...       قضا       كشف       لغة       نتج  \
0  0.000000  0.274674  0.221605  ...  0.000000  0.000000  0.274674  0.000000   
1  0.000000  0.000000  0.000000  ...  0.000000  0.000000  0.000000  0.000000   
2  0.000000  0.000000  0.000000  ...  0.000000  0.000000  0.000000  0.277856   
3  0.000000  0.000000  0.000000  ...  0.000000  0.000000  0.000000  0.000000   
4  0.312278  0.000000  0.251944  ...  0.312278  0.312278  0.000000  0.000000   

        نصص       نقد