<a href="https://colab.research.google.com/github/Lujain-M02/2023-GP1-6/blob/main/scoring_system.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pyarabic
!pip install stanza
!pip install nltk
!pip install transformers

Collecting pyarabic
  Downloading PyArabic-0.6.15-py3-none-any.whl (126 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.4/126.4 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarabic
Successfully installed pyarabic-0.6.15
Collecting stanza
  Downloading stanza-1.6.1-py3-none-any.whl (881 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m881.2/881.2 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting emoji (from stanza)
  Downloading emoji-2.8.0-py2.py3-none-any.whl (358 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m358.9/358.9 kB[0m [31m42.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: emoji, stanza
Successfully installed emoji-2.8.0 stanza-1.6.1


# Feature 1: Named Entity Recognition (PER, LOC, ORG)
# Feature 2: Title Words




In [None]:
import stanza
from pyarabic.araby import sentence_tokenize, strip_tashkeel
import re
import nltk
from nltk.stem.isri import ISRIStemmer

# Module-level ISRI stemmer for Arabic; stem_tokens() reads this global,
# so this cell must run before any function that stems tokens.
stemmer = ISRIStemmer()

def stem_tokens(tokens):
    """Return the stem of every token, in input order.

    Uses the module-level ``stemmer``; an empty input yields an empty list.
    """
    stems = []
    for tok in tokens:
        stems.append(stemmer.stem(tok))
    return stems


def calculate_clause_scores_arabic(story, title):
    """Print and return the named-entity (NER) score of each clause in ``story``.

    The story is split into clauses with ``sentence_tokenize``; every
    non-blank clause is run through the Stanza Arabic pipeline and scored by
    the number of detected entities whose type is PER, ORG or LOC.

    Args:
        story: Text to split into clauses and score.
        title: Accepted for interface compatibility; this NER-only version
            does not use it (the title-word score was disabled here).

    Returns:
        A list with one integer NER score per non-blank clause, in clause
        order. (Previously the list was returned empty because the computed
        scores were never appended.)
    """
    # NOTE: the pipeline (and its models) is built on every call.
    nlp = stanza.Pipeline(lang='ar', processors='tokenize,mwt,pos,lemma,depparse,ner')
    meaningful_ner_labels = ('PER', 'ORG', 'LOC')

    scores = []
    for clause in sentence_tokenize(story):
        if not clause.strip():  # skip whitespace-only clauses
            continue

        clause_doc = nlp(clause)
        # Keep only entities of the types that count towards the score.
        meaningful_entities = [
            (ent.text, ent.type)
            for sent in clause_doc.sentences
            for ent in sent.ents
            if ent.type in meaningful_ner_labels
        ]
        ner_score = len(meaningful_entities)
        scores.append(ner_score)

        # Print NER score and detected NER types
        print(f"Clause: {clause}")
        print(f"NER Score: {ner_score}")
        for entity, ner_type in meaningful_entities:
            print(f"Detected Entity: {entity}, Type: {ner_type}")
        print("\n")

    return scores

def calculate_title_word_score(clause_tokens, stemmed_title_words):
    """Count the clause tokens that also appear among the stemmed title words.

    Every matching occurrence counts, so a title word repeated in the clause
    is counted once per repetition.

    Args:
        clause_tokens: Iterable of (stemmed) tokens from one clause.
        stemmed_title_words: Container of stemmed title words to match against.

    Returns:
        The number of clause tokens found in ``stemmed_title_words``.
    """
    # The original iterated over an undefined name (``words_in_clause``),
    # which raised NameError on every call.
    return sum(token in stemmed_title_words for token in clause_tokens)


def calculate_similarity_matrix(clauses):
    """Build an n x n matrix comparing the stemmed token sets of the clauses.

    Entry ``[i][j]`` is the number of stems that occur in exactly one of the
    two clauses divided by the number of distinct stems in both together.
    Despite the function name, a larger value therefore means the two clauses
    share *fewer* stems. The diagonal, and any pair involving a
    whitespace-only clause, stays 0.

    Args:
        clauses: List of clause strings (tokens are obtained with ``split()``).

    Returns:
        A list of ``len(clauses)`` rows, each a list of ``len(clauses)`` numbers.
    """
    stem_sets = [set(stem_tokens(text.split())) for text in clauses]
    has_text = [bool(text.strip()) for text in clauses]
    size = len(stem_sets)

    matrix = []
    for row in range(size):
        row_values = [0] * size
        for col in range(size):
            if row == col or not (has_text[row] and has_text[col]):
                continue
            only_in_one = stem_sets[row] ^ stem_sets[col]
            in_either = stem_sets[row] | stem_sets[col]
            row_values[col] = len(only_in_one) / len(in_either) if in_either else 0
        matrix.append(row_values)

    return matrix




def score_clauses(all_clauses):
    """Score every clause by the sum of its row in the similarity matrix.

    For each group of clauses a matrix is built with
    ``calculate_similarity_matrix``; a clause's score is the sum of its row.
    Non-blank clauses from all groups are then printed together, highest
    score first.

    Args:
        all_clauses: Iterable of clause lists (one list per text).

    Returns:
        A flat list of the row sums of every group, in input order, including
        the entries for whitespace-only clauses.
    """
    flat_scores = []
    scored = []
    for group in all_clauses:
        row_totals = [sum(row) for row in calculate_similarity_matrix(group)]
        flat_scores += row_totals

        # Keep (clause, score) pairs only for clauses that contain text.
        for text, total in zip(group, row_totals):
            if text.strip():
                scored.append((text, total))

    # Highest score first; ties keep their original relative order.
    scored.sort(key=lambda pair: pair[1], reverse=True)

    for text, total in scored:
        print(f"Clause: {text}\nSimilarity Score: {total}\n")

    return flat_scores


# Example usage
story = """
ذات يوم من الأيام كان هناك شقيقين يعملان سوياً ويعيشان معًا في مزرعتهما، وكان الشقيق الأكبر رجل متزوج وعنده أسرة كبيرة، وأما الشقيق الصغير كان غير متزوج، وفي نهاية كل موسم ويوم كان الأخان يتقاسمان كل المكاسب والحصاد بالتساوي، وكان الشقيقين يستطيعان الربح من أي شيء ينتجانه من المزرعة.

وفي أحد الأيام حدث الأخ الصغير نفسه: ليس عدلاً أن نتشارك أنا واخي الإنتاج والربح بشكل متساوي، فأنا أعيش وحدي واحتياجاتي تعد بسيطة وكذلك قليلة، لذا كان الأخ الأصغر كل يوم يأخذ كيسًا من الحبوب من السلة خاصته كل ليلة ويتسلل به عبر المزرعة التي بين منزل الشقيقين ويقوم بوضعه في صندوق أخيه الأكبر.

الغريب أنه في خلال ذلك الوقت، قال الأخ الأكبر في ذاته: ليس من العدل أن نتقاسم أنا واخي كل الإنتاج والربح بالتساوي، فأنا بعد كل شيء رجل متزوج ولدي أولادي وزوجتي للاهتمام بي عندما أكبر بعد عمر طويل، أما أخي فليس لديه من يعوله، وقد لا يتواجد أحد للاعتناء به في المستقبل.

فكان الشقيق الأكبر مساءاً وعند كل ليلة، كان الشقيق الأكبر يأخذ كيسًا من الحبوب خاصته ويقوم بوضعه في سلة أخيه، وقد استمر هذا الحال على نفس المنوال لسنوات وكان كلا الأخوين في شك من ناحية حقيقة أن إمداداتهم من الحبوب لم تقل أبدًا!، لكن في ليلة مظلمة للغاية، قابل الشقيقان بعضهما البعض وهما يتجهان نحو سلال بعضهما بالأكياس، تفاجأ الشقيقين ثم استوعبوا ما يحدث، أسقط الشقيقين أكياسهم وعانقوا بعضهم البعض.

"""
title = "الاخوان المزارعان"
# Remove diacritics (tashkeel) from both texts before tokenizing and scoring.
story = strip_tashkeel(story)
title = strip_tashkeel(title)
# Disabled alternative: score the clauses with score_clauses() instead.
# all_clauses = [sentence_tokenize(story)]
# scores = score_clauses(all_clauses)
# Runs the NER scoring defined above; per-clause results are printed by the function.
scores = calculate_clause_scores_arabic(story, title)

INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json:   0%|   …

INFO:stanza:Loading these models for language: ar (Arabic):
| Processor | Package       |
-----------------------------
| tokenize  | padt          |
| mwt       | padt          |
| pos       | padt_charlm   |
| lemma     | padt_nocharlm |
| depparse  | padt_charlm   |
| ner       | aqmar_charlm  |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Loading: lemma
INFO:stanza:Loading: depparse
INFO:stanza:Loading: ner
INFO:stanza:Done loading processors!


Clause: 
ذات يوم من الأيام كان هناك شقيقين يعملان سويا ويعيشان معا في مزرعتهما،
NER Score: 0


Clause: وكان الشقيق الأكبر رجل متزوج وعنده أسرة كبيرة،
NER Score: 0


Clause: وأما الشقيق الصغير كان غير متزوج،
NER Score: 0


Clause: وفي نهاية كل موسم ويوم كان الأخان يتقاسمان كل المكاسب والحصاد بالتساوي،
NER Score: 0


Clause: وكان الشقيقين يستطيعان الربح من أي شيء ينتجانه من المزرعة

NER Score: 0


Clause: وفي أحد الأيام حدث الأخ الصغير نفسه:
NER Score: 0


Clause: ليس عدلا أن نتشارك أنا واخي الإنتاج والربح بشكل متساوي،
NER Score: 0


Clause: فأنا أعيش وحدي واحتياجاتي تعد بسيطة وكذلك قليلة،
NER Score: 0


Clause: لذا كان الأخ الأصغر كل يوم يأخذ كيسا من الحبوب من السلة خاصته كل ليلة ويتسلل به عبر المزرعة التي بين منزل الشقيقين ويقوم بوضعه في صندوق أخيه الأكبر

NER Score: 0


Clause: الغريب أنه في خلال ذلك الوقت،
NER Score: 0


Clause: قال الأخ الأكبر في ذاته:
NER Score: 0


Clause: ليس من العدل أن نتقاسم أنا واخي كل الإنتاج والربح بالتساوي،
NER Score: 0


Clause: فأنا بعد كل شيء رجل متزوج 

In [None]:
def stem_tokens(tokens):
    """Map each token to its stem using the module-level ``stemmer``.

    This cell redefines the helper of the same name from the earlier cell
    with the same behavior.
    """
    result = []
    for word in tokens:
        result.append(stemmer.stem(word))
    return result

def calculate_title_word_score(clause_stems, title_stems):
    """Return how many stems of the clause also occur in the title stems.

    Each occurrence in ``clause_stems`` is counted, so a repeated stem adds
    to the score every time it appears.
    """
    matches = 0
    for stem in clause_stems:
        if stem in title_stems:
            matches += 1
    return matches

def calculate_clause_scores_arabic(story, title):
    """Print and return the title-word score of each clause in ``story``.

    The title and every non-blank clause are tokenized with the Stanza Arabic
    pipeline and stemmed with ``stem_tokens``; a clause's score is the number
    of its stems that also occur among the title's stems.

    Args:
        story: Text to split into clauses (via ``sentence_tokenize``).
        title: Title whose stemmed words are looked for in each clause.

    Returns:
        A list with one integer score per non-blank clause, in clause order.
        (Previously the scores were only printed and the function returned
        ``None``.)
    """
    # NOTE: the pipeline (and its models) is built on every call.
    nlp = stanza.Pipeline(lang='ar', processors='tokenize,mwt,pos,lemma,depparse,ner')

    # Process the title once and keep its stems as a set for fast membership tests.
    title_doc = nlp(title)
    title_tokens = [word.text for sent in title_doc.sentences for word in sent.words]
    title_stems = set(stem_tokens(title_tokens))

    title_word_scores = []
    for clause in sentence_tokenize(story):
        if not clause.strip():  # skip whitespace-only clauses
            continue

        clause_doc = nlp(clause)
        clause_tokens = [word.text for sent in clause_doc.sentences for word in sent.words]
        clause_stems = stem_tokens(clause_tokens)

        title_word_score = calculate_title_word_score(clause_stems, title_stems)
        title_word_scores.append(title_word_score)

        # Print clause and its title word score
        print(f"Clause: {clause}")
        print(f"Title Word Score: {title_word_score}")

    return title_word_scores

story =  """
وجد المزارع سعيد ذات يوم كلبا على وشك الموت من الجوع، فأخذ سعد الكلب إلى بيته واعتنى به وبعد فترة طويلة وجد في الكلب الذي كان يسميه "ريكس" أفضل صديق له.

كان المزارع والكلب يقضيان اليوم معا وتعلم الكلب لغة صاحبه وبدأ يفهم كل حركاته وإشاراته وكان المزارع أيضا يفهم ما كان يريده الكلب.

وفي ذلك الوقت وصلت إلى المنطقة عصابة خطيرة من اللصوص، ففكر الفلاحون بوضع جرس في كل مزرعة يدق عند الشعور بالخطر وذلك للدفاع عن أنفسهم.

تسلح الجميع بالعصي وكانوا يأتون لمساعدة المزرعة المعتدى عليها.

وذات ليلة، بينما كان المزارع سعد يغط في نومه بسبب الجهد الكبير الذي بذله طوال اليوم، أحاط قطاع الطرق بالمنزل وحينها قفز الكلب إلى السقف ودق الجرس.

جاء الجيران بعد سماع الجرس حاملين العصي وكسروا ضلوع اللصوص وأجبروهم على الهروب.

وابتداء من تلك اللحظة، اعتبر كل سكان المنطقة الكلب صديقهم. ولم يتلق أي كلب ضربة حجر أو عصا من الأطفال الأشقياء لأنهم فهموا الاحترام الذي يستحقه كل من يساعد الإنسان بإخلاص ووفاء.

"""
# NOTE(review): this is the same title literal as in the previous example, while the
# story above is a different text — presumably a copy-paste leftover; confirm the
# intended title, otherwise the title-word scores compare against the wrong words.
title = "الاخوان المزارعان"

# Remove diacritics (tashkeel) from both texts before scoring.
story = strip_tashkeel(story)
title = strip_tashkeel(title)
# Call the function with story and title
calculate_clause_scores_arabic(story, title)

NameError: ignored

# Feature 3: Calculate Clause Differentiality

In [None]:
# NOTE(review): `story` is not defined in this cell; it is whatever an earlier cell
# last bound in the kernel, so the result depends on execution order.
# score_clauses() expects a list of clause lists, hence the single-element wrapper.
all_clauses = [sentence_tokenize(story)]
scores = score_clauses(all_clauses)

Clause: صبي ذكي يدعى أحمد.
Similarity Score: 16.0

Clause: فقرر أن يعتني به ويربيه حتى يشفى

Similarity Score: 16.0

Clause: وهكذا،
Similarity Score: 16.0

Clause: ولكنه لم ينسى صديقه الطيب.
Similarity Score: 15.933333333333334

Clause: مرت الأيام،
Similarity Score: 15.8

Clause: وأصبح الطائر صحيحا وقويا.
Similarity Score: 15.777777777777779

Clause: وفي يوم مشرق،
Similarity Score: 15.749999999999998

Clause: تعلم أحمد درسا قيما عن الرحمة والصداقة التي لا تعرف حدودا.
Similarity Score: 15.698611111111111

Clause: 
كان يا مكان،
Similarity Score: 15.673809523809524

Clause: وجد أحمد طائرا جريحا بجانب الطريق،
Similarity Score: 15.491890054390055

Clause: في قديم الزمان،
Similarity Score: 15.465476190476192

Clause: وعاشا معا في سعادة ومحبة.
Similarity Score: 15.437698412698413

Clause: فتح الطائر جناحيه وحلق في السماء،
Similarity Score: 15.38494560994561

Clause: كل يوم كان يعود إلى أحمد ليلعب معه ويظهر امتنانه

Similarity Score: 15.360897435897435

Clause: كان أحمد سعيدا جدا بصداقتهما.
Si

# Feature 4: Dependency Parsing

In [None]:

# Load the Arabic pipeline with dependency parsing
nlp = stanza.Pipeline(lang='ar', processors='tokenize,mwt,pos,lemma,depparse')

# Split the story into clauses using pyarabic
# NOTE(review): `story` comes from an earlier cell.
clauses = sentence_tokenize(story)

# Check for the presence of subject, verb, and object in each clause
for clause in clauses:
    if clause.strip():
        clause_doc = nlp(clause)
        has_subject, has_verb, has_object = False, False, False
        for sent in clause_doc.sentences:
            for word in sent.words:
                # A relation may carry a subtype ("nsubj:pass"); compare the base relation only.
                relation = (word.deprel or '').split(':')[0]
                if relation == 'nsubj':
                    has_subject = True
                elif relation in ('obj', 'iobj'):
                    has_object = True

                # Every parsed sentence has a root, and in a nominal clause the root is
                # not a verb, so "deprel == 'root'" reported a verb for every clause.
                # Decide from the part-of-speech tag instead.
                if word.upos == 'VERB':
                    has_verb = True

        # A clause scores 1 only when subject, verb and object are all present.
        clause_score = 1 if has_subject and has_verb and has_object else 0
        print(f"Clause: {clause}\nSubject: {has_subject}, Verb: {has_verb}, Object: {has_object}, Score: {clause_score}\n")


INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json:   0%|   …

INFO:stanza:Loading these models for language: ar (Arabic):
| Processor | Package       |
-----------------------------
| tokenize  | padt          |
| mwt       | padt          |
| pos       | padt_charlm   |
| lemma     | padt_nocharlm |
| depparse  | padt_charlm   |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Loading: lemma
INFO:stanza:Loading: depparse
INFO:stanza:Done loading processors!


Clause: 
كان يا مكان،
Subject: True, Verb: True, Object: False, Score: 0

Clause: في قديم الزمان،
Subject: False, Verb: True, Object: False, Score: 0

Clause: صبي ذكي يدعى أحمد.
Subject: True, Verb: True, Object: False, Score: 0

Clause: كان أحمد يعيش مع أسرته.
Subject: True, Verb: True, Object: False, Score: 0

Clause: في يوم من الأيام،
Subject: False, Verb: True, Object: False, Score: 0

Clause: وجد أحمد طائرا جريحا بجانب الطريق،
Subject: True, Verb: True, Object: True, Score: 1

Clause: فقرر أن يعتني به ويربيه حتى يشفى

Subject: False, Verb: True, Object: True, Score: 0

Clause: مرت الأيام،
Subject: True, Verb: True, Object: False, Score: 0

Clause: وأصبح الطائر صحيحا وقويا.
Subject: True, Verb: True, Object: False, Score: 0

Clause: كان أحمد سعيدا جدا بصداقتهما.
Subject: True, Verb: True, Object: False, Score: 0

Clause: وفي يوم مشرق،
Subject: False, Verb: True, Object: False, Score: 0

Clause: فتح الطائر جناحيه وحلق في السماء،
Subject: True, Verb: True, Object: True, Score: 1

Cla

# Feature 5: Part-of-Speech Tagging

In [None]:
import stanza
def calculate_pos_scores(clauses, nlp):
    """Count the NOUN / ADJ / X words of every clause, printing each word's tag.

    Args:
        clauses: List of clause strings.
        nlp: Callable that parses a clause and returns a document exposing
            ``sentences`` -> ``words`` with ``text`` and ``upos`` attributes.

    Returns:
        A list with one count per input clause; whitespace-only clauses are
        not parsed and get 0.
    """
    counted_tags = ('NOUN', 'ADJ', 'X')
    totals = []
    for text in clauses:
        if not text.strip():
            # Blank clause: keep the list aligned with `clauses`.
            totals.append(0)
            continue

        parsed = nlp(text)
        print(f"Clause: {text}")

        hits = 0
        for sent in parsed.sentences:
            for token in sent.words:
                print(f"Word: {token.text}, POS Type: {token.upos}")
                if token.upos in counted_tags:
                    hits += 1

        totals.append(hits)
        print(f"POS Score: {hits}\n")

    return totals

def calculate_clause_scores_arabic(story, title):
    """Compute the POS score and the NER score of each clause in ``story``.

    The story is split with ``sentence_tokenize``. POS scores come from
    ``calculate_pos_scores`` (which also prints every word's tag); the NER
    score of a clause is 1 when at least one PER, ORG or LOC entity is
    detected in it and 0 otherwise.

    Args:
        story: Text to split into clauses and score.
        title: Accepted for interface compatibility; not used by this version
            (the title-word score is disabled here).

    Returns:
        A list with one dict per non-blank clause, in clause order, with the
        keys ``'clause'``, ``'pos_score'`` and ``'ner_score'``. (Previously
        the computed scores were discarded and the function returned ``None``.)
    """
    # NOTE: the pipeline (and its models) is built on every call.
    nlp = stanza.Pipeline(lang='ar', processors='tokenize,mwt,pos,lemma,depparse,ner')

    clauses = sentence_tokenize(story)

    # calculate_pos_scores returns one entry per clause (0 for blank ones),
    # so it can be zipped with `clauses` directly.
    pos_scores = calculate_pos_scores(clauses, nlp)

    meaningful_ner_labels = ('PER', 'ORG', 'LOC')
    clause_scores = []
    for clause, pos_score in zip(clauses, pos_scores):
        if not clause.strip():  # skip whitespace-only clauses
            continue

        clause_doc = nlp(clause)
        ner_detected = any(
            ent.type in meaningful_ner_labels
            for sent in clause_doc.sentences
            for ent in sent.ents
        )
        clause_scores.append({
            'clause': clause,
            'pos_score': pos_score,
            'ner_score': 1 if ner_detected else 0,
        })

    return clause_scores

story =  """
وجد المزارع سعيد ذات يوم كلبا على وشك الموت من الجوع، فأخذ سعد الكلب إلى بيته واعتنى به وبعد فترة طويلة وجد في الكلب الذي كان يسميه "ريكس" أفضل صديق له.

كان المزارع والكلب يقضيان اليوم معا وتعلم الكلب لغة صاحبه وبدأ يفهم كل حركاته وإشاراته وكان المزارع أيضا يفهم ما كان يريده الكلب.

وفي ذلك الوقت وصلت إلى المنطقة عصابة خطيرة من اللصوص، ففكر الفلاحون بوضع جرس في كل مزرعة يدق عند الشعور بالخطر وذلك للدفاع عن أنفسهم.

تسلح الجميع بالعصي وكانوا يأتون لمساعدة المزرعة المعتدى عليها.

وذات ليلة، بينما كان المزارع سعد يغط في نومه بسبب الجهد الكبير الذي بذله طوال اليوم، أحاط قطاع الطرق بالمنزل وحينها قفز الكلب إلى السقف ودق الجرس.

جاء الجيران بعد سماع الجرس حاملين العصي وكسروا ضلوع اللصوص وأجبروهم على الهروب.

وابتداء من تلك اللحظة، اعتبر كل سكان المنطقة الكلب صديقهم. ولم يتلق أي كلب ضربة حجر أو عصا من الأطفال الأشقياء لأنهم فهموا الاحترام الذي يستحقه كل من يساعد الإنسان بإخلاص ووفاء.

"""
# NOTE(review): `title` is not set in this cell; it is reused from an earlier cell.
calculate_clause_scores_arabic(story, title)

INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json:   0%|   …

INFO:stanza:Loading these models for language: ar (Arabic):
| Processor | Package       |
-----------------------------
| tokenize  | padt          |
| mwt       | padt          |
| pos       | padt_charlm   |
| lemma     | padt_nocharlm |
| depparse  | padt_charlm   |
| ner       | aqmar_charlm  |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Loading: lemma
INFO:stanza:Loading: depparse
INFO:stanza:Loading: ner
INFO:stanza:Done loading processors!


Clause: 
وجد المزارع سعيد ذات يوم كلبا على وشك الموت من الجوع،
Word: و, POS Type: CCONJ
Word: جد, POS Type: VERB
Word: المزارع, POS Type: NOUN
Word: سعيد, POS Type: X
Word: ذات, POS Type: NOUN
Word: يوم, POS Type: NOUN
Word: كلبا, POS Type: NOUN
Word: على, POS Type: ADP
Word: وشك, POS Type: NOUN
Word: الموت, POS Type: NOUN
Word: من, POS Type: ADP
Word: الجوع, POS Type: NOUN
Word: ،, POS Type: PUNCT
POS Score: 8

Clause: فأخذ سعد الكلب إلى بيته واعتنى به وبعد فترة طويلة وجد في الكلب الذي كان يسميه "ريكس" أفضل صديق له

Word: ف, POS Type: CCONJ
Word: أخذ, POS Type: VERB
Word: سعد, POS Type: NOUN
Word: الكلب, POS Type: NOUN
Word: إلى, POS Type: ADP
Word: بية, POS Type: NOUN
Word: ه, POS Type: PRON
Word: و, POS Type: CCONJ
Word: اعتنى, POS Type: VERB
Word: ب, POS Type: ADP
Word: ه, POS Type: PRON
Word: و, POS Type: CCONJ
Word: بعد, POS Type: ADP
Word: فترة, POS Type: NOUN
Word: طويلة, POS Type: ADJ
Word: وجد, POS Type: VERB
Word: في, POS Type: ADP
Word: الكلب, POS Type: NOUN
Word: الذي, POS

# Feature 6: Sentiment analysis

In [None]:
from transformers import pipeline
from pyarabic.araby import sentence_tokenize
# Sentiment classifier loaded by model name through the transformers pipeline API.
model = pipeline('sentiment-analysis', model='CAMeL-Lab/bert-base-arabic-camelbert-mix-sentiment')

# NOTE(review): `story` is not defined in this cell; it comes from an earlier cell.
clauses = sentence_tokenize(story)
# Classify all clauses in one call; results are read below in step with `clauses`.
results = model(clauses)
for i, clause_result in enumerate(results):
    label = clause_result['label']
    score = clause_result['score']
    print(f"Clause {i + 1}:")
    print(f"Text: {clauses[i]}")
    print(f"Label: {label}")
    print(f"Score: {score}\n")


config.json:   0%|          | 0.00/842 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/86.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/305k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Clause 1:
Text: 
كان يا مكان،
Label: neutral
Score: 0.4297942817211151

Clause 2:
Text: في قديم الزمان،
Label: neutral
Score: 0.5913397073745728

Clause 3:
Text: صبي ذكي يدعى أحمد.
Label: neutral
Score: 0.8146809935569763

Clause 4:
Text: كان أحمد يعيش في قرية صغيرة مع أسرته.
Label: neutral
Score: 0.7267364859580994

Clause 5:
Text: في يوم من الأيام،
Label: neutral
Score: 0.6943511962890625

Clause 6:
Text: وجد أحمد طائرا جريحا بجانب الطريق،
Label: negative
Score: 0.6184368133544922

Clause 7:
Text: فقرر أن يعتني به ويربيه حتى يشفى

Label: positive
Score: 0.6280108690261841

Clause 8:
Text: مرت الأيام،
Label: negative
Score: 0.6302985548973083

Clause 9:
Text: وأصبح الطائر صحيحا وقويا.
Label: positive
Score: 0.9777284264564514

Clause 10:
Text: كان أحمد سعيدا جدا بصداقتهما.
Label: positive
Score: 0.9867877960205078

Clause 11:
Text: وفي يوم مشرق،
Label: positive
Score: 0.9843719601631165

Clause 12:
Text: فتح الطائر جناحيه وحلق في السماء،
Label: positive
Score: 0.565030038356781

Claus

In [None]:
from transformers import pipeline
from pyarabic.araby import sentence_tokenize

# Second sentiment model, tried as an alternative; this rebinds the global `model`.
model = pipeline('text-classification', model='Ammar-alhaj-ali/arabic-MARBERT-sentiment')

# Quick sanity check on three hand-written sentences before scoring the story.
sentences = ['استمتع الرجل بالحفلة', 'خدمة المطعم كانت محبطة','بكى الرجل بحزن']
print(f"{model(sentences)}\n")

# NOTE(review): `clauses` is not defined in this cell; it is reused from an earlier cell.
results = model(clauses)
for i, clause_result in enumerate(results):
    label = clause_result['label']
    score = clause_result['score']
    print(f"Clause {i + 1}:")
    print(f"Text: {clauses[i]}")
    print(f"Label: {label}")
    print(f"Score: {score}\n")

# Feature 7: Clause Length

In [None]:
def calculate_normalized_clause_length(clauses):
    """Return each clause's word count divided by the longest clause's word count.

    Words are counted with ``str.split()``.

    Args:
        clauses: List of clause strings.

    Returns:
        A list of floats in ``[0, 1]``, one per clause. An empty input gives
        an empty list, and when every clause is blank all scores are 0.0.
    """
    # Find the length of each clause
    clause_lengths = [len(clause.split()) for clause in clauses]

    # Length of the longest clause. ``default=0`` covers an empty list (max()
    # would raise ValueError) and ``or 1`` avoids dividing by zero when all
    # clauses are blank.
    max_clause_length = max(clause_lengths, default=0) or 1

    # Calculate the normalized length for each clause
    return [length / max_clause_length for length in clause_lengths]

# NOTE(review): `clauses` is not defined in this cell; it is reused from an earlier cell.
normalized_lengths = calculate_normalized_clause_length(clauses)

# Print every clause with its normalized length, numbering clauses from 1.
for i, (clause, score) in enumerate(zip(clauses, normalized_lengths), 1):
    print(f"Clause{i}: {clause}\n Score: {score}\n")

Clause1: 
كان يا مكان في قديم الزمان،
 Score: 0.35294117647058826

Clause2: كان في غابة طاووس وفيل يتبادلان الكلام،
 Score: 0.4117647058823529

Clause3: وكل من الفيل والطاووس يظن أنه الأفضل من الآخر؛ فالطاووس يعتقد أنه هو الأفضل مفتخرا بجمال ريشه.
 Score: 1.0

Clause4: بينما الفيل فيرى أنه هو الأحسن مغرورا بقوة جسمه وطول خرطومه

 Score: 0.6470588235294118

Clause5: ظل الفيل والطاووس يناقشان الأمر دون أن يستسلم أحدهما للآخر إلى أن أحسا بالجوع،
 Score: 0.8235294117647058

Clause6: فتوجه الفيل والطاووس معا إلى حظيرة قريبة من الغابة،
 Score: 0.5294117647058824

Clause7: ودخل الفيل والطاووس إليها عبر ثقب في الحائط.
 Score: 0.47058823529411764

Clause8: فأخذ الفيل والطاووس يأكلان بفرح وسرور ما طاب ولذ من الأكل

 Score: 0.6470588235294118

Clause9: وبعد انتهاء الفيل والطاووس من الأكل والشرب،
 Score: 0.4117647058823529

Clause10: تابع الفيل والطاووس جدالهما دون توقف،
 Score: 0.35294117647058826

Clause11: إلى أن فاجأهما صاحبا الحظيرة اللذان كانا يحملان بيدهما بندقية،
 Score: 0.5882352941176471

# Feature 8: Noun Weight

In [None]:
!pip install stanza
!pip install python-bidi
!pip install pyarabic

Collecting python-bidi
  Downloading python_bidi-0.4.2-py2.py3-none-any.whl (30 kB)
Installing collected packages: python-bidi
Successfully installed python-bidi-0.4.2


In [None]:
from collections import Counter
from nltk.tokenize import word_tokenize
from bidi.algorithm import get_display
from pyarabic.araby import sentence_tokenize
import stanza
import nltk

# Fetch the NLTK 'punkt' tokenizer data; word_tokenize is used below in calculate_word_probability().
nltk.download('punkt')

def stem_tokens(tokens):
    """Stem every token with the module-level ``stemmer`` and return the stems as a list.

    NOTE: ``stemmer`` is not created in this cell; it must already exist in
    the kernel from an earlier cell.
    """
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

def calculate_word_probability(story):
    """Return the relative frequency of every stemmed token in ``story``.

    The story is tokenized with ``word_tokenize`` and each token is stemmed;
    a stem's probability is its count divided by the total number of tokens.

    Returns:
        A dict mapping stem -> probability (empty for a story with no tokens).
    """
    stems = stem_tokens(word_tokenize(story))
    n_stems = len(stems)

    probabilities = {}
    for stem, occurrences in Counter(stems).items():
        probabilities[stem] = occurrences / n_stems

    return probabilities

def calculate_noun_score(clause, word_probabilities, nlp):
    """Score a clause by the average story-level probability of its nouns.

    Args:
        clause: Clause text to analyse.
        word_probabilities: Dict mapping stem -> probability, as produced by
            ``calculate_word_probability``.
        nlp: Callable that parses the clause and returns a document exposing
            ``sentences`` -> ``words`` with ``text`` and ``upos`` attributes.

    Returns:
        A tuple ``(stemmed_nouns, noun_score)``. When the clause has no nouns
        this is ``([], 0)``. Stems missing from ``word_probabilities``
        contribute 0 to the average.
    """
    clause_doc = nlp(clause)
    # Match the NOUN tag exactly: the previous ``startswith('N')`` test also
    # accepted 'NUM', so numerals were counted as nouns.
    nouns = [
        word.text
        for sentence in clause_doc.sentences
        for word in sentence.words
        if word.upos == 'NOUN'
    ]

    # Check if there are no nouns in the clause
    if not nouns:
        return nouns, 0

    # Stem the nouns so they match the (stemmed) keys of word_probabilities
    stemmed_nouns = stem_tokens(nouns)

    # Average probability over all nouns of the clause
    noun_score = sum(word_probabilities.get(noun, 0) for noun in stemmed_nouns) / len(stemmed_nouns)

    return stemmed_nouns, noun_score

# # Example usage
story = """
كان هناك تاجر يعمل كل يوم من الصباح حتى المساء وكان لديه بنت صغيرة تجلس في المنزل الى حين عودة والدها من العمل وكان التاجر يأمن ابنته عند الكلب ويحميها من أي خطر.

في يوم من الأيام عاد التاجر من العمل ووجد الكلب خارج المنزل وفمه ملطخ بالدم. انصدم الأب وتوقع أن الكلب أكل ابنته، فقتل الأب الكلب. ولكن عندما دخل الأب المنزل، وجد ابنته بخير وكانت تلعب. تبين أن الكلب قد قتل الشخص الذي كاد أن يتعرض لها.

عندما أدرك الأب الخطأ الفادح الذي ارتكبه، شعر بالندم الشديد على قتل الكلب الذي كان في الحقيقة يحمي ابنته. تعلم الأب درسًا قاسيًا عن عدم التسرع في الحكم على الأمور دون معرفة كل الحقائق. ومن ذلك اليوم فصاعدًا، قرر أن يكرس المزيد من وقته لكونه والدًا متفهمًا ومتأنيًا، وأن يحمي ابنته بنفسه، متذكرًا الوفاء والشجاعة التي أظهرها كلبه الأمين."""

# Tokenize the story into clauses
# NOTE(review): unlike the earlier examples, this cell does not call strip_tashkeel()
# on the story first — confirm whether diacritics should be removed here too.
clauses = sentence_tokenize(story)

# Calculate the word probabilities for the entire story
word_probabilities = calculate_word_probability(story)

# Load the Arabic pipeline
nlp = stanza.Pipeline(lang='ar', processors='tokenize,mwt,pos,lemma,depparse,ner')

# Print each noun and its score
for i, clause in enumerate(clauses):
    nouns, noun_score = calculate_noun_score(clause, word_probabilities, nlp)
    print(f"\nNouns in Clause {i + 1}:\n{nouns}\nNoun Score: {noun_score}\n")

    # Print the entire clause and its score (the total equals the noun score in this demo)
    print(f"Clause {i + 1}:{clause}\nTotal Score: {noun_score}\n")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json:   0%|   …

INFO:stanza:Loading these models for language: ar (Arabic):
| Processor | Package       |
-----------------------------
| tokenize  | padt          |
| mwt       | padt          |
| pos       | padt_charlm   |
| lemma     | padt_nocharlm |
| depparse  | padt_charlm   |
| ner       | aqmar_charlm  |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Loading: lemma
INFO:stanza:Loading: depparse
INFO:stanza:Loading: ner
INFO:stanza:Done loading processors!



Nouns in Clause 1:
['تجر', 'كل', 'يوم', 'صبح', 'ساء', 'بنت', 'نزل', 'حين', 'عود', 'ولد', 'عمل', 'تجر', 'ابن', 'كلب', 'خطر']
Noun Score: 0.01506849315068493

Clause 1:
كان هناك تاجر يعمل كل يوم من الصباح حتى المساء وكان لديه بنت صغيرة تجلس في المنزل الى حين عودة والدها من العمل وكان التاجر يأمن ابنته عند الكلب ويحميها من أي خطر

Total Score: 0.01506849315068493


Nouns in Clause 2:
['يوم', 'ايم', 'تجر', 'عمل', 'كلب', 'نزل', 'فم', 'لطخ', 'لدم']
Noun Score: 0.015220700152207

Clause 2:في يوم من الأيام عاد التاجر من العمل ووجد الكلب خارج المنزل وفمه ملطخ بالدم.
Total Score: 0.015220700152207


Nouns in Clause 3:
['لأب', 'كلب', 'ابن']
Noun Score: 0.0365296803652968

Clause 3:انصدم الأب وتوقع أن الكلب أكل ابنته،
Total Score: 0.0365296803652968


Nouns in Clause 4:
['لأب', 'كلب']
Noun Score: 0.0410958904109589

Clause 4:فقتل الأب الكلب.
Total Score: 0.0410958904109589


Nouns in Clause 5:
['لأب', 'نزل']
Noun Score: 0.023972602739726026

Clause 5:ولكن عندما دخل الأب المنزل،
Total Score: 0.023

# Integrated code: all features combined, with sentence-level processing added

In [None]:
import stanza
from pyarabic.araby import sentence_tokenize as arabic_sentence_tokenize, strip_tashkeel
import nltk
from nltk.stem.isri import ISRIStemmer
from transformers import pipeline
from collections import Counter
from nltk.tokenize import word_tokenize

# Initialize the Arabic stemmer (read as a global by stem_tokens below)
stemmer = ISRIStemmer()
# Tokenizer data needed by word_tokenize in calculate_word_probability
nltk.download('punkt')

# Initialize Stanza pipeline for Arabic once; stanza_sentence_tokenize reads this
# global, and the other functions below receive the pipeline as an `nlp` argument.
nlp = stanza.Pipeline(lang='ar', processors='tokenize,mwt,pos,lemma,depparse,ner')

def stem_tokens(tokens):
    """Return a list with the stem of each token, produced by the global ``stemmer``."""
    output = []
    for entry in tokens:
        output.append(stemmer.stem(entry))
    return output

def calculate_word_probability(story):
    """Map every stemmed token of ``story`` to its relative frequency.

    Tokens come from ``word_tokenize`` and are stemmed with ``stem_tokens``;
    each stem's count is divided by the total number of tokens.
    """
    stems = stem_tokens(word_tokenize(story))
    n_stems = len(stems)
    frequencies = {}
    for stem, occurrences in Counter(stems).items():
        frequencies[stem] = occurrences / n_stems
    return frequencies

def calculate_noun_score(clause, word_probabilities, nlp):
    """Calculate the noun score for a clause.

    The score is the average, over the clause's nouns, of each stemmed noun's
    probability in ``word_probabilities`` (0 for stems not present there).

    Args:
        clause: Clause text to analyse.
        word_probabilities: Dict mapping stem -> probability.
        nlp: Callable that parses the clause and returns a document exposing
            ``sentences`` -> ``words`` with ``text`` and ``upos`` attributes.

    Returns:
        ``(stemmed_nouns, noun_score)``; ``([], 0)`` when the clause has no nouns.
    """
    clause_doc = nlp(clause)
    # Exact NOUN match: ``startswith('N')`` also accepted 'NUM', which counted
    # numerals as nouns.
    nouns = [
        word.text
        for sentence in clause_doc.sentences
        for word in sentence.words
        if word.upos == 'NOUN'
    ]
    if not nouns:
        return nouns, 0
    stemmed_nouns = stem_tokens(nouns)
    noun_score = sum(word_probabilities.get(noun, 0) for noun in stemmed_nouns) / len(stemmed_nouns)
    return stemmed_nouns, noun_score

def calculate_title_word_score(clause, title, nlp):
    """Count the clause's content-word stems that also occur in the title.

    Both texts are parsed with ``nlp``; only words tagged NOUN, X or ADJ are
    kept and stemmed. Every matching clause stem counts, including repeats.

    NOTE: the title is re-parsed on every call.
    """
    content_tags = ['NOUN', 'X', 'ADJ']

    def content_stems(doc):
        # Stems of the words in `doc` whose POS tag is one of `content_tags`.
        kept = [w.text for s in doc.sentences for w in s.words if w.upos in content_tags]
        return stem_tokens(kept)

    title_doc = nlp(title)
    clause_doc = nlp(clause)

    title_stems = set(content_stems(title_doc))

    hits = 0
    for stem in content_stems(clause_doc):
        if stem in title_stems:
            hits += 1
    return hits

def stanza_sentence_tokenize(text):
    """Split ``text`` into sentences with the global Stanza pipeline ``nlp``.

    Returns the ``text`` attribute of every sentence in the parsed document.
    """
    return [sent.text for sent in nlp(text).sentences]

def calculate_similarity_matrix(clauses=None):
    """Calculate the pairwise matrix comparing the stemmed token sets of clauses.

    Entry ``[i][j]`` is the number of stems found in exactly one of the two
    clauses divided by the number of distinct stems in both together, so a
    larger value means the clauses share fewer stems. The diagonal and any
    pair involving a whitespace-only clause stay 0.

    Args:
        clauses: Optional list of clause strings to compare. When omitted,
            the clauses of the global ``story`` are used, which is what the
            function always did before. Callers that score a specific set of
            clauses should pass it, so the matrix indices line up with theirs.

    Returns:
        A list of n rows, each a list of n numbers, where n is the number of clauses.
    """
    if clauses is None:
        # Legacy behavior: fall back to the notebook-level `story`.
        clauses = arabic_sentence_tokenize(story)

    stem_sets = [set(stem_tokens(clause.split())) for clause in clauses]
    non_blank = [bool(clause.strip()) for clause in clauses]
    n = len(stem_sets)
    similarity_matrix = [[0 for _ in range(n)] for _ in range(n)]

    for i in range(n):
        for j in range(n):
            if i != j and non_blank[i] and non_blank[j]:
                total_unique_tokens = len(stem_sets[i] ^ stem_sets[j])
                total_tokens = len(stem_sets[i] | stem_sets[j])
                similarity_matrix[i][j] = total_unique_tokens / total_tokens if total_tokens else 0

    return similarity_matrix

def calculate_normalized_clause_length(clauses):
    """Calculate normalized clause length.

    Each clause's word count (``str.split()``) is divided by the word count
    of the longest clause.

    Returns:
        A list of floats in ``[0, 1]``, one per clause. An empty input gives
        an empty list; when every clause is blank all scores are 0.0.
    """
    clause_lengths = [len(clause.split()) for clause in clauses]
    # ``default=0`` handles an empty list; ``or 1`` also covers the case where
    # every clause is blank (max == 0), which used to raise ZeroDivisionError.
    max_clause_length = max(clause_lengths, default=0) or 1
    normalized_lengths = [length / max_clause_length for length in clause_lengths]
    return normalized_lengths

def calculate_pos_scores(clauses, nlp):
    """Count the words tagged NOUN, ADJ or X in each clause.

    Args:
        clauses: List of clause strings.
        nlp: Callable returning a parsed document that exposes
            `.sentences[*].words[*].upos`.

    Returns:
        list[int]: One count per clause, in input order. Whitespace-only
        clauses score 0 and are never passed to `nlp`.
    """
    content_tags = {'NOUN', 'ADJ', 'X'}

    def count_content_words(clause):
        if not clause.strip():
            return 0
        tally = 0
        for sentence in nlp(clause).sentences:
            for word in sentence.words:
                if word.upos in content_tags:
                    tally += 1
        return tally

    return [count_content_words(clause) for clause in clauses]

def _clause_dissimilarity_totals(clauses):
    """Return each clause's summed Jaccard distance to all other clauses.

    Clauses are compared as sets of stems (`stem_tokens(clause.split())`);
    the distance between two sets A and B is |A ^ B| / |A | B|.
    """
    stem_sets = [set(stem_tokens(clause.split())) for clause in clauses]
    totals = []
    for i, stems_i in enumerate(stem_sets):
        total = 0
        for j, stems_j in enumerate(stem_sets):
            if i == j:
                continue
            union_size = len(stems_i | stems_j)
            if union_size:
                total += len(stems_i ^ stems_j) / union_size
        totals.append(total)
    return totals


def process_and_score_clauses(sentences, title, nlp, sentiment_model, word_probabilities, title_stems):
    """Score every non-blank clause of every sentence and print the scores.

    Features per clause: SVO completeness (0/1), title-word matches, number
    of PER/ORG/LOC entities, sentiment confidence (0 when the label is
    'neutral'), noun score, summed dissimilarity to all other clauses, count
    of NOUN/ADJ/X words, and length normalized within the sentence. The
    overall score is their plain sum.

    The dissimilarity feature is computed once, over exactly the clauses
    scored here. Previously a story-wide matrix was rebuilt for every
    sentence and indexed with the clause's position *inside its sentence*,
    so clauses after the first sentence received the rows of other clauses.

    Args:
        sentences: Iterable of sentence strings.
        title: Story title, forwarded to `calculate_title_word_score`.
        nlp: Pipeline callable; parsed documents must expose
            `.sentences[*].words[*].deprel` and `.sentences[*].ents`.
        sentiment_model: Callable returning `[{'label': ..., 'score': ...}]`.
        word_probabilities: Stem -> probability mapping for the noun score.
        title_stems: Unused; kept so existing callers keep working.

    Returns:
        None. Results are only printed.
    """
    sentences = list(sentences)

    # Split everything up front so the cross-clause feature sees the same
    # clauses, in the same order, as the scoring loop below.
    clauses_by_sentence = [arabic_sentence_tokenize(sentence) for sentence in sentences]
    scored_clauses = [
        clause
        for clauses in clauses_by_sentence
        for clause in clauses
        if clause.strip()
    ]
    similarity_totals = _clause_dissimilarity_totals(scored_clauses)
    story_index = 0  # position of the current clause within scored_clauses

    for sentence, clauses in zip(sentences, clauses_by_sentence):
        print(f"Sentence: {sentence}")
        pos_scores = calculate_pos_scores(clauses, nlp)
        normalized_lengths = calculate_normalized_clause_length(clauses)

        for i, clause in enumerate(clauses):
            if not clause.strip():
                continue

            clause_doc = nlp(clause)
            title_word_score = calculate_title_word_score(clause, title, nlp)

            # Sentiment Analysis: only non-neutral predictions contribute.
            sentiment_result = sentiment_model(clause)[0]
            sentiment_label = sentiment_result['label']
            sentiment_score = 0
            if sentiment_label != 'neutral':
                sentiment_score = sentiment_result['score']

            # Nouns weight
            _nouns, noun_score = calculate_noun_score(clause, word_probabilities, nlp)

            # SVO Analysis
            # NOTE(review): 'root' marks the head of the parse whatever its
            # POS, so it does not by itself prove a verb is present — confirm.
            has_subject, has_verb, has_object = False, False, False
            for sent in clause_doc.sentences:
                for word in sent.words:
                    if word.deprel == 'nsubj':
                        has_subject = True
                    elif word.deprel == 'root':
                        has_verb = True
                    elif word.deprel in ['obj', 'iobj']:
                        has_object = True
            clause_svo_score = 1 if has_subject and has_verb and has_object else 0

            # NER Score
            meaningful_ner_labels = ['PER', 'ORG', 'LOC']
            ner_score = sum(
                ent.type in meaningful_ner_labels
                for sent in clause_doc.sentences
                for ent in sent.ents
            )

            # Similarity, POS, and Length Scores
            similarity_score = similarity_totals[story_index]
            story_index += 1
            pos_score = pos_scores[i]
            normalized_length = normalized_lengths[i]

            # Overall score
            overall_score = (clause_svo_score + title_word_score + ner_score + sentiment_score
                             + noun_score + similarity_score + pos_score + normalized_length)

            # Print the scores for each clause
            print(f"  Clause: {clause}")
            print(f"  SVO Score: {clause_svo_score}")
            print(f"  Title Word Score: {title_word_score}")
            print(f"  NER Score: {ner_score}")
            print(f"  Sentiment Label: {sentiment_label}")
            print(f"  Sentiment Score: {sentiment_score}")
            print(f"  Noun Score: {noun_score}")
            print(f"  Similarity Score: {similarity_score}")
            print(f"  POS Score: {pos_score}")
            print(f"  Normalized Length Score: {normalized_length}")
            print(f"  Overall Score: {overall_score}\n")

def calculate_clause_scores_arabic(story, title):
    """Run the whole clause-scoring pipeline on one story and print the scores.

    Diacritics are stripped from both texts, the story is split into
    sentences with Stanza, and each sentence is handed to
    `process_and_score_clauses`. The sentiment pipeline is created on every
    call. Uses the module-level `nlp` pipeline.

    Args:
        story: Story text.
        title: Story title.

    Returns:
        None. Scores are printed by `process_and_score_clauses`.
    """
    plain_story = strip_tashkeel(story)
    plain_title = strip_tashkeel(title)

    # Stems of every title word (forwarded to the scorer).
    title_words = []
    for sent in nlp(plain_title).sentences:
        title_words.extend(word.text for word in sent.words)
    title_stem_set = set(stem_tokens(title_words))

    probabilities = calculate_word_probability(plain_story)

    classifier = pipeline(
        'sentiment-analysis',
        model='CAMeL-Lab/bert-base-arabic-camelbert-mix-sentiment',
    )

    process_and_score_clauses(
        stanza_sentence_tokenize(plain_story),
        plain_title,
        nlp,
        classifier,
        probabilities,
        title_stem_set,
    )

# Example usage: score the clauses of "The Elephant and the Peacock" against
# its own title.
# NOTE: `story` must stay a module-level name — calculate_similarity_matrix()
# reads it as a global instead of receiving it as an argument.
story = """
كان يا مكان في قديم الزمان، كان في غابة طاووس وفيل يتبادلان الكلام، وكل من الفيل والطاووس يظن أنه الأفضل من الآخر؛ فالطاووس يعتقد أنه هو الأفضل مفتخرا بجمال ريشه. بينما الفيل فيرى أنه هو الأحسن مغرورا بقوة جسمه وطول خرطومه.

ظل الفيل والطاووس يناقشان الأمر دون أن يستسلم أحدهما للآخر إلى أن أحسا بالجوع، فتوجه الفيل والطاووس معا إلى حظيرة قريبة من الغابة، ودخل الفيل والطاووس إليها عبر ثقب في الحائط. فأخذ الفيل والطاووس يأكلان بفرح وسرور ما طاب ولذ من الأكل.

وبعد انتهاء الفيل والطاووس من الأكل والشرب، تابع الفيل والطاووس جدالهما دون توقف، إلى أن فاجأهما صاحبا الحظيرة اللذان كانا يحملان بيدهما بندقية، فقال أحدهما للآخر: سوف نقتل الفيل لخطورته، ونترك الطاووس يعيش في الحظيرة لجمال ريشه. ثم رد الرجل الثاني قائلا: أنت على صواب.

سمع الفيل والطاووس ما قاله الرجلان، فنظر الفيل والطاووس إلى بعضهما نظرة الوداع. وقتذاك صوب الرجلان بندقيتهما باتجاه الفيل وأطلقا النار عليه في آن واحد دون إصابته لحسن حظه، ففر الفيل باتجاه الغابة.

"""
title = "الفيل والطاووس"
# Prints the per-clause feature scores; this version returns nothing.
calculate_clause_scores_arabic(story, title)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json:   0%|   …

INFO:stanza:Loading these models for language: ar (Arabic):
| Processor | Package       |
-----------------------------
| tokenize  | padt          |
| mwt       | padt          |
| pos       | padt_charlm   |
| lemma     | padt_nocharlm |
| depparse  | padt_charlm   |
| ner       | aqmar_charlm  |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Loading: lemma
INFO:stanza:Loading: depparse
INFO:stanza:Loading: ner
INFO:stanza:Done loading processors!


Sentence: كان يا مكان في قديم الزمان، كان في غابة طاووس وفيل يتبادلان الكلام، وكل من الفيل والطاووس يظن أنه الأفضل من الآخر؛ فالطاووس يعتقد أنه هو الأفضل مفتخرا بجمال ريشه. بينما الفيل فيرى أنه هو الأحسن مغرورا بقوة جسمه وطول خرطومه.
  Clause: كان يا مكان في قديم الزمان،
  SVO Score: 0
  Title Word Score: 0
  NER Score: 0
  Sentiment Label: positive
  Sentiment Score: 0.558352530002594
  Noun Score: 0.008771929824561403
  Similarity Score: 18.575757575757574
  POS Score: 3
  Normalized Length Score: 0.35294117647058826
  Overall Score: 22.495823212055317

  Clause: كان في غابة طاووس وفيل يتبادلان الكلام،
  SVO Score: 0
  Title Word Score: 1
  NER Score: 1
  Sentiment Label: neutral
  Sentiment Score: 0
  Noun Score: 0.005847953216374269
  Similarity Score: 17.778987782083753
  POS Score: 4
  Normalized Length Score: 0.4117647058823529
  Overall Score: 24.19660044118248

  Clause: وكل من الفيل والطاووس يظن أنه الأفضل من الآخر؛ فالطاووس يعتقد أنه هو الأفضل مفتخرا بجمال ريشه.
  SVO Score:

# TOPSIS

In [None]:
import stanza
from pyarabic.araby import sentence_tokenize, strip_tashkeel
import nltk
from nltk.stem.isri import ISRIStemmer
from transformers import pipeline
from collections import Counter
from nltk.tokenize import word_tokenize
import numpy as np


# Module-level ISRI stemmer shared by stem_tokens().
stemmer = ISRIStemmer()
# Fetch NLTK's 'punkt' data; presumably required by word_tokenize() in
# calculate_word_probability() — TODO confirm.
nltk.download('punkt')

# Module-level Stanza pipeline for Arabic, built once and reused by every
# function in this cell that calls or receives `nlp`. The processors string
# enables tokenization, multi-word-token expansion, POS tagging,
# lemmatization, dependency parsing and NER.
nlp = stanza.Pipeline(lang='ar', processors='tokenize,mwt,pos,lemma,depparse,ner')


def stem_tokens(tokens):
    """Return the stem of every token (module-level `stemmer`), keeping order and length."""
    return list(map(stemmer.stem, tokens))

def calculate_word_probability(story):
    """Map every stem found in `story` to its relative frequency.

    The story is tokenized with `word_tokenize`, each token is stemmed, and
    a stem's probability is its count divided by the total number of tokens.

    Args:
        story: Story text.

    Returns:
        dict[str, float]: stem -> count / total tokens. Empty when the story
        yields no tokens.
    """
    stems = stem_tokens(word_tokenize(story))
    total = len(stems)
    probabilities = {}
    for stem, count in Counter(stems).items():
        probabilities[stem] = count / total
    return probabilities

def calculate_noun_score(clause, word_probabilities, nlp):
    """Average story-level probability of the nouns in `clause`.

    Args:
        clause: Clause text to score.
        word_probabilities: Mapping stem -> probability (see
            `calculate_word_probability`); unknown stems count as 0.
        nlp: Callable returning a parsed document that exposes
            `.sentences[*].words[*]` with `.text` and `.upos`.

    Returns:
        tuple: `(stemmed_nouns, noun_score)`. `noun_score` is the mean
        probability of the stemmed nouns; `([], 0)` when the clause has none.
    """
    clause_doc = nlp(clause)
    # Compare the UPOS tag exactly. The earlier `startswith('N')` test also
    # accepted NUM, so numerals were scored as nouns.
    nouns = [
        word.text
        for sentence in clause_doc.sentences
        for word in sentence.words
        if word.upos == 'NOUN'
    ]
    if not nouns:
        return nouns, 0
    stemmed_nouns = stem_tokens(nouns)
    noun_score = sum(word_probabilities.get(noun, 0) for noun in stemmed_nouns) / len(stemmed_nouns)
    return stemmed_nouns, noun_score

def calculate_title_word_score(clause, title, nlp):
    """Count the content words of `clause` whose stem also occurs in `title`.

    Both texts are parsed with `nlp`. Only words tagged NOUN, ADJ or X are
    kept, and they are compared by stem (via `stem_tokens`). A stem that
    appears several times in the clause is counted every time.

    Args:
        clause: Clause text to score.
        title: Title text that supplies the reference words.
        nlp: Callable returning a parsed document that exposes
            `.sentences[*].words[*]` with `.text` and `.upos`.

    Returns:
        int: Number of kept clause words whose stem is among the title stems.
    """
    # X is included because Stanza assigns it to most nouns and adjectives
    # that it fails to classify.
    content_tags = ['NOUN', 'X', 'ADJ']

    def content_words(doc):
        kept = []
        for sent in doc.sentences:
            for word in sent.words:
                if word.upos in content_tags:
                    kept.append(word.text)
        return kept

    title_doc = nlp(title)
    clause_doc = nlp(clause)
    reference_stems = set(stem_tokens(content_words(title_doc)))

    matches = 0
    for stem in stem_tokens(content_words(clause_doc)):
        if stem in reference_stems:
            matches += 1
    return matches

def stanza_sentence_tokenize(text):
    """Split `text` into sentence strings using the module-level `nlp` pipeline.

    Returns:
        list[str]: The `.text` of every sentence in the parsed document,
        in document order.
    """
    return [segment.text for segment in nlp(text).sentences]

def calculate_dissimilarity_matrix(clauses=None):
    """Build the pairwise clause dissimilarity matrix.

    For clauses i and j with stem sets A and B the entry is
    |A ^ B| / |A | B| (the Jaccard distance): 0 means identical stem sets,
    1 means no stem in common.

    Args:
        clauses: Optional list of clause strings to compare. When omitted,
            the module-level `story` is split with `sentence_tokenize`,
            which is what the original zero-argument call always did.

    Returns:
        list[list]: n x n symmetric matrix. The diagonal and every row/column
        that belongs to a whitespace-only clause stay 0.
    """
    if clauses is None:
        # Backward-compatible default: fall back to the global story.
        clauses = sentence_tokenize(story)

    # Stem each clause once; blank clauses become None so they are skipped.
    stem_sets = [
        set(stem_tokens(clause.split())) if clause.strip() else None
        for clause in clauses
    ]
    n = len(stem_sets)
    dissimilarity_matrix = [[0 for _ in range(n)] for _ in range(n)]

    for i in range(n):
        if stem_sets[i] is None:
            continue
        for j in range(i + 1, n):
            if stem_sets[j] is None:
                continue
            total_tokens = len(stem_sets[i] | stem_sets[j])
            total_unique_tokens = len(stem_sets[i] ^ stem_sets[j])
            distance = total_unique_tokens / total_tokens if total_tokens else 0
            # The measure is symmetric, so one computation fills both halves.
            dissimilarity_matrix[i][j] = distance
            dissimilarity_matrix[j][i] = distance

    return dissimilarity_matrix

def calculate_normalized_clause_length(clauses):
    """Return each clause's word count divided by the longest clause's word count.

    Args:
        clauses: List of clause strings; words are whitespace-separated.

    Returns:
        list[float]: One value in [0, 1] per clause, in input order. An empty
        input gives an empty list. When every clause is blank (longest length
        0) all values are 0.0 instead of raising ZeroDivisionError.
    """
    clause_lengths = [len(clause.split()) for clause in clauses]
    # `or 1` covers both the empty list and the all-blank case, where the
    # maximum would otherwise be 0 and the division below would fail.
    max_clause_length = max(clause_lengths, default=0) or 1
    return [length / max_clause_length for length in clause_lengths]

def calculate_pos_scores(clauses, nlp):
    """Count the words tagged NOUN, ADJ or X in each clause.

    Args:
        clauses: List of clause strings.
        nlp: Callable returning a parsed document that exposes
            `.sentences[*].words[*].upos`.

    Returns:
        list[int]: One count per clause, in input order. Whitespace-only
        clauses score 0 and are never passed to `nlp`.
    """
    content_tags = {'NOUN', 'ADJ', 'X'}

    def count_content_words(clause):
        if not clause.strip():
            return 0
        tally = 0
        for sentence in nlp(clause).sentences:
            for word in sentence.words:
                if word.upos in content_tags:
                    tally += 1
        return tally

    return [count_content_words(clause) for clause in clauses]

def _clause_dissimilarity_totals(clauses):
    """Return each clause's summed Jaccard distance to all other clauses.

    Clauses are compared as sets of stems (`stem_tokens(clause.split())`);
    the distance between two sets A and B is |A ^ B| / |A | B|.
    """
    stem_sets = [set(stem_tokens(clause.split())) for clause in clauses]
    totals = []
    for i, stems_i in enumerate(stem_sets):
        total = 0
        for j, stems_j in enumerate(stem_sets):
            if i == j:
                continue
            union_size = len(stems_i | stems_j)
            if union_size:
                total += len(stems_i ^ stems_j) / union_size
        totals.append(total)
    return totals


def process_and_score_clauses(sentences, title, nlp, sentiment_model, word_probabilities, title_stems):
    """Score every non-blank clause of every sentence on eight features.

    Features, in matrix column order: SVO completeness (0/1), title-word
    matches, number of PER/ORG/LOC entities, sentiment confidence (0 when
    the label is 'neutral'), noun score, summed dissimilarity to all other
    clauses, count of NOUN/ADJ/X words, and length normalized within the
    sentence. Each clause's scores are also printed.

    The dissimilarity feature is computed once, over exactly the clauses
    scored here. Previously a story-wide matrix was rebuilt for every
    sentence and indexed with the clause's position *inside its sentence*,
    so clauses after the first sentence received the rows of other clauses.

    Args:
        sentences: Iterable of sentence strings.
        title: Story title, forwarded to `calculate_title_word_score`.
        nlp: Pipeline callable; parsed documents must expose
            `.sentences[*].words[*].deprel` and `.sentences[*].ents`.
        sentiment_model: Callable returning `[{'label': ..., 'score': ...}]`.
        word_probabilities: Stem -> probability mapping for the noun score.
        title_stems: Unused; kept so existing callers keep working.

    Returns:
        list[list]: One 8-element feature row per scored clause, in reading
        order (the TOPSIS decision matrix).
    """
    sentences = list(sentences)

    # Split everything up front so the cross-clause feature sees the same
    # clauses, in the same order, as the scoring loop below.
    clauses_by_sentence = [sentence_tokenize(sentence) for sentence in sentences]
    scored_clauses = [
        clause
        for clauses in clauses_by_sentence
        for clause in clauses
        if clause.strip()
    ]
    dissimilarity_totals = _clause_dissimilarity_totals(scored_clauses)

    scores_matrix = []
    for sentence, clauses in zip(sentences, clauses_by_sentence):
        print(f"Sentence: {sentence}")
        pos_scores = calculate_pos_scores(clauses, nlp)
        normalized_lengths = calculate_normalized_clause_length(clauses)

        for i, clause in enumerate(clauses):
            if not clause.strip():
                continue

            clause_doc = nlp(clause)
            title_word_score = calculate_title_word_score(clause, title, nlp)

            # Sentiment Analysis: only non-neutral predictions contribute.
            sentiment_result = sentiment_model(clause)[0]
            sentiment_label = sentiment_result['label']
            sentiment_score = 0
            if sentiment_label != 'neutral':
                sentiment_score = sentiment_result['score']

            # Nouns weight
            _nouns, noun_score = calculate_noun_score(clause, word_probabilities, nlp)

            # SVO Analysis
            # NOTE(review): 'root' marks the head of the parse whatever its
            # POS, so it does not by itself prove a verb is present — confirm.
            has_subject, has_verb, has_object = False, False, False
            for sent in clause_doc.sentences:
                for word in sent.words:
                    if word.deprel == 'nsubj':
                        has_subject = True
                    elif word.deprel == 'root':
                        has_verb = True
                    elif word.deprel in ['obj', 'iobj']:
                        has_object = True
            clause_svo_score = 1 if has_subject and has_verb and has_object else 0

            # NER Score
            meaningful_ner_labels = ['PER', 'ORG', 'LOC']
            ner_score = sum(
                ent.type in meaningful_ner_labels
                for sent in clause_doc.sentences
                for ent in sent.ents
            )

            # Dissimilarity, POS, and Length Scores. Rows are appended once
            # per scored clause, so len(scores_matrix) is this clause's
            # position within scored_clauses.
            dissimilarity_score = dissimilarity_totals[len(scores_matrix)]
            pos_score = pos_scores[i]
            normalized_length = normalized_lengths[i]

            # Overall score
            overall_score = (clause_svo_score + title_word_score + ner_score + sentiment_score
                             + noun_score + dissimilarity_score + pos_score + normalized_length)

            # Append the scores to the matrix
            scores_matrix.append([clause_svo_score, title_word_score, ner_score, sentiment_score,
                                  noun_score, dissimilarity_score, pos_score, normalized_length])

            # Print the scores for each clause
            print(f"  Clause: {clause}")
            print(f"  SVO Score: {clause_svo_score}")
            print(f"  Title Word Score: {title_word_score}")
            print(f"  NER Score: {ner_score}")
            print(f"  Sentiment Label: {sentiment_label}")
            print(f"  Sentiment Score: {sentiment_score}")
            print(f"  Noun Score: {noun_score}")
            print(f"  Dissimilarity Score: {dissimilarity_score}")
            print(f"  POS Score: {pos_score}")
            print(f"  Normalized Length Score: {normalized_length}")
            print(f"  Overall Score: {overall_score}\n")

    return scores_matrix

def calculate_clause_scores_arabic(story, title):
    """Run the whole clause-scoring pipeline on one story.

    Diacritics are stripped from both texts, the story is split into
    sentences with Stanza, and each sentence is handed to
    `process_and_score_clauses`. The sentiment pipeline is created on every
    call. Uses the module-level `nlp` pipeline.

    Args:
        story: Story text.
        title: Story title.

    Returns:
        Whatever `process_and_score_clauses` returns: one feature row per
        scored clause.
    """
    plain_story = strip_tashkeel(story)
    plain_title = strip_tashkeel(title)

    # Stems of every title word (forwarded to the scorer).
    title_words = []
    for sent in nlp(plain_title).sentences:
        title_words.extend(word.text for word in sent.words)
    title_stem_set = set(stem_tokens(title_words))

    probabilities = calculate_word_probability(plain_story)

    classifier = pipeline(
        'sentiment-analysis',
        model='CAMeL-Lab/bert-base-arabic-camelbert-mix-sentiment',
    )

    return process_and_score_clauses(
        stanza_sentence_tokenize(plain_story),
        plain_title,
        nlp,
        classifier,
        probabilities,
        title_stem_set,
    )

# Example usage: score the clauses of the "two farmer brothers" story against
# its title.
# NOTE: `story` must stay a module-level name — calculate_dissimilarity_matrix()
# reads it as a global instead of receiving it as an argument.
story = """
ذات يوم من الأيام كان هناك شقيقين يعملان سوياً ويعيشان معًا في مزرعتهما، وكان الشقيق الأكبر رجل متزوج وعنده أسرة كبيرة، وأما الشقيق الصغير كان غير متزوج، وفي نهاية كل موسم ويوم كان الأخان يتقاسمان كل المكاسب والحصاد بالتساوي، وكان الشقيقين يستطيعان الربح من أي شيء ينتجانه من المزرعة.

وفي أحد الأيام حدث الأخ الصغير نفسه: ليس عدلاً أن نتشارك أنا واخي الإنتاج والربح بشكل متساوي، فأنا أعيش وحدي واحتياجاتي تعد بسيطة وكذلك قليلة، لذا كان الأخ الأصغر كل يوم يأخذ كيسًا من الحبوب من السلة خاصته كل ليلة ويتسلل به عبر المزرعة التي بين منزل الشقيقين ويقوم بوضعه في صندوق أخيه الأكبر.

الغريب أنه في خلال ذلك الوقت، قال الأخ الأكبر في ذاته: ليس من العدل أن نتقاسم أنا واخي كل الإنتاج والربح بالتساوي، فأنا بعد كل شيء رجل متزوج ولدي أولادي وزوجتي للاهتمام بي عندما أكبر بعد عمر طويل، أما أخي فليس لديه من يعوله، وقد لا يتواجد أحد للاعتناء به في المستقبل.

فكان الشقيق الأكبر مساءاً وعند كل ليلة، كان الشقيق الأكبر يأخذ كيسًا من الحبوب خاصته ويقوم بوضعه في سلة أخيه، وقد استمر هذا الحال على نفس المنوال لسنوات وكان كلا الأخوين في شك من ناحية حقيقة أن إمداداتهم من الحبوب لم تقل أبدًا!، لكن في ليلة مظلمة للغاية، قابل الشقيقان بعضهما البعض وهما يتجهان نحو سلال بعضهما بالأكياس، تفاجأ الشقيقين ثم استوعبوا ما يحدث، أسقط الشقيقين أكياسهم وعانقوا بعضهم البعض.
"""
title = "الاخوان المزارعان"

# Decision matrix for TOPSIS: one row per scored clause, one column per
# feature, in the order the rows are built by process_and_score_clauses().
scores_matrix = np.array(calculate_clause_scores_arabic(story, title))
print(scores_matrix)

def normalize_matrix(matrix):
    """Vector-normalize each column (criterion) of the decision matrix.

    Every column is divided by its Euclidean norm. A criterion that is 0 for
    every alternative has norm 0; dividing by it produced NaN (and a
    RuntimeWarning) that then propagated into the ranking, so such columns
    are left as all zeros instead.

    Args:
        matrix: 2-D array-like, rows = alternatives, columns = criteria.

    Returns:
        np.ndarray: Float array of the same shape with normalized columns.
        The result is also printed.
    """
    matrix = np.asarray(matrix, dtype=float)
    column_norms = np.sqrt(np.sum(matrix**2, axis=0))
    # Dividing an all-zero column by 1 keeps it all-zero and avoids 0/0.
    safe_norms = np.where(column_norms == 0, 1.0, column_norms)
    normalized_matrix = matrix / safe_norms

    print(f"\nnormalized_matrix :\n {normalized_matrix} \n")

    return normalized_matrix

def calculate_ideal_negative_ideal(normalized_matrix):
    """Return the per-criterion best and worst values of the normalized matrix.

    Every column is treated as a benefit criterion: the positive ideal is
    the column maximum and the negative ideal is the column minimum. Both
    vectors are also printed.

    Args:
        normalized_matrix: 2-D array, rows = alternatives, columns = criteria.

    Returns:
        tuple: `(positive_ideal_solution, negative_ideal_solution)`.
    """
    best_per_criterion, worst_per_criterion = (
        reduce_columns(normalized_matrix, axis=0) for reduce_columns in (np.max, np.min)
    )

    print(f"positive_ideal_solution : {best_per_criterion} \n negative_ideal_solution : {worst_per_criterion} \n")

    return best_per_criterion, worst_per_criterion

def calculate_separation_measures(normalized_matrix, positive_ideal_solution, negative_ideal_solution):
    """Euclidean distance from every alternative (row) to both ideal solutions.

    Args:
        normalized_matrix: 2-D array, rows = alternatives, columns = criteria.
        positive_ideal_solution: Per-criterion best values.
        negative_ideal_solution: Per-criterion worst values.

    Returns:
        tuple: `(positive_separation, negative_separation)`, one distance per
        row each. Both are also printed.
    """
    def distance_to(reference):
        offsets = normalized_matrix - reference
        return np.sqrt(np.sum(offsets**2, axis=1))

    positive_separation = distance_to(positive_ideal_solution)
    negative_separation = distance_to(negative_ideal_solution)

    print(f"positive_separation : {positive_separation} \n negative_separation : {negative_separation} \n")

    return positive_separation, negative_separation

def calculate_relative_closeness(positive_separation, negative_separation):
    """Relative closeness of each alternative to the ideal solution.

    Computes `negative / (positive + negative)`. When both separations are 0
    for an alternative (for example when only one alternative exists, so it
    is both the best and the worst) the ratio is undefined; it is reported
    as 0.0 instead of NaN so the ranking stays well defined.

    Args:
        positive_separation: Distances to the positive ideal, one per alternative.
        negative_separation: Distances to the negative ideal, one per alternative.

    Returns:
        np.ndarray: Closeness values in [0, 1]; higher is better. The result
        is also printed.
    """
    positive_separation = np.asarray(positive_separation, dtype=float)
    negative_separation = np.asarray(negative_separation, dtype=float)
    total_separation = positive_separation + negative_separation

    # Only divide where the denominator is non-zero; other entries keep the
    # 0.0 they were initialised with.
    relative_closeness = np.divide(
        negative_separation,
        total_separation,
        out=np.zeros_like(total_separation),
        where=total_separation != 0,
    )

    print(f"relative_closeness :\n {relative_closeness} \n")

    return relative_closeness

def topsis_ranking(decision_matrix):
    """Rank the rows of `decision_matrix` with TOPSIS, best first.

    Chains the steps defined in this cell: column normalization, ideal /
    negative-ideal solutions, separation measures and relative closeness.
    No criterion weights are applied.

    Args:
        decision_matrix: 2-D array, rows = alternatives, columns = criteria.

    Returns:
        np.ndarray: Row indices ordered from highest to lowest relative
        closeness.
    """
    normalized = normalize_matrix(decision_matrix)
    ideal_best, ideal_worst = calculate_ideal_negative_ideal(normalized)
    closeness = calculate_relative_closeness(
        *calculate_separation_measures(normalized, ideal_best, ideal_worst)
    )
    # argsort is ascending; reverse it so the highest closeness comes first.
    ascending_order = np.argsort(closeness)
    return ascending_order[::-1]


# Rank the scored clauses with TOPSIS. `topsis_rank` holds row indices into
# `scores_matrix`, best clause first.
topsis_rank = topsis_ranking(scores_matrix)
print (topsis_rank)
# TODO: print the ranked clause texts; only the indices are shown so far.

# Tokenize the raw story into clauses.
# NOTE(review): this one-pass split of the raw story can contain blank
# clauses, while `scores_matrix` only has rows for non-blank clauses, so
# these indices are not guaranteed to line up with `topsis_rank` — confirm
# before mapping ranks back to clause text.
clauses = sentence_tokenize(story)

# Word probabilities over the entire story (diacritics are not stripped
# here, unlike in calculate_clause_scores_arabic()).
word_probabilities = calculate_word_probability(story)

# Debug printout: the stemmed nouns of each clause and their noun score.
for i, clause in enumerate(clauses):
    nouns, noun_score = calculate_noun_score(clause, word_probabilities, nlp)
    print(f"\nNouns in Clause {i + 1}:\n{nouns}\nNoun Score: {noun_score}\n")

    # The "Total Score" label shows the noun score only, not the overall
    # clause score.
    print(f"Clause {i + 1}:{clause}\nTotal Score: {noun_score}\n")




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json:   0%|   …

INFO:stanza:Loading these models for language: ar (Arabic):
| Processor | Package       |
-----------------------------
| tokenize  | padt          |
| mwt       | padt          |
| pos       | padt_charlm   |
| lemma     | padt_nocharlm |
| depparse  | padt_charlm   |
| ner       | aqmar_charlm  |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Loading: lemma
INFO:stanza:Loading: depparse
INFO:stanza:Loading: ner
INFO:stanza:Done loading processors!


Sentence: ذات يوم من الأيام كان هناك شقيقين يعملان سويا ويعيشان معا في مزرعتهما، وكان الشقيق الأكبر رجل متزوج وعنده أسرة كبيرة، وأما الشقيق الصغير كان غير متزوج، وفي نهاية كل موسم ويوم كان الأخان يتقاسمان كل المكاسب والحصاد بالتساوي، وكان الشقيقين يستطيعان الربح من أي شيء ينتجانه من المزرعة.
  Clause: ذات يوم من الأيام كان هناك شقيقين يعملان سويا ويعيشان معا في مزرعتهما،
  SVO Score: 0
  Title Word Score: 1
  NER Score: 0
  Sentiment Label: neutral
  Sentiment Score: 0
  Noun Score: 0.012875536480686697
  Dissimilarity Score: 19.714982595215464
  POS Score: 6
  Normalized Length Score: 1.0
  Overall Score: 27.727858131696152

  Clause: وكان الشقيق الأكبر رجل متزوج وعنده أسرة كبيرة،
  SVO Score: 0
  Title Word Score: 0
  NER Score: 0
  Sentiment Label: neutral
  Sentiment Score: 0
  Noun Score: 0.006437768240343348
  Dissimilarity Score: 19.763323236553727
  POS Score: 6
  Normalized Length Score: 0.6153846153846154
  Overall Score: 26.385145620178687

  Clause: وأما الشقيق الصغير كان غ

  normalized_matrix = matrix / np.sqrt(np.sum(matrix**2, axis=0))



Nouns in Clause 1:
['ذات', 'يوم', 'ايم', 'شقق', 'سوا', 'زرع']
Noun Score: 0.012875536480686697

Clause 1:
ذات يوم من الأيام كان هناك شقيقين يعملان سوياً ويعيشان معًا في مزرعتهما،
Total Score: 0.012875536480686697


Nouns in Clause 2:
['رجل', 'اسر']
Noun Score: 0.006437768240343348

Clause 2:وكان الشقيق الأكبر رجل متزوج وعنده أسرة كبيرة،
Total Score: 0.006437768240343348


Nouns in Clause 3:
['شقق', 'غير']
Noun Score: 0.023605150214592273

Clause 3:وأما الشقيق الصغير كان غير متزوج،
Total Score: 0.023605150214592273


Nouns in Clause 4:
['نهي', 'كل', 'وسم', 'يوم', 'كل', 'كسب', 'حصد', 'سوي']
Noun Score: 0.010729613733905581

Clause 4:وفي نهاية كل موسم ويوم كان الأخان يتقاسمان كل المكاسب والحصاد بالتساوي،
Total Score: 0.010729613733905581


Nouns in Clause 5:
['شقق', 'ربح', 'شيء', 'زرع']
Noun Score: 0.018240343347639486

Clause 5:وكان الشقيقين يستطيعان الربح من أي شيء ينتجانه من المزرعة

Total Score: 0.018240343347639486


Nouns in Clause 6:
['احد', 'ايم', 'لأخ', 'نفس']
Noun Score: 0.0096

In [None]:
!pip install topsispy

Collecting topsispy
  Downloading topsispy-0.0.1-py3-none-any.whl (3.7 kB)
Installing collected packages: topsispy
Successfully installed topsispy-0.0.1


In [None]:
# Sanity check of the third-party `topsispy` package on a small hand-made
# decision matrix, rewritten from a pasted REPL transcript into plain Python.
# The recorded run of the transcript ended in a TypeError.
# NOTE(review): passing ndarrays instead of nested lists is the likely
# remedy — confirm against the installed topsispy version.
import numpy as np
import topsispy as tp

# One row per alternative, one column per criterion.
a = np.array([
    [250, 16, 12, 5],
    [200, 16, 8, 3],
    [300, 32, 16, 4],
    [275, 32, 8, 4],
    [225, 16, 16, 2],
], dtype=float)
# Equal weight for each of the four criteria.
w = np.array([0.25, 0.25, 0.25, 0.25])
# Presumably -1 marks a cost criterion and +1 a benefit criterion — TODO confirm.
sign = np.array([-1, 1, 1, 1])
tp.topsis(a, w, sign)

TypeError: ignored

# test

In [None]:
import stanza
from pyarabic.araby import sentence_tokenize, strip_tashkeel
import nltk
from nltk.stem.isri import ISRIStemmer
from transformers import pipeline
from collections import Counter
from nltk.tokenize import word_tokenize
import numpy as np


# Module-level ISRI stemmer shared by stem_tokens().
stemmer = ISRIStemmer()
# Fetch NLTK's 'punkt' data; presumably required by word_tokenize() in
# calculate_word_probability() — TODO confirm.
nltk.download('punkt')

# Module-level Stanza pipeline for Arabic, built once and reused by every
# function in this cell that calls or receives `nlp`. The processors string
# enables tokenization, multi-word-token expansion, POS tagging,
# lemmatization, dependency parsing and NER.
nlp = stanza.Pipeline(lang='ar', processors='tokenize,mwt,pos,lemma,depparse,ner')

def stem_tokens(tokens):
    """Return the stem of every token (module-level `stemmer`), keeping order and length."""
    return list(map(stemmer.stem, tokens))

def calculate_word_probability(story):
    """Map every stem found in `story` to its relative frequency.

    The story is tokenized with `word_tokenize`, each token is stemmed, and
    a stem's probability is its count divided by the total number of tokens.

    Args:
        story: Story text.

    Returns:
        dict[str, float]: stem -> count / total tokens. Empty when the story
        yields no tokens.
    """
    stems = stem_tokens(word_tokenize(story))
    total = len(stems)
    probabilities = {}
    for stem, count in Counter(stems).items():
        probabilities[stem] = count / total
    return probabilities

def calculate_noun_score(clause, word_probabilities, nlp):
    """Average story-level probability of the nouns in `clause`.

    Args:
        clause: Clause text to score.
        word_probabilities: Mapping stem -> probability (see
            `calculate_word_probability`); unknown stems count as 0.
        nlp: Callable returning a parsed document that exposes
            `.sentences[*].words[*]` with `.text` and `.upos`.

    Returns:
        tuple: `(stemmed_nouns, noun_score)`. `noun_score` is the mean
        probability of the stemmed nouns; `([], 0)` when the clause has none.
    """
    clause_doc = nlp(clause)
    # Compare the UPOS tag exactly. The earlier `startswith('N')` test also
    # accepted NUM, so numerals were scored as nouns.
    nouns = [
        word.text
        for sentence in clause_doc.sentences
        for word in sentence.words
        if word.upos == 'NOUN'
    ]
    if not nouns:
        return nouns, 0
    stemmed_nouns = stem_tokens(nouns)
    noun_score = sum(word_probabilities.get(noun, 0) for noun in stemmed_nouns) / len(stemmed_nouns)
    return stemmed_nouns, noun_score

def calculate_title_word_score(clause, title, nlp):
    """Count the content words of `clause` whose stem also occurs in `title`.

    Both texts are parsed with `nlp`. Only words tagged NOUN, ADJ or X are
    kept, and they are compared by stem (via `stem_tokens`). A stem that
    appears several times in the clause is counted every time.

    Args:
        clause: Clause text to score.
        title: Title text that supplies the reference words.
        nlp: Callable returning a parsed document that exposes
            `.sentences[*].words[*]` with `.text` and `.upos`.

    Returns:
        int: Number of kept clause words whose stem is among the title stems.
    """
    # X is included because Stanza assigns it to most nouns and adjectives
    # that it fails to classify.
    content_tags = ['NOUN', 'X', 'ADJ']

    def content_words(doc):
        kept = []
        for sent in doc.sentences:
            for word in sent.words:
                if word.upos in content_tags:
                    kept.append(word.text)
        return kept

    title_doc = nlp(title)
    clause_doc = nlp(clause)
    reference_stems = set(stem_tokens(content_words(title_doc)))

    matches = 0
    for stem in stem_tokens(content_words(clause_doc)):
        if stem in reference_stems:
            matches += 1
    return matches

def stanza_sentence_tokenize(text):
    """Split `text` into sentence strings using the module-level `nlp` pipeline.

    Returns:
        list[str]: The `.text` of every sentence in the parsed document,
        in document order.
    """
    return [segment.text for segment in nlp(text).sentences]

def calculate_dissimilarity_matrix(clauses=None):
    """Build the pairwise clause dissimilarity matrix.

    For clauses i and j with stem sets A and B the entry is
    |A ^ B| / |A | B| (the Jaccard distance): 0 means identical stem sets,
    1 means no stem in common.

    Args:
        clauses: Optional list of clause strings to compare. When omitted,
            the module-level `story` is split with `sentence_tokenize`,
            which is what the original zero-argument call always did.

    Returns:
        list[list]: n x n symmetric matrix. The diagonal and every row/column
        that belongs to a whitespace-only clause stay 0.
    """
    if clauses is None:
        # Backward-compatible default: fall back to the global story.
        clauses = sentence_tokenize(story)

    # Stem each clause once; blank clauses become None so they are skipped.
    stem_sets = [
        set(stem_tokens(clause.split())) if clause.strip() else None
        for clause in clauses
    ]
    n = len(stem_sets)
    dissimilarity_matrix = [[0 for _ in range(n)] for _ in range(n)]

    for i in range(n):
        if stem_sets[i] is None:
            continue
        for j in range(i + 1, n):
            if stem_sets[j] is None:
                continue
            total_tokens = len(stem_sets[i] | stem_sets[j])
            total_unique_tokens = len(stem_sets[i] ^ stem_sets[j])
            distance = total_unique_tokens / total_tokens if total_tokens else 0
            # The measure is symmetric, so one computation fills both halves.
            dissimilarity_matrix[i][j] = distance
            dissimilarity_matrix[j][i] = distance

    return dissimilarity_matrix

def calculate_normalized_clause_length(clauses):
    """Return each clause's word count divided by the longest clause's word count.

    Args:
        clauses: List of clause strings; words are whitespace-separated.

    Returns:
        list[float]: One value in [0, 1] per clause, in input order. An empty
        input gives an empty list. When every clause is blank (longest length
        0) all values are 0.0 instead of raising ZeroDivisionError.
    """
    clause_lengths = [len(clause.split()) for clause in clauses]
    # `or 1` covers both the empty list and the all-blank case, where the
    # maximum would otherwise be 0 and the division below would fail.
    max_clause_length = max(clause_lengths, default=0) or 1
    return [length / max_clause_length for length in clause_lengths]

def calculate_pos_scores(clauses, nlp):
    """Count the words tagged NOUN, ADJ or X in each clause.

    Args:
        clauses: List of clause strings.
        nlp: Callable returning a parsed document that exposes
            `.sentences[*].words[*].upos`.

    Returns:
        list[int]: One count per clause, in input order. Whitespace-only
        clauses score 0 and are never passed to `nlp`.
    """
    content_tags = {'NOUN', 'ADJ', 'X'}

    def count_content_words(clause):
        if not clause.strip():
            return 0
        tally = 0
        for sentence in nlp(clause).sentences:
            for word in sentence.words:
                if word.upos in content_tags:
                    tally += 1
        return tally

    return [count_content_words(clause) for clause in clauses]

def _clause_dissimilarity_totals(clauses):
    """Return each clause's summed Jaccard distance to all other clauses.

    Clauses are compared as sets of stems (`stem_tokens(clause.split())`);
    the distance between two sets A and B is |A ^ B| / |A | B|.
    """
    stem_sets = [set(stem_tokens(clause.split())) for clause in clauses]
    totals = []
    for i, stems_i in enumerate(stem_sets):
        total = 0
        for j, stems_j in enumerate(stem_sets):
            if i == j:
                continue
            union_size = len(stems_i | stems_j)
            if union_size:
                total += len(stems_i ^ stems_j) / union_size
        totals.append(total)
    return totals


def process_and_score_clauses(sentences, title, nlp, sentiment_model, word_probabilities, title_stems):
    """Score every non-blank clause of every sentence on eight features.

    Features, in matrix column order: SVO completeness (0/1), title-word
    matches, number of PER/ORG/LOC entities, sentiment confidence (0 when
    the label is 'neutral'), noun score, summed dissimilarity to all other
    clauses, count of NOUN/ADJ/X words, and length normalized within the
    sentence. Each clause's scores are also printed.

    The dissimilarity feature is computed once, over exactly the clauses
    scored here. Previously a story-wide matrix was rebuilt for every
    sentence and indexed with the clause's position *inside its sentence*,
    so clauses after the first sentence received the rows of other clauses.

    Args:
        sentences: Iterable of sentence strings.
        title: Story title, forwarded to `calculate_title_word_score`.
        nlp: Pipeline callable; parsed documents must expose
            `.sentences[*].words[*].deprel` and `.sentences[*].ents`.
        sentiment_model: Callable returning `[{'label': ..., 'score': ...}]`.
        word_probabilities: Stem -> probability mapping for the noun score.
        title_stems: Unused; kept so existing callers keep working.

    Returns:
        list[list]: One 8-element feature row per scored clause, in reading
        order.
    """
    sentences = list(sentences)

    # Split everything up front so the cross-clause feature sees the same
    # clauses, in the same order, as the scoring loop below.
    clauses_by_sentence = [sentence_tokenize(sentence) for sentence in sentences]
    scored_clauses = [
        clause
        for clauses in clauses_by_sentence
        for clause in clauses
        if clause.strip()
    ]
    dissimilarity_totals = _clause_dissimilarity_totals(scored_clauses)

    scores_matrix = []
    for sentence, clauses in zip(sentences, clauses_by_sentence):
        print(f"Sentence: {sentence}")
        pos_scores = calculate_pos_scores(clauses, nlp)
        normalized_lengths = calculate_normalized_clause_length(clauses)

        for i, clause in enumerate(clauses):
            if not clause.strip():
                continue

            clause_doc = nlp(clause)
            title_word_score = calculate_title_word_score(clause, title, nlp)

            # Sentiment Analysis: only non-neutral predictions contribute.
            sentiment_result = sentiment_model(clause)[0]
            sentiment_label = sentiment_result['label']
            sentiment_score = 0
            if sentiment_label != 'neutral':
                sentiment_score = sentiment_result['score']

            # Nouns weight
            _nouns, noun_score = calculate_noun_score(clause, word_probabilities, nlp)

            # SVO Analysis
            # NOTE(review): 'root' marks the head of the parse whatever its
            # POS, so it does not by itself prove a verb is present — confirm.
            has_subject, has_verb, has_object = False, False, False
            for sent in clause_doc.sentences:
                for word in sent.words:
                    if word.deprel == 'nsubj':
                        has_subject = True
                    elif word.deprel == 'root':
                        has_verb = True
                    elif word.deprel in ['obj', 'iobj']:
                        has_object = True
            clause_svo_score = 1 if has_subject and has_verb and has_object else 0

            # NER Score
            meaningful_ner_labels = ['PER', 'ORG', 'LOC']
            ner_score = sum(
                ent.type in meaningful_ner_labels
                for sent in clause_doc.sentences
                for ent in sent.ents
            )

            # Dissimilarity, POS, and Length Scores. Rows are appended once
            # per scored clause, so len(scores_matrix) is this clause's
            # position within scored_clauses.
            dissimilarity_score = dissimilarity_totals[len(scores_matrix)]
            pos_score = pos_scores[i]
            normalized_length = normalized_lengths[i]

            # Overall score
            overall_score = (clause_svo_score + title_word_score + ner_score + sentiment_score
                             + noun_score + dissimilarity_score + pos_score + normalized_length)

            # Append the scores to the matrix
            scores_matrix.append([clause_svo_score, title_word_score, ner_score, sentiment_score,
                                  noun_score, dissimilarity_score, pos_score, normalized_length])

            # Print the scores for each clause
            print(f"  Clause: {clause}")
            print(f"  SVO Score: {clause_svo_score}")
            print(f"  Title Word Score: {title_word_score}")
            print(f"  NER Score: {ner_score}")
            print(f"  Sentiment Label: {sentiment_label}")
            print(f"  Sentiment Score: {sentiment_score}")
            print(f"  Noun Score: {noun_score}")
            print(f"  Dissimilarity Score: {dissimilarity_score}")
            print(f"  POS Score: {pos_score}")
            print(f"  Normalized Length Score: {normalized_length}")
            print(f"  Overall Score: {overall_score}\n")

    return scores_matrix

def calculate_clause_scores_arabic(story, title):
    """Run the whole clause-scoring pipeline on one story.

    Diacritics are stripped from both texts, the story is split into
    sentences with Stanza, and each sentence is handed to
    `process_and_score_clauses`. The sentiment pipeline is created on every
    call. Uses the module-level `nlp` pipeline.

    Args:
        story: Story text.
        title: Story title.

    Returns:
        Whatever `process_and_score_clauses` returns: one feature row per
        scored clause.
    """
    # Remove diacritics
    plain_story = strip_tashkeel(story)
    plain_title = strip_tashkeel(title)

    # Stems of every title word (forwarded to the scorer).
    title_words = []
    for sent in nlp(plain_title).sentences:
        title_words.extend(word.text for word in sent.words)
    title_stem_set = set(stem_tokens(title_words))

    probabilities = calculate_word_probability(plain_story)

    classifier = pipeline(
        'sentiment-analysis',
        model='CAMeL-Lab/bert-base-arabic-camelbert-mix-sentiment',
    )

    return process_and_score_clauses(
        stanza_sentence_tokenize(plain_story),
        plain_title,
        nlp,
        classifier,
        probabilities,
        title_stem_set,
    )

# Example usage: score one sample story against a title
story = """
كان يا مكان في قديم الزمان، كان في غابة طاووس وفيل يتبادلان الكلام، وكل من الفيل والطاووس يظن أنه الأفضل من الآخر؛ فالطاووس يعتقد أنه هو الأفضل مفتخرا بجمال ريشه. بينما الفيل فيرى أنه هو الأحسن مغرورا بقوة جسمه وطول خرطومه.

ظل الفيل والطاووس يناقشان الأمر دون أن يستسلم أحدهما للآخر إلى أن أحسا بالجوع، فتوجه الفيل والطاووس معا إلى حظيرة قريبة من الغابة، ودخل الفيل والطاووس إليها عبر ثقب في الحائط. فأخذ الفيل والطاووس يأكلان بفرح وسرور ما طاب ولذ من الأكل.

وبعد انتهاء الفيل والطاووس من الأكل والشرب، تابع الفيل والطاووس جدالهما دون توقف، إلى أن فاجأهما صاحبا الحظيرة اللذان كانا يحملان بيدهما بندقية، فقال أحدهما للآخر: سوف نقتل الفيل لخطورته، ونترك الطاووس يعيش في الحظيرة لجمال ريشه. ثم رد الرجل الثاني قائلا: أنت على صواب.

سمع الفيل والطاووس ما قاله الرجلان، فنظر الفيل والطاووس إلى بعضهما نظرة الوداع. وقتذاك صوب الرجلان بندقيتهما باتجاه الفيل وأطلقا النار عليه في آن واحد دون إصابته لحسن حظه، ففر الفيل باتجاه الغابة.

"""
# NOTE(review): this title ("the two farmer brothers") does not appear to
# belong to the elephant-and-peacock story above - confirm it is intended.
title = "الاخوان المزارعان "

# Decision matrix for TOPSIS: the list of per-clause score rows returned by
# calculate_clause_scores_arabic, converted to a NumPy array.
scores_matrix = np.array(calculate_clause_scores_arabic(story, title))
print(scores_matrix)

# normalize the decision matrix
def normalize_matrix(matrix):
    """Vector-normalize every column of the decision matrix.

    Each column is divided by its Euclidean norm. A column whose norm is 0
    (the feature is 0 for every clause) is left as zeros instead of producing
    NaN from 0/0, which would otherwise propagate into the ranking.

    Args:
        matrix: 2-D array-like, one row per clause and one column per feature.

    Returns:
        Float ndarray of the same shape holding the normalized values.
    """
    # Work in floating point so integer input is not truncated
    matrix = np.asarray(matrix, dtype=float)
    column_norms = np.sqrt(np.sum(matrix**2, axis=0))

    # Divide only where the norm is non-zero; other entries keep the 0 from `out`
    normalized_matrix = np.divide(matrix, column_norms, out=np.zeros_like(matrix), where=column_norms != 0)

    print(f"\nnormalized_matrix :\n {normalized_matrix} \n")

    return normalized_matrix

# Calculate ideal and negative-ideal solutions
def calculate_ideal_negative_ideal(normalized_matrix):
    """Return the column-wise maxima and minima of the normalized matrix.

    The maxima form the positive ideal solution and the minima the negative
    ideal solution; both are printed before being returned as a pair.
    """
    values = np.asarray(normalized_matrix)
    best_per_feature = values.max(axis=0)
    worst_per_feature = values.min(axis=0)

    print(f"positive_ideal_solution : {best_per_feature} \n negative_ideal_solution : {worst_per_feature} \n")

    return best_per_feature, worst_per_feature

# Calculate separation measures for each alternative
def calculate_separation_measures(normalized_matrix, positive_ideal_solution, negative_ideal_solution):
    """Return each row's Euclidean distance to the positive and negative ideals.

    Both distance vectors (one entry per row of ``normalized_matrix``) are
    printed and then returned as ``(positive_separation, negative_separation)``.
    """
    gap_to_best = normalized_matrix - positive_ideal_solution
    gap_to_worst = normalized_matrix - negative_ideal_solution

    distance_to_best = np.sqrt(np.sum(np.square(gap_to_best), axis=1))
    distance_to_worst = np.sqrt(np.sum(np.square(gap_to_worst), axis=1))

    print(f"positive_separation : {distance_to_best} \n negative_separation : {distance_to_worst} \n")

    return distance_to_best, distance_to_worst

# Calculate relative closeness to the ideal solution
def calculate_relative_closeness(positive_separation, negative_separation):
    """Compute the TOPSIS closeness ``d- / (d+ + d-)`` for every alternative.

    When both separations of an alternative are 0 (for example a single
    clause, or all rows identical) the ratio is 0/0; such entries are
    reported as 0.0 instead of NaN so the ranking stays well defined.

    Args:
        positive_separation: distances to the positive ideal solution.
        negative_separation: distances to the negative ideal solution.

    Returns:
        Float ndarray of closeness values in [0, 1].
    """
    positive_separation = np.asarray(positive_separation, dtype=float)
    negative_separation = np.asarray(negative_separation, dtype=float)
    total_separation = positive_separation + negative_separation

    # Divide only where the denominator is non-zero; the rest keep the 0 from `out`
    relative_closeness = np.divide(
        negative_separation,
        total_separation,
        out=np.zeros_like(total_separation),
        where=total_separation != 0,
    )

    print(f"relative_closeness :\n {relative_closeness} \n")

    return relative_closeness

# Perform TOPSIS ranking
def topsis_ranking(decision_matrix):
    """Run the TOPSIS steps on the decision matrix.

    Returns ``(ranking, relative_closeness)`` where ``ranking`` lists row
    indices from highest to lowest closeness.
    """
    normalized = normalize_matrix(decision_matrix)
    ideal_best, ideal_worst = calculate_ideal_negative_ideal(normalized)
    distance_to_best, distance_to_worst = calculate_separation_measures(normalized, ideal_best, ideal_worst)
    relative_closeness = calculate_relative_closeness(distance_to_best, distance_to_worst)

    # argsort is ascending; flipping it puts the highest closeness first
    ranking = np.flip(np.argsort(relative_closeness))

    return ranking, relative_closeness

# Perform TOPSIS ranking
topsis_rank, relative_closeness = topsis_ranking(scores_matrix)

# Print the ranked clauses along with their scores.
# The clause list must line up with the rows of scores_matrix, so it is rebuilt
# the same way the scoring code segments the text: diacritics stripped first,
# and blank clauses dropped (the scoring loop skips blank clauses, so they have
# no row in the matrix).
sentences = stanza_sentence_tokenize(strip_tashkeel(story))
all_clauses = []
for sentence in sentences:
    clauses = sentence_tokenize(sentence)
    all_clauses.extend(clause for clause in clauses if clause.strip())

for rank in topsis_rank:
    clause = all_clauses[rank]
    score = relative_closeness[rank]
    print(f"Clause: {clause}\nScore: {score}\n")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json:   0%|   …

INFO:stanza:Loading these models for language: ar (Arabic):
| Processor | Package       |
-----------------------------
| tokenize  | padt          |
| mwt       | padt          |
| pos       | padt_charlm   |
| lemma     | padt_nocharlm |
| depparse  | padt_charlm   |
| ner       | aqmar_charlm  |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Loading: lemma
INFO:stanza:Loading: depparse
INFO:stanza:Loading: ner
INFO:stanza:Done loading processors!


Sentence: كان يا مكان في قديم الزمان، كان في غابة طاووس وفيل يتبادلان الكلام، وكل من الفيل والطاووس يظن أنه الأفضل من الآخر؛ فالطاووس يعتقد أنه هو الأفضل مفتخرا بجمال ريشه. بينما الفيل فيرى أنه هو الأحسن مغرورا بقوة جسمه وطول خرطومه.
  Clause: كان يا مكان في قديم الزمان،
  SVO Score: 0
  Title Word Score: 0
  NER Score: 0
  Sentiment Label: positive
  Sentiment Score: 0.558352530002594
  Noun Score: 0.008771929824561403
  Similarity Score: 18.575757575757574
  POS Score: 3
  Normalized Length Score: 0.35294117647058826
  Overall Score: 22.495823212055317

  Clause: كان في غابة طاووس وفيل يتبادلان الكلام،
  SVO Score: 0
  Title Word Score: 1
  NER Score: 1
  Sentiment Label: neutral
  Sentiment Score: 0
  Noun Score: 0.005847953216374269
  Similarity Score: 17.778987782083753
  POS Score: 4
  Normalized Length Score: 0.4117647058823529
  Overall Score: 24.19660044118248

  Clause: وكل من الفيل والطاووس يظن أنه الأفضل من الآخر؛ فالطاووس يعتقد أنه هو الأفضل مفتخرا بجمال ريشه.
  SVO Score:

# Test "each sentence"

In [None]:
#************************************************************************************************************************************************************
# Hind's modification: print the sentences in story order, each followed by its own clauses in TOPSIS order
#************************************************************************************************************************************************************

import stanza
from pyarabic.araby import sentence_tokenize, strip_tashkeel
import nltk
from nltk.stem.isri import ISRIStemmer
from transformers import pipeline
from collections import Counter
from nltk.tokenize import word_tokenize
import numpy as np


# Initialize the Arabic stemmer (shared by stem_tokens below)
stemmer = ISRIStemmer()
# Download the NLTK 'punkt' tokenizer data needed by word_tokenize
nltk.download('punkt')

# Initialize Stanza pipeline for Arabic.
# A single module-level instance is created here and reused by every function
# in this cell that parses text.
nlp = stanza.Pipeline(lang='ar', processors='tokenize,mwt,pos,lemma,depparse,ner')


# Stem a list of tokens
def stem_tokens(tokens):
    """Return a list with the ISRI stem of each token, in the given order."""
    stems = []
    for token in tokens:
        stems.append(stemmer.stem(token))
    return stems

# Calculate word probabilities in a story
def calculate_word_probability(story):
    """Map every stem found in the story to its relative frequency.

    The story is tokenized with ``word_tokenize`` and each token is stemmed;
    a stem's probability is its count divided by the total number of tokens.
    """
    stem_counts = Counter(stem_tokens(word_tokenize(story)))
    # Sum of all counts equals the number of stemmed tokens
    token_total = sum(stem_counts.values())
    return {stem: occurrences / token_total for stem, occurrences in stem_counts.items()}

# Calculate noun score for a clause
def calculate_noun_score(clause, word_probabilities, nlp):
    """Average story-level probability of the nouns in a clause.

    Args:
        clause: clause text to analyse.
        word_probabilities: mapping from stem to its probability in the story.
        nlp: Stanza pipeline used to POS-tag the clause.

    Returns:
        ``(stemmed_nouns, noun_score)``; the score is 0 when the clause has
        no nouns. Stems missing from ``word_probabilities`` count as 0.
    """
    clause_doc = nlp(clause)
    # Match the NOUN tag exactly: a prefix test on 'N' would also accept NUM
    # (numerals), which are not nouns.
    nouns = [word.text for sentence in clause_doc.sentences for word in sentence.words if word.upos == 'NOUN']

    stemmed_nouns = stem_tokens(nouns)

    # stem_tokens yields one stem per noun, so a single emptiness check suffices
    if not stemmed_nouns:
        return stemmed_nouns, 0

    noun_score = sum(word_probabilities.get(noun, 0) for noun in stemmed_nouns) / len(stemmed_nouns)
    return stemmed_nouns, noun_score


# Calculate score based on the presence of title words in the clause.
def calculate_title_word_score(clause, title, nlp):
    """Count the clause's content-word stems that also occur in the title.

    Only words tagged NOUN, X or ADJ are considered on both sides. Repeated
    matching stems in the clause are each counted.
    """
    # 'X' is included because, in the authors' tests, Stanza assigned it to most
    # nouns and adjectives it failed to classify.
    content_tags = ['NOUN', 'X', 'ADJ']

    def content_word_stems(doc):
        # Stems of the words in `doc` whose UPOS tag is one of content_tags
        return stem_tokens([word.text for sent in doc.sentences for word in sent.words if word.upos in content_tags])

    title_doc = nlp(title)
    clause_doc = nlp(clause)
    title_stems = set(content_word_stems(title_doc))

    matches = 0
    for stem in content_word_stems(clause_doc):
        if stem in title_stems:
            matches += 1
    return matches

# Tokenize text into sentences using Stanza
def stanza_sentence_tokenize(text):
    """Return the text of each sentence Stanza finds in ``text``, in order."""
    return [sentence.text for sentence in nlp(text).sentences]

def calculate_dissimilarity_matrix(story):
    """Pairwise stem dissimilarity between the clauses of a story.

    The story is split with ``sentence_tokenize`` and each clause is reduced
    to the set of stems of its whitespace-separated tokens. Entry ``[i][j]``
    is the number of stems found in exactly one of the two clauses divided by
    the number of stems found in either. The diagonal, and every pair that
    involves a blank clause, stays 0.
    """
    clauses = sentence_tokenize(story)
    stem_sets = [set(stem_tokens(clause.split())) for clause in clauses]
    has_text = [bool(clause.strip()) for clause in clauses]
    size = len(clauses)
    dissimilarity_matrix = [[0] * size for _ in range(size)]

    for row in range(size):
        if not has_text[row]:
            continue
        for col in range(size):
            if row == col or not has_text[col]:
                continue
            union_size = len(stem_sets[row] | stem_sets[col])
            if union_size:
                # Symmetric difference = stems unique to either clause
                dissimilarity_matrix[row][col] = len(stem_sets[row] ^ stem_sets[col]) / union_size

    return dissimilarity_matrix

# Calculate normalized clause length
def calculate_normalized_clause_length(clauses):
    """Return each clause's word count divided by the longest clause's count.

    Word counts come from whitespace splitting. If the list is empty the
    result is empty; if every clause is blank (longest count 0) all lengths
    are reported as 0.0 instead of raising ``ZeroDivisionError``.
    """
    clause_lengths = [len(clause.split()) for clause in clauses]
    # Fall back to 1 when there is nothing to normalize against
    max_clause_length = max(clause_lengths, default=0) or 1
    return [length / max_clause_length for length in clause_lengths]

# Calculate POS scores for clauses
def calculate_pos_scores(clauses, nlp):
    """Count, for each clause, the words tagged NOUN, ADJ or X.

    Blank clauses score 0 and are not sent through the pipeline. The result
    has one entry per input clause, in order.
    """
    # 'X' is included because, in the authors' tests, Stanza assigned it to most
    # nouns and adjectives it failed to classify.
    relevant_pos_tags = {'NOUN', 'ADJ', 'X'}

    def count_relevant_words(clause):
        parsed = nlp(clause)
        return sum(word.upos in relevant_pos_tags for sentence in parsed.sentences for word in sentence.words)

    return [count_relevant_words(clause) if clause.strip() else 0 for clause in clauses]

# Process and score clauses in sentences
def process_and_score_clauses(sentences, title, nlp, sentiment_model, word_probabilities, title_stems, story):
    """Score every non-blank clause of the story on eight features.

    Each sentence is split into clauses with ``sentence_tokenize``. For every
    non-blank clause the SVO, title-word, NER, sentiment, noun, dissimilarity,
    POS and normalized-length scores are computed, printed, and appended as
    one row of the returned matrix.

    Args:
        sentences: sentence strings of the story, in story order.
        title: story title, forwarded to ``calculate_title_word_score``.
        nlp: Stanza pipeline used to parse each clause.
        sentiment_model: callable returning ``[{'label': ..., 'score': ...}]``
            for a clause.
        word_probabilities: stem -> probability mapping used by the noun score.
        title_stems: accepted for interface compatibility; not used here.
        story: full story text, used to build the clause dissimilarity matrix.

    Returns:
        List of 8-element lists ``[svo, title_word, ner, sentiment, noun,
        dissimilarity, pos, normalized_length]``, one per non-blank clause.
    """
    scores_matrix = []
    meaningful_ner_labels = ['PER', 'ORG', 'LOC']

    # The dissimilarity matrix covers the clauses of the WHOLE story, so it is
    # built once and indexed with a story-wide clause counter. Indexing it with
    # the sentence-local position would make every sentence reuse the rows of
    # the first sentence.
    # Assumes the story-level clause sequence lines up with the concatenated
    # per-sentence clauses; rows that fall outside the matrix score 0.
    dissimilarity_matrix = calculate_dissimilarity_matrix(story)
    story_clause_index = 0

    for sentence in sentences:
        print(f"Sentence: {sentence}")
        clauses = sentence_tokenize(sentence)
        pos_scores = calculate_pos_scores(clauses, nlp)
        normalized_lengths = calculate_normalized_clause_length(clauses)

        for i, clause in enumerate(clauses):
            clause_row = story_clause_index
            story_clause_index += 1
            if not clause.strip():
                continue

            clause_doc = nlp(clause)
            title_word_score = calculate_title_word_score(clause, title, nlp)

            # Sentiment Analysis: neutral clauses contribute 0, others the model confidence
            sentiment_result = sentiment_model(clause)[0]
            sentiment_label = sentiment_result['label']
            sentiment_score = sentiment_result['score'] if sentiment_label != 'neutral' else 0

            # Nouns weight
            _, noun_score = calculate_noun_score(clause, word_probabilities, nlp)

            # SVO Analysis using Dependency Parsing
            # NOTE(review): the 'root' relation is treated as the verb whatever its POS tag.
            deprels = {word.deprel for sent in clause_doc.sentences for word in sent.words}
            has_subject = 'nsubj' in deprels
            has_verb = 'root' in deprels
            has_object = 'obj' in deprels or 'iobj' in deprels
            clause_svo_score = 1 if has_subject and has_verb and has_object else 0

            # NER Score: number of person / organisation / location entities
            ner_score = sum(ent.type in meaningful_ner_labels for sent in clause_doc.sentences for ent in sent.ents)

            # Dissimilarity, POS, and Length Scores
            if clause_row < len(dissimilarity_matrix):
                dissimilarity_score = sum(dissimilarity_matrix[clause_row])
            else:
                dissimilarity_score = 0
            pos_score = pos_scores[i]
            normalized_length = normalized_lengths[i]

            #overall score
            overall_score = (clause_svo_score + title_word_score + ner_score + sentiment_score
                             + noun_score + dissimilarity_score + pos_score + normalized_length)

            # Append the scores to the matrix
            scores_matrix.append([clause_svo_score, title_word_score, ner_score, sentiment_score,
                                  noun_score, dissimilarity_score, pos_score, normalized_length])

            # Print the scores for each clause
            print(f"  Clause: {clause}")
            print(f"  SVO Score: {clause_svo_score}")
            print(f"  Title Word Score: {title_word_score}")
            print(f"  NER Score: {ner_score}")
            print(f"  Sentiment Label: {sentiment_label}")
            print(f"  Sentiment Score: {sentiment_score}")
            print(f"  Noun Score: {noun_score}")
            print(f"  Dissimilarity Score: {dissimilarity_score}")
            print(f"  POS Score: {pos_score}")
            print(f"  Normalized Length Score: {normalized_length}")
            print(f"  Overall Score: {overall_score}\n")

    return scores_matrix

# Segment a story into sentences using Stanza and process each sentence's clauses
def calculate_clause_scores_arabic(story, title):
    """Build the clause score matrix for an Arabic story and its title.

    Diacritics are stripped from both inputs, the title words are stemmed,
    stem probabilities are computed over the story, the story is segmented
    into sentences with Stanza, and everything (including the stripped story
    itself) is handed to ``process_and_score_clauses``. Its result is
    returned unchanged.
    """
    # Work on diacritic-free text
    story, title = strip_tashkeel(story), strip_tashkeel(title)

    # Stems of every word of the title
    parsed_title = nlp(title)
    title_stems = set(stem_tokens(
        [token.text for title_sentence in parsed_title.sentences for token in title_sentence.words]
    ))

    # Probability of each stem within the story
    stem_probabilities = calculate_word_probability(story)

    # Arabic sentiment classifier
    sentiment_classifier = pipeline('sentiment-analysis', model='CAMeL-Lab/bert-base-arabic-camelbert-mix-sentiment')

    # Sentence segmentation followed by per-clause scoring
    story_sentences = stanza_sentence_tokenize(story)
    return process_and_score_clauses(
        story_sentences, title, nlp, sentiment_classifier, stem_probabilities, title_stems, story
    )

# Example usage: score one sample story against its title
story = """
وجد المزارع سعد ذات يوم كلبا على وشك الموت من الجوع، فأخذ سعد الكلب إلى بيته واعتنى به وبعد فترة طويلة وجد في الكلب الذي كان يسميه "ريكس" أفضل صديق له.

كان المزارع والكلب يقضيان اليوم معا وتعلم الكلب لغة صاحبه وبدأ يفهم كل حركاته وإشاراته وكان المزارع أيضا يفهم ما كان يريده الكلب.

وفي ذلك الوقت وصلت إلى المنطقة عصابة خطيرة من اللصوص، ففكر الفلاحون بوضع جرس في كل مزرعة يدق عند الشعور بالخطر وذلك للدفاع عن أنفسهم.

تسلح الجميع بالعصي وكانوا يأتون لمساعدة المزرعة المعتدى عليها.

وذات ليلة، بينما كان المزارع سعد يغط في نومه بسبب الجهد الكبير الذي بذله طوال اليوم، أحاط قطاع الطرق بالمنزل وحينها قفز الكلب إلى السقف ودق الجرس.

جاء الجيران بعد سماع الجرس حاملين العصي وكسروا ضلوع اللصوص وأجبروهم على الهروب.

وابتداء من تلك اللحظة، اعتبر كل سكان المنطقة الكلب صديقهم. ولم يتلق أي كلب ضربة حجر أو عصا من الأطفال الأشقياء لأنهم فهموا الاحترام الذي يستحقه كل من يساعد الإنسان بإخلاص ووفاء.

"""
title = "الكلب وقطاع الطرق"


# Decision matrix for TOPSIS: one row per scored clause, one column per feature score
scores_matrix = np.array(calculate_clause_scores_arabic(story, title))
print(scores_matrix)

# normalize the decision matrix
def normalize_matrix(matrix):
    """Vector-normalize every column of the decision matrix.

    Each column is divided by its Euclidean norm; a column whose norm is 0 is
    left as zeros. The computation is done in floating point, so an
    integer-typed input is no longer truncated to 0 when the normalized
    values are stored.

    Args:
        matrix: 2-D array-like, one row per clause and one column per feature.

    Returns:
        Float ndarray of the same shape holding the normalized values.
    """
    matrix = np.asarray(matrix, dtype=float)
    denominators = np.sqrt(np.sum(matrix**2, axis=0))

    # Divide only where the norm is non-zero; other entries keep the 0 from `out`
    normalized_matrix = np.divide(matrix, denominators, out=np.zeros_like(matrix), where=denominators != 0)

    print(f"\nnormalized_matrix :\n {normalized_matrix} \n")
    return normalized_matrix

# Calculate ideal and negative-ideal solutions
def calculate_ideal_negative_ideal(normalized_matrix):
    """Return the column-wise maxima and minima of the normalized matrix.

    The maxima form the positive ideal solution and the minima the negative
    ideal solution; both are printed before being returned as a pair.
    """
    values = np.asarray(normalized_matrix)
    best_per_feature = values.max(axis=0)
    worst_per_feature = values.min(axis=0)

    print(f"positive_ideal_solution : {best_per_feature} \n negative_ideal_solution : {worst_per_feature} \n")

    return best_per_feature, worst_per_feature

# Calculate separation measures for each alternative
def calculate_separation_measures(normalized_matrix, positive_ideal_solution, negative_ideal_solution):
    """Return each row's Euclidean distance to the positive and negative ideals.

    Both distance vectors (one entry per row of ``normalized_matrix``) are
    printed and then returned as ``(positive_separation, negative_separation)``.
    """
    gap_to_best = normalized_matrix - positive_ideal_solution
    gap_to_worst = normalized_matrix - negative_ideal_solution

    distance_to_best = np.sqrt(np.sum(np.square(gap_to_best), axis=1))
    distance_to_worst = np.sqrt(np.sum(np.square(gap_to_worst), axis=1))

    print(f"positive_separation : {distance_to_best} \n negative_separation : {distance_to_worst} \n")

    return distance_to_best, distance_to_worst

# Calculate relative closeness to the ideal solution
def calculate_relative_closeness(positive_separation, negative_separation):
    """Compute the TOPSIS closeness ``d- / (d+ + d-)`` for every alternative.

    When both separations of an alternative are 0 (for example a single
    clause, or all rows identical) the ratio is 0/0; such entries are
    reported as 0.0 instead of NaN so the ranking stays well defined.

    Args:
        positive_separation: distances to the positive ideal solution.
        negative_separation: distances to the negative ideal solution.

    Returns:
        Float ndarray of closeness values in [0, 1].
    """
    positive_separation = np.asarray(positive_separation, dtype=float)
    negative_separation = np.asarray(negative_separation, dtype=float)
    total_separation = positive_separation + negative_separation

    # Divide only where the denominator is non-zero; the rest keep the 0 from `out`
    relative_closeness = np.divide(
        negative_separation,
        total_separation,
        out=np.zeros_like(total_separation),
        where=total_separation != 0,
    )

    print(f"relative_closeness :\n {relative_closeness} \n")

    return relative_closeness

# Perform TOPSIS ranking
def topsis_ranking(decision_matrix):
    """Run the TOPSIS steps on the decision matrix.

    Returns ``(ranking, relative_closeness)`` where ``ranking`` lists row
    indices from highest to lowest closeness.
    """
    normalized = normalize_matrix(decision_matrix)
    ideal_best, ideal_worst = calculate_ideal_negative_ideal(normalized)
    distance_to_best, distance_to_worst = calculate_separation_measures(normalized, ideal_best, ideal_worst)
    relative_closeness = calculate_relative_closeness(distance_to_best, distance_to_worst)

    # argsort is ascending; flipping it puts the highest closeness first
    ranking = np.flip(np.argsort(relative_closeness))
    return ranking, relative_closeness


# Perform TOPSIS ranking
topsis_rank, relative_closeness = topsis_ranking(scores_matrix)
# Segment the story the same way the scoring code does (diacritics stripped first)
sentences = stanza_sentence_tokenize(strip_tashkeel(story))
# Print the ranked clauses along with their scores
print("***************************************** This part of the code prints the sentences in the order of the story *****************************************")
print("=" * 150)

# relative_closeness has one entry per NON-BLANK clause, in story order, so a
# running row counter maps each clause of each sentence to its own score.
# (Using the global TOPSIS ranks as sentence-local indices would show the first
# sentence's scores under every sentence.)
score_row = 0
# Print each sentence along with its relevant clauses and scores
for sentence_idx, sentence in enumerate(sentences):
    print(f"Sentence {sentence_idx + 1}: {sentence}\n")

    # (position within the sentence, clause text, TOPSIS score)
    sentence_entries = []
    for clause_position, clause in enumerate(sentence_tokenize(sentence), start=1):
        if not clause.strip():
            continue  # blank clauses were never scored
        if score_row < len(relative_closeness):
            sentence_entries.append((clause_position, clause, relative_closeness[score_row]))
        score_row += 1

    # Within the sentence, list clauses from highest to lowest TOPSIS score
    for clause_position, clause, score in sorted(sentence_entries, key=lambda entry: entry[2], reverse=True):
        print(f"  Clause {clause_position}: {clause}\n  TOPSIS Score: {score}\n")

    print("=" * 150)  # Add a separator between sentences for better readability


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json:   0%|   …

INFO:stanza:Loading these models for language: ar (Arabic):
| Processor | Package       |
-----------------------------
| tokenize  | padt          |
| mwt       | padt          |
| pos       | padt_charlm   |
| lemma     | padt_nocharlm |
| depparse  | padt_charlm   |
| ner       | aqmar_charlm  |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Loading: lemma
INFO:stanza:Loading: depparse
INFO:stanza:Loading: ner
INFO:stanza:Done loading processors!


Sentence: وجد المزارع سعد ذات يوم كلبا على وشك الموت من الجوع، فأخذ سعد الكلب إلى بيته واعتنى به وبعد فترة طويلة وجد في الكلب الذي كان يسميه "ريكس" أفضل صديق له.
  Clause: وجد المزارع سعد ذات يوم كلبا على وشك الموت من الجوع،
  SVO Score: 1
  Title Word Score: 1
  NER Score: 1
  Sentiment Label: negative
  Sentiment Score: 0.7375519871711731
  Noun Score: 0.01596638655462185
  Dissimilarity Score: 11.198273026185321
  POS Score: 8
  Normalized Length Score: 0.55
  Overall Score: 23.501791399911117

  Clause: فأخذ سعد الكلب إلى بيته واعتنى به وبعد فترة طويلة وجد في الكلب الذي كان يسميه "ريكس" أفضل صديق له.
  SVO Score: 1
  Title Word Score: 2
  NER Score: 2
  Sentiment Label: positive
  Sentiment Score: 0.563670814037323
  Noun Score: 0.025490196078431376
  Dissimilarity Score: 11.323685866427804
  POS Score: 9
  Normalized Length Score: 1.0
  Overall Score: 26.912846876543558

Sentence: كان المزارع والكلب يقضيان اليوم معا وتعلم الكلب لغة صاحبه وبدأ يفهم كل حركاته وإشاراته وكان المزارع أ

# Test modification on POS/title words

In [None]:
#************************************************************************************************************************************************************
# Hessa's modification: changes to the title-word and POS scoring
#************************************************************************************************************************************************************


import stanza
from pyarabic.araby import sentence_tokenize, strip_tashkeel
import nltk
from nltk.stem.isri import ISRIStemmer
from transformers import pipeline
from collections import Counter
from nltk.tokenize import word_tokenize
import numpy as np


# Initialize the Arabic stemmer (shared by stem_tokens below)
stemmer = ISRIStemmer()
# Download the NLTK 'punkt' tokenizer data needed by word_tokenize
nltk.download('punkt')

# Initialize Stanza pipeline for Arabic.
# A single module-level instance is created here and reused by every function
# in this cell that parses text.
nlp = stanza.Pipeline(lang='ar', processors='tokenize,mwt,pos,lemma,depparse,ner')


# Stem a list of tokens
def stem_tokens(tokens):
    """Return a list with the ISRI stem of each token, in the given order."""
    stems = []
    for token in tokens:
        stems.append(stemmer.stem(token))
    return stems

# Calculate word probabilities in a story
def calculate_word_probability(story):
    """Map every stem found in the story to its relative frequency.

    The story is tokenized with ``word_tokenize`` and each token is stemmed;
    a stem's probability is its count divided by the total number of tokens.
    """
    stem_counts = Counter(stem_tokens(word_tokenize(story)))
    # Sum of all counts equals the number of stemmed tokens
    token_total = sum(stem_counts.values())
    return {stem: occurrences / token_total for stem, occurrences in stem_counts.items()}

# Calculate noun score for a clause
def calculate_noun_score(clause, word_probabilities, nlp):
    """Average story-level probability of the nouns in a clause.

    Args:
        clause: clause text to analyse.
        word_probabilities: mapping from stem to its probability in the story.
        nlp: Stanza pipeline used to POS-tag the clause.

    Returns:
        ``(stemmed_nouns, noun_score)``; the score is 0 when the clause has
        no nouns. Stems missing from ``word_probabilities`` count as 0.
    """
    clause_doc = nlp(clause)
    # Match the NOUN tag exactly: a prefix test on 'N' would also accept NUM
    # (numerals), which are not nouns.
    nouns = [word.text for sentence in clause_doc.sentences for word in sentence.words if word.upos == 'NOUN']

    stemmed_nouns = stem_tokens(nouns)

    # stem_tokens yields one stem per noun, so a single emptiness check suffices
    if not stemmed_nouns:
        return stemmed_nouns, 0

    noun_score = sum(word_probabilities.get(noun, 0) for noun in stemmed_nouns) / len(stemmed_nouns)
    return stemmed_nouns, noun_score


# Calculate score based on the presence of title words in the clause.
def calculate_title_word_score(clause, title, nlp):
    """Count the distinct content-word stems shared by the clause and the title.

    Only words tagged NOUN, X or ADJ are considered on both sides. Stems are
    de-duplicated, so a stem repeated in the clause counts once.
    """
    # 'X' is included because, in the authors' tests, Stanza assigned it to most
    # nouns and adjectives it failed to classify.
    content_tags = ['NOUN', 'X', 'ADJ']

    def distinct_content_stems(doc):
        # Set of stems of the words in `doc` whose UPOS tag is one of content_tags
        return set(stem_tokens([word.text for sent in doc.sentences for word in sent.words if word.upos in content_tags]))

    title_doc = nlp(title)
    clause_doc = nlp(clause)
    title_stems = distinct_content_stems(title_doc)
    clause_stems = distinct_content_stems(clause_doc)

    return len(clause_stems & title_stems)


# Tokenize text into sentences using Stanza
def stanza_sentence_tokenize(text):
    """Return the text of each sentence Stanza finds in ``text``, in order."""
    return [sentence.text for sentence in nlp(text).sentences]

def calculate_dissimilarity_matrix(story):
    """Pairwise stem dissimilarity between the clauses of a story.

    The story is split with ``sentence_tokenize`` and each clause is reduced
    to the set of stems of its whitespace-separated tokens. Entry ``[i][j]``
    is the number of stems found in exactly one of the two clauses divided by
    the number of stems found in either. The diagonal, and every pair that
    involves a blank clause, stays 0.
    """
    clauses = sentence_tokenize(story)
    stem_sets = [set(stem_tokens(clause.split())) for clause in clauses]
    has_text = [bool(clause.strip()) for clause in clauses]
    size = len(clauses)
    dissimilarity_matrix = [[0] * size for _ in range(size)]

    for row in range(size):
        if not has_text[row]:
            continue
        for col in range(size):
            if row == col or not has_text[col]:
                continue
            union_size = len(stem_sets[row] | stem_sets[col])
            if union_size:
                # Symmetric difference = stems unique to either clause
                dissimilarity_matrix[row][col] = len(stem_sets[row] ^ stem_sets[col]) / union_size

    return dissimilarity_matrix

# Calculate normalized clause length
def calculate_normalized_clause_length(clauses):
    """Return each clause's word count divided by the longest clause's count.

    Word counts come from whitespace splitting. If the list is empty the
    result is empty; if every clause is blank (longest count 0) all lengths
    are reported as 0.0 instead of raising ``ZeroDivisionError``.
    """
    clause_lengths = [len(clause.split()) for clause in clauses]
    # Fall back to 1 when there is nothing to normalize against
    max_clause_length = max(clause_lengths, default=0) or 1
    return [length / max_clause_length for length in clause_lengths]

# Calculate POS scores for clauses
def calculate_pos_scores(clauses, nlp):
    """Count, for each clause, the words tagged ADJ or VERB.

    Blank clauses score 0 and are not sent through the pipeline. The result
    has one entry per input clause, in order.
    """
    # In this variant only adjectives and verbs contribute to the POS score
    relevant_pos_tags = {'ADJ', 'VERB'}

    def count_relevant_words(clause):
        parsed = nlp(clause)
        return sum(word.upos in relevant_pos_tags for sentence in parsed.sentences for word in sentence.words)

    return [count_relevant_words(clause) if clause.strip() else 0 for clause in clauses]

# Process and score clauses in sentences
def process_and_score_clauses(sentences, title, nlp, sentiment_model, word_probabilities, title_stems, story):
    """Score every non-blank clause of the story on eight features.

    Each sentence is split into clauses with ``sentence_tokenize``. For every
    non-blank clause the SVO, title-word, NER, sentiment, noun, dissimilarity,
    POS and normalized-length scores are computed, printed, and appended as
    one row of the score matrix. The plain sum of the eight scores is also
    stored per clause text.

    Args:
        sentences: sentence strings of the story, in story order.
        title: story title, forwarded to ``calculate_title_word_score``.
        nlp: Stanza pipeline used to parse each clause.
        sentiment_model: callable returning ``[{'label': ..., 'score': ...}]``
            for a clause.
        word_probabilities: stem -> probability mapping used by the noun score.
        title_stems: accepted for interface compatibility; not used here.
        story: full story text, used to build the clause dissimilarity matrix.

    Returns:
        ``(scores_matrix, clause_scores)``: the list of 8-element score rows
        (one per non-blank clause) and a dict mapping clause text to its
        overall score. A clause text that occurs more than once keeps the
        score of its last occurrence.
    """
    scores_matrix = []
    clause_scores = {}
    meaningful_ner_labels = ['PER', 'ORG', 'LOC']

    # The dissimilarity matrix covers the clauses of the WHOLE story, so it is
    # built once and indexed with a story-wide clause counter. Indexing it with
    # the sentence-local position would make every sentence reuse the rows of
    # the first sentence.
    # Assumes the story-level clause sequence lines up with the concatenated
    # per-sentence clauses; rows that fall outside the matrix score 0.
    dissimilarity_matrix = calculate_dissimilarity_matrix(story)
    story_clause_index = 0

    for sentence in sentences:
        print(f"Sentence: {sentence}")
        clauses = sentence_tokenize(sentence)
        pos_scores = calculate_pos_scores(clauses, nlp)
        normalized_lengths = calculate_normalized_clause_length(clauses)

        for i, clause in enumerate(clauses):
            clause_row = story_clause_index
            story_clause_index += 1
            if not clause.strip():
                continue

            clause_doc = nlp(clause)
            title_word_score = calculate_title_word_score(clause, title, nlp)

            # Sentiment Analysis: neutral clauses contribute 0, others the model confidence
            sentiment_result = sentiment_model(clause)[0]
            sentiment_label = sentiment_result['label']
            sentiment_score = sentiment_result['score'] if sentiment_label != 'neutral' else 0

            # Nouns weight
            _, noun_score = calculate_noun_score(clause, word_probabilities, nlp)

            # SVO Analysis using Dependency Parsing
            # NOTE(review): the 'root' relation is treated as the verb whatever its POS tag.
            deprels = {word.deprel for sent in clause_doc.sentences for word in sent.words}
            has_subject = 'nsubj' in deprels
            has_verb = 'root' in deprels
            has_object = 'obj' in deprels or 'iobj' in deprels
            clause_svo_score = 1 if has_subject and has_verb and has_object else 0

            # NER Score: number of person / organisation / location entities
            ner_score = sum(ent.type in meaningful_ner_labels for sent in clause_doc.sentences for ent in sent.ents)

            # Dissimilarity, POS, and Length Scores
            if clause_row < len(dissimilarity_matrix):
                dissimilarity_score = sum(dissimilarity_matrix[clause_row])
            else:
                dissimilarity_score = 0
            pos_score = pos_scores[i]
            normalized_length = normalized_lengths[i]

            #overall score
            overall_score = (clause_svo_score + title_word_score + ner_score + sentiment_score
                             + noun_score + dissimilarity_score + pos_score + normalized_length)

            clause_scores[clause] = overall_score  # Store the overall score for each clause
            # Append the scores to the matrix
            scores_matrix.append([clause_svo_score, title_word_score, ner_score, sentiment_score,
                                  noun_score, dissimilarity_score, pos_score, normalized_length])

            # Print the scores for each clause
            print(f"  Clause: {clause}")
            print(f"  SVO Score: {clause_svo_score}")
            print(f"  Title Word Score: {title_word_score}")
            print(f"  NER Score: {ner_score}")
            print(f"  Sentiment Label: {sentiment_label}")
            print(f"  Sentiment Score: {sentiment_score}")
            print(f"  Noun Score: {noun_score}")
            print(f"  Dissimilarity Score: {dissimilarity_score}")
            print(f"  POS Score: {pos_score}")
            print(f"  Normalized Length Score: {normalized_length}")
            print(f"  Overall Score: {overall_score}\n")

    return scores_matrix, clause_scores

# Segment a story into sentences using Stanza and process each sentence's clauses
def calculate_clause_scores_arabic(story, title):
    """Score the clauses of an Arabic story against its title.

    The title words are stemmed, stem probabilities are computed over the
    story, the story is segmented into sentences with Stanza, and everything
    is handed to ``process_and_score_clauses``. Whatever that function
    returns is passed back unchanged (in this cell it returns a
    ``(scores_matrix, clause_scores)`` pair). Diacritics are not stripped here.
    """
    # Stems of every word of the title
    parsed_title = nlp(title)
    title_stems = set(stem_tokens(
        [token.text for title_sentence in parsed_title.sentences for token in title_sentence.words]
    ))

    # Probability of each stem within the story
    stem_probabilities = calculate_word_probability(story)

    # Arabic sentiment classifier
    sentiment_classifier = pipeline('sentiment-analysis', model='CAMeL-Lab/bert-base-arabic-camelbert-mix-sentiment')

    # Sentence segmentation followed by per-clause scoring
    story_sentences = stanza_sentence_tokenize(story)
    scoring_result = process_and_score_clauses(
        story_sentences, title, nlp, sentiment_classifier, stem_probabilities, title_stems, story
    )

    return scoring_result

# Example usage: sample story and title used by the rest of this cell
story = """
جلس رجل أعمى على رصيف في أحد الشوارع، ووضع الرجل الأعمى قبعته أمامه، وبجانب الرجل الأعمى لوحة مكتوب عليها: "أنا رجل أعمى، أرجوكم ساعدوني"، فمر رجل إعلانات بالشارع الذي يجلس فيه الأعمى، فوجد رجل الاعلانات أن قبعة الأعمى لا تحتوي سوى على القليل من المال، فوضع رجل الاعلانات بعض النقود في القبعة، ثم -ودون أن يستأذن الأعمى- أخذ رجل الإعلانات اللوحة التي بجانبه وكتب عليها عبارةً أخرى، ثم أعاد رجل الإعلانات اللوحة إلى مكانها بجانب الأعمى وغادر.

بدأ الأعمى يلاحظ أن قبعته امتلأت بالنقود، فعرف أن السبب هو ما فعله ذلك الرجل بلوحته، فسأل أحد المارة عما كتب على اللوحة، فكانت الآتي: "إنّنا في فصل الربيع، ولكنني لا أستطيع رؤية جماله!".

"""
title = "الإعلان والأعمى"

# Remove diacritics here, at module level: in this cell
# calculate_clause_scores_arabic does not strip them itself.
story = strip_tashkeel(story)
title = strip_tashkeel(title)

# normalize the decision matrix
def normalize_matrix(matrix):
    """Vector-normalize every column of the decision matrix.

    Each column is divided by its Euclidean norm plus a small epsilon, so an
    all-zero column yields zeros rather than NaN. The computation is done in
    floating point, so an integer-typed input is no longer truncated to 0
    when the normalized values are stored.

    Args:
        matrix: 2-D array-like, one row per clause and one column per feature.

    Returns:
        Float ndarray of the same shape holding the normalized values.
    """
    epsilon = 1e-10  # Small constant to prevent division by zero
    matrix = np.asarray(matrix, dtype=float)
    denominators = np.sqrt(np.sum(matrix**2, axis=0))

    # Broadcasting divides every column by its own (norm + epsilon)
    normalized_matrix = matrix / (denominators + epsilon)

    return normalized_matrix

def calculate_ideal_negative_ideal(normalized_matrix):
    """Return the column-wise maxima and minima of ``normalized_matrix``.

    The maxima form the positive ideal solution and the minima the negative
    ideal solution; both vectors are printed before being returned as
    ``(positive_ideal_solution, negative_ideal_solution)``.
    """
    column_best = np.max(normalized_matrix, axis=0)
    column_worst = np.min(normalized_matrix, axis=0)

    print(f"positive_ideal_solution : {column_best} \n negative_ideal_solution : {column_worst} \n")

    return column_best, column_worst

def calculate_separation_measures(normalized_matrix, positive_ideal_solution, negative_ideal_solution):
    """Euclidean distance of every row to the positive and negative ideals.

    Both distance vectors (one entry per row of ``normalized_matrix``) are
    printed and returned as ``(positive_separation, negative_separation)``.
    """
    def row_distances(reference):
        # Distance of each row to the given reference vector.
        offsets = normalized_matrix - reference
        return np.sqrt(np.sum(offsets**2, axis=1))

    to_best = row_distances(positive_ideal_solution)
    to_worst = row_distances(negative_ideal_solution)

    print(f"positive_separation : {to_best} \n negative_separation : {to_worst} \n")

    return to_best, to_worst

# Calculate relative closeness to the ideal solution
def calculate_relative_closeness(positive_separation, negative_separation):
    """Relative closeness of every alternative to the ideal solution.

    Computes ``negative / (positive + negative)`` element-wise. When both
    separations of an alternative are zero (it coincides with the positive
    and the negative ideal at once, e.g. a single alternative or identical
    rows) the ratio is undefined; such entries are reported as 0.0 instead
    of NaN.

    Args:
        positive_separation: distances to the positive ideal solution.
        negative_separation: distances to the negative ideal solution.

    Returns:
        Float array of closeness values in [0, 1]; it is also printed.
    """
    positive_separation = np.asarray(positive_separation, dtype=float)
    negative_separation = np.asarray(negative_separation, dtype=float)
    total_separation = positive_separation + negative_separation

    # Divide only where the denominator is non-zero; the pre-filled zeros
    # remain in place for the undefined 0/0 entries.
    relative_closeness = np.divide(
        negative_separation,
        total_separation,
        out=np.zeros_like(total_separation),
        where=total_separation != 0,
    )

    print(f"relative_closeness :\n {relative_closeness} \n")

    return relative_closeness

def topsis_ranking(decision_matrix, clause_scores):
    """Run the TOPSIS steps and map each clause to its relative closeness.

    The decision matrix is normalised, the ideal and negative-ideal solutions
    and the separations from them are computed, and the resulting closeness
    value of row ``i`` is assigned to the ``i``-th key of ``clause_scores``.

    NOTE(review): this pairing assumes the matrix rows were appended in the
    same order, and in the same number, as the keys were inserted - verify
    against the caller.

    Returns:
        dict mapping clause text to its TOPSIS relative-closeness score.
    """
    normalized = normalize_matrix(decision_matrix)
    ideal_best, ideal_worst = calculate_ideal_negative_ideal(normalized)
    distance_best, distance_worst = calculate_separation_measures(normalized, ideal_best, ideal_worst)
    closeness = calculate_relative_closeness(distance_best, distance_worst)

    # Pair the i-th clause key with the i-th closeness value
    return {
        clause: closeness[position]
        for position, clause in enumerate(clause_scores.keys())
    }



# # Perform TOPSIS ranking
# topsis_rank, relative_closeness = topsis_ranking(scores_matrix)

# # Print the ranked clauses along with their scores
# sentences = stanza_sentence_tokenize(story)
# all_clauses = []
# for sentence in sentences:
#     clauses = sentence_tokenize(sentence)
#     all_clauses.extend(clauses)

# for rank in topsis_rank:
#     clause = all_clauses[rank]
#     score = relative_closeness[rank]
#     print(f"Clause: {clause}\nScore: {score}\n")

# Score every clause of the example story: the call yields the list of
# per-clause feature rows and the clause -> overall-score dict.
scores_matrix, clause_scores = calculate_clause_scores_arabic(story, title)
# Convert the list of feature rows into an array for the TOPSIS computations.
scores_matrix = np.array(scores_matrix)

# Map every scored clause to its TOPSIS relative-closeness value.
topsis_scores = topsis_ranking(scores_matrix, clause_scores)

# Print sentences with their clauses and TOPSIS scores
sentences = stanza_sentence_tokenize(story)
for sentence in sentences:
    print(f"----------------------------------------------------")
    print(f"Sentence: {sentence}")
    clauses = sentence_tokenize(sentence)
    for clause in clauses:
        score = topsis_scores.get(clause,0)  # clauses without a TOPSIS entry are reported as 0
        print(f"  Clause: {clause}\n  TOPSIS Score: {score}\n")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json:   0%|   …

INFO:stanza:Loading these models for language: ar (Arabic):
| Processor | Package       |
-----------------------------
| tokenize  | padt          |
| mwt       | padt          |
| pos       | padt_charlm   |
| lemma     | padt_nocharlm |
| depparse  | padt_charlm   |
| ner       | aqmar_charlm  |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Loading: lemma
INFO:stanza:Loading: depparse
INFO:stanza:Loading: ner
INFO:stanza:Done loading processors!


Sentence: وجد المزارع سعد ذات يوم كلبا على وشك الموت من الجوع، فأخذ سعد الكلب إلى بيته واعتنى به وبعد فترة طويلة وجد في الكلب الذي كان يسميه "ريكس" أفضل صديق له.
  Clause: وجد المزارع سعد ذات يوم كلبا على وشك الموت من الجوع،
  SVO Score: 1
  Title Word Score: 1
  NER Score: 1
  Sentiment Label: negative
  Sentiment Score: 0.7375519871711731
  Noun Score: 0.01596638655462185
  Dissimilarity Score: 11.198273026185321
  POS Score: 1
  Normalized Length Score: 0.55
  Overall Score: 16.501791399911117

  Clause: فأخذ سعد الكلب إلى بيته واعتنى به وبعد فترة طويلة وجد في الكلب الذي كان يسميه "ريكس" أفضل صديق له.
  SVO Score: 1
  Title Word Score: 1
  NER Score: 2
  Sentiment Label: positive
  Sentiment Score: 0.563670814037323
  Noun Score: 0.025490196078431376
  Dissimilarity Score: 11.323685866427804
  POS Score: 7
  Normalized Length Score: 1.0
  Overall Score: 23.912846876543558

Sentence: كان المزارع والكلب يقضيان اليوم معا وتعلم الكلب لغة صاحبه وبدأ يفهم كل حركاته وإشاراته وكان المزارع أ

# Test modification on clause length


In [None]:
#************************************************************************************************************************************************************
# Hind's modification to the clause-length score: a clause whose length is at or above the average gets 1, a shorter one gets 0
#************************************************************************************************************************************************************

import stanza
from pyarabic.araby import sentence_tokenize, strip_tashkeel
import nltk
from nltk.stem.isri import ISRIStemmer
from transformers import pipeline
from collections import Counter
from nltk.tokenize import word_tokenize
import numpy as np


# Initialize the Arabic stemmer (NLTK ISRI); stem_tokens reads this module-level instance
stemmer = ISRIStemmer()
# Download the NLTK 'punkt' data - presumably required by word_tokenize, which
# calculate_word_probability calls
nltk.download('punkt')

# Initialize Stanza pipeline for Arabic once; the functions below reuse this
# module-level object (directly or as their `nlp` argument)
nlp = stanza.Pipeline(lang='ar', processors='tokenize,mwt,pos,lemma,depparse,ner')


def stem_tokens(tokens):
    """Return the stem of every token, in input order.

    Uses the module-level ``stemmer`` instance.
    """
    stems = []
    for token in tokens:
        stems.append(stemmer.stem(token))
    return stems

def calculate_word_probability(story):
    """Map every stem occurring in ``story`` to its relative frequency.

    The story is split with ``word_tokenize``, each token is stemmed with
    ``stem_tokens``, and the count of every distinct stem is divided by the
    total number of tokens.
    """
    stems = stem_tokens(word_tokenize(story))
    token_total = len(stems)

    probabilities = {}
    for stem, occurrences in Counter(stems).items():
        probabilities[stem] = occurrences / token_total
    return probabilities

# Calculate noun score for a clause
def calculate_noun_score(clause, word_probabilities, nlp):
    """Average story-level probability of the nouns found in ``clause``.

    Args:
        clause: clause text to analyse.
        word_probabilities: mapping of stem -> relative frequency in the story.
        nlp: callable that parses text into a document exposing
            ``sentences`` -> ``words`` with ``text`` and ``upos`` attributes.

    Returns:
        ``(stemmed_nouns, noun_score)``. When the clause holds no noun the
        result is ``([], 0)``. Stems missing from ``word_probabilities``
        contribute 0 to the average.
    """
    clause_doc = nlp(clause)
    # Compare against the NOUN tag exactly: a startswith('N') prefix test also
    # accepts NUM, so numerals were being scored as nouns.
    nouns = [word.text for sentence in clause_doc.sentences for word in sentence.words if word.upos == 'NOUN']

    if not nouns:
        return nouns, 0

    stemmed_nouns = stem_tokens(nouns)
    noun_score = sum(word_probabilities.get(noun, 0) for noun in stemmed_nouns) / len(stemmed_nouns)
    return stemmed_nouns, noun_score


def calculate_title_word_score(clause, title, nlp):
    """Count the distinct clause stems that also appear among the title stems.

    Both texts are parsed with ``nlp`` and only words tagged NOUN, X or ADJ
    are kept. X is included because, in the authors' tests, Stanza assigned
    it to most nouns and adjectives it failed to classify. Duplicate stems
    within the clause count once.
    """
    kept_tags = ['NOUN', 'X', 'ADJ']

    def kept_stems(text):
        # Set of stems of the NOUN/X/ADJ words in `text`.
        parsed = nlp(text)
        selected = [word.text for sent in parsed.sentences for word in sent.words if word.upos in kept_tags]
        return set(stem_tokens(selected))

    title_stems = kept_stems(title)
    clause_stems = kept_stems(clause)

    return len(clause_stems & title_stems)


def stanza_sentence_tokenize(text):
    """Split ``text`` into sentence strings with the module-level Stanza pipeline."""
    return [segment.text for segment in nlp(text).sentences]

def calculate_dissimilarity_matrix(story):
    """Pairwise dissimilarity between the clauses of ``story``.

    The story is split with ``sentence_tokenize`` and each clause is reduced
    to the set of stems of its whitespace-separated tokens. Entry ``[i][j]``
    is the number of stems found in exactly one of the two clauses divided by
    the number of stems found in either (the Jaccard distance). The diagonal
    and every pair involving a blank clause stay 0.

    Returns:
        A list of ``n`` lists of length ``n``, ``n`` being the clause count.
    """
    clauses = sentence_tokenize(story)
    stem_sets = [set(stem_tokens(clause.split())) for clause in clauses]
    is_blank = [not clause.strip() for clause in clauses]
    size = len(clauses)
    matrix = [[0] * size for _ in range(size)]

    for row in range(size):
        if is_blank[row]:
            continue
        for col in range(size):
            if col == row or is_blank[col]:
                continue
            union_size = len(stem_sets[row] | stem_sets[col])
            if union_size:
                # Symmetric difference = stems unique to either clause
                matrix[row][col] = len(stem_sets[row] ^ stem_sets[col]) / union_size

    return matrix

def calculate_normalized_clause_length(clauses):
    """Flag every clause as at-least-average length (1) or shorter (0).

    Length is the number of whitespace-separated tokens. The average is taken
    over the given clauses; an empty input yields an empty list.
    """
    word_counts = list(map(lambda text: len(text.split()), clauses))

    if word_counts:
        mean_count = sum(word_counts) / len(word_counts)
    else:
        mean_count = 1

    flags = []
    for count in word_counts:
        flags.append(int(count >= mean_count))
    return flags

def calculate_pos_scores(clauses, nlp):
    """Count the ADJ- and VERB-tagged words of every clause.

    Returns one count per input clause, in order; a blank clause is scored 0
    without being parsed.
    """
    counted_tags = {'ADJ', 'VERB'}

    def count_tagged(text):
        # Number of words in `text` whose UPOS tag is ADJ or VERB.
        total = 0
        for sent in nlp(text).sentences:
            for word in sent.words:
                if word.upos in counted_tags:
                    total += 1
        return total

    return [count_tagged(clause) if clause.strip() else 0 for clause in clauses]

# Process and score clauses in sentences
def process_and_score_clauses(sentences, title, nlp, sentiment_model, word_probabilities, title_stems, story):
    """Score every non-blank clause of the given sentences on eight features.

    Each sentence is split into clauses with ``sentence_tokenize``. For every
    clause the features below are computed, printed and stored:
    SVO completeness, title-word overlap, named-entity count, sentiment
    confidence, noun weight, dissimilarity to the other clauses of the story,
    ADJ/VERB count and the length flag.

    Args:
        sentences: sentence strings of the story.
        title: story title, forwarded to ``calculate_title_word_score``.
        nlp: callable parsing text into a document with ``sentences``,
            ``words`` (``deprel``) and ``ents`` (``text``, ``type``).
        sentiment_model: callable returning a list whose first item is a dict
            with ``'label'`` and ``'score'`` keys.
        word_probabilities: mapping of stem -> relative frequency.
        title_stems: kept for interface compatibility; not used here.
        story: full story text, used to build the dissimilarity matrix.

    Returns:
        ``(scores_matrix, clause_scores)``. ``scores_matrix`` holds one row
        ``[svo, title, ner, sentiment, noun, dissimilarity, pos, length]`` per
        scored clause; ``clause_scores`` maps clause text to the sum of that
        row. Row ``i`` always belongs to the ``i``-th key of ``clause_scores``.
    """
    scores_matrix = []
    clause_scores = {}

    # The story-level dissimilarity matrix does not depend on the sentence
    # being processed, so build it (and its row sums) once.
    dissimilarity_matrix = calculate_dissimilarity_matrix(story)
    dissimilarity_row_sums = [sum(row) for row in dissimilarity_matrix]

    # The matrix rows follow sentence_tokenize(story). Index them by clause
    # text so a clause is matched to its own row; its position inside the
    # current sentence is only correct for the first sentence.
    story_clause_rows = {}
    for row_index, story_clause in enumerate(sentence_tokenize(story)):
        story_clause_rows.setdefault(story_clause.strip(), row_index)

    meaningful_ner_labels = ['PER', 'ORG', 'LOC']

    for sentence in sentences:
        print(f"Sentence: {sentence}")
        clauses = sentence_tokenize(sentence)
        pos_scores = calculate_pos_scores(clauses, nlp)
        normalized_lengths = calculate_normalized_clause_length(clauses)

        for i, clause in enumerate(clauses):
            if not clause.strip():
                continue
            if clause in clause_scores:
                # Same clause text already scored: a second matrix row would have
                # no key of its own and would shift every later row/key pairing.
                continue

            clause_doc = nlp(clause)
            title_word_score = calculate_title_word_score(clause, title, nlp)

            # Sentiment Analysis: neutral clauses score 0, others use the model confidence
            sentiment_result = sentiment_model(clause)[0]
            sentiment_label = sentiment_result['label']
            sentiment_score = sentiment_result['score'] if sentiment_label != 'neutral' else 0

            # Nouns weight
            _, noun_score = calculate_noun_score(clause, word_probabilities, nlp)

            # SVO Analysis using Dependency Parsing
            # NOTE(review): has_verb is set by any word whose relation is 'root',
            # whatever its part of speech - confirm this is the intended test.
            has_subject, has_verb, has_object = False, False, False
            for sent in clause_doc.sentences:
                for word in sent.words:
                    if word.deprel == 'nsubj':
                        has_subject = True
                    elif word.deprel == 'root':
                        has_verb = True
                    elif word.deprel in ['obj', 'iobj']:
                        has_object = True
            clause_svo_score = 1 if has_subject and has_verb and has_object else 0

            # NER Score: number of PER/ORG/LOC entities in the clause
            ner_score = sum(
                ent.type in meaningful_ner_labels
                for sent in clause_doc.sentences
                for ent in sent.ents
            )

            # Dissimilarity: row sum of this clause's own row in the story matrix.
            # A clause that the story-level split does not contain gets 0.
            story_row = story_clause_rows.get(clause.strip())
            dissimilarity_score = dissimilarity_row_sums[story_row] if story_row is not None else 0

            # POS and Length Scores are computed per sentence, so index by i
            pos_score = pos_scores[i]
            normalized_length = normalized_lengths[i]

            #overall score
            overall_score = (clause_svo_score + title_word_score + ner_score + sentiment_score
                             + noun_score + dissimilarity_score + pos_score + normalized_length)

            clause_scores[clause] = overall_score  # Store the overall score for each clause
            # Append the scores to the matrix
            scores_matrix.append([clause_svo_score, title_word_score, ner_score, sentiment_score,
                                  noun_score, dissimilarity_score, pos_score, normalized_length])

            # Print the scores for each clause
            print(f"  Clause: {clause}")
            print(f"  SVO Score: {clause_svo_score}")
            print(f"  Title Word Score: {title_word_score}")
            print(f"  NER Score: {ner_score}")
            print(f"  Sentiment Label: {sentiment_label}")
            print(f"  Sentiment Score: {sentiment_score}")
            print(f"  Noun Score: {noun_score}")
            print(f"  Dissimilarity Score: {dissimilarity_score}")
            print(f"  POS Score: {pos_score}")
            print(f"  Normalized Length Score: {normalized_length}")
            print(f"  Overall Score: {overall_score}\n")

    return scores_matrix, clause_scores

def calculate_clause_scores_arabic(story, title):
    """Run the clause-scoring pipeline on one story.

    Diacritics are stripped from both inputs, the title words are stemmed,
    stem probabilities are computed over the story, the CAMeL-Lab sentiment
    model is loaded, and the story is split into sentences. All of that is
    handed to ``process_and_score_clauses``.

    Returns:
        Whatever ``process_and_score_clauses`` returns, i.e. the pair
        ``(scores_matrix, clause_scores)``.
    """
    # Remove diacritics
    story, title = strip_tashkeel(story), strip_tashkeel(title)

    # Stems of every word in the title
    title_words = []
    for title_sentence in nlp(title).sentences:
        for word in title_sentence.words:
            title_words.append(word.text)
    title_stems = set(stem_tokens(title_words))

    # Relative frequency of every stem in the story
    word_probabilities = calculate_word_probability(story)

    # Arabic sentiment classifier
    sentiment_model = pipeline(
        'sentiment-analysis',
        model='CAMeL-Lab/bert-base-arabic-camelbert-mix-sentiment',
    )

    # Sentence segmentation of the story
    sentences = stanza_sentence_tokenize(story)

    # Score every clause; the result is the (matrix, per-clause totals) pair
    return process_and_score_clauses(
        sentences, title, nlp, sentiment_model, word_probabilities, title_stems, story
    )

# Example usage: the story and title below are the inputs scored by
# calculate_clause_scores_arabic(story, title) further down in this cell.
story = """
وجد المزارع سعد ذات يوم كلبا على وشك الموت من الجوع، فأخذ سعد الكلب إلى بيته واعتنى به وبعد فترة طويلة وجد في الكلب الذي كان يسميه "ريكس" أفضل صديق له.

كان المزارع والكلب يقضيان اليوم معا وتعلم الكلب لغة صاحبه وبدأ يفهم كل حركاته وإشاراته وكان المزارع أيضا يفهم ما كان يريده الكلب.

وفي ذلك الوقت وصلت إلى المنطقة عصابة خطيرة من اللصوص، ففكر الفلاحون بوضع جرس في كل مزرعة يدق عند الشعور بالخطر وذلك للدفاع عن أنفسهم.

تسلح الجميع بالعصي وكانوا يأتون لمساعدة المزرعة المعتدى عليها.

وذات ليلة، بينما كان المزارع سعد يغط في نومه بسبب الجهد الكبير الذي بذله طوال اليوم، أحاط قطاع الطرق بالمنزل وحينها قفز الكلب إلى السقف ودق الجرس.

جاء الجيران بعد سماع الجرس حاملين العصي وكسروا ضلوع اللصوص وأجبروهم على الهروب.

وابتداء من تلك اللحظة، اعتبر كل سكان المنطقة الكلب صديقهم. ولم يتلق أي كلب ضربة حجر أو عصا من الأطفال الأشقياء لأنهم فهموا الاحترام الذي يستحقه كل من يساعد الإنسان بإخلاص ووفاء.

"""
title = "الكلب وقطاع الطرق"

# Remove diacritics (tashkeel) at module level as well: the printing loop at
# the end of this cell tokenizes this global `story`, while
# calculate_clause_scores_arabic strips its own copies internally.
story = strip_tashkeel(story)
title = strip_tashkeel(title)

def euclidean_normalization(matrix):
    """Divide every column of ``matrix`` by its Euclidean norm.

    A column whose norm is zero is divided by 1 instead, so it stays all-zero.
    """
    column_norms = np.sqrt(np.sum(np.square(matrix), axis=0))
    # Replace zero norms so the division below never hits 0/0
    column_norms[column_norms == 0] = 1
    return matrix / column_norms

def normalize_matrix(matrix):
    """Scale every column of the decision matrix by its Euclidean norm.

    A column whose norm is zero is divided by 1 instead, so it stays all-zero.
    The normalised matrix is printed and returned.
    """
    column_norms = np.sqrt(np.sum(np.square(matrix), axis=0))
    # Replace zero norms so the division below never hits 0/0
    column_norms[column_norms == 0] = 1
    scaled = matrix / column_norms

    print(f"\nnormalized_matrix :\n {scaled} \n")
    return scaled

def calculate_ideal_negative_ideal(normalized_matrix):
    """Return the column-wise maxima and minima of ``normalized_matrix``.

    The maxima form the positive ideal solution and the minima the negative
    ideal solution; both vectors are printed before being returned as
    ``(positive_ideal_solution, negative_ideal_solution)``.
    """
    column_best = np.max(normalized_matrix, axis=0)
    column_worst = np.min(normalized_matrix, axis=0)

    print(f"positive_ideal_solution : {column_best} \n negative_ideal_solution : {column_worst} \n")

    return column_best, column_worst

def calculate_separation_measures(normalized_matrix, positive_ideal_solution, negative_ideal_solution):
    """Euclidean distance of every row to the positive and negative ideals.

    Both distance vectors (one entry per row of ``normalized_matrix``) are
    printed and returned as ``(positive_separation, negative_separation)``.
    """
    def row_distances(reference):
        # Distance of each row to the given reference vector.
        offsets = normalized_matrix - reference
        return np.sqrt(np.sum(offsets**2, axis=1))

    to_best = row_distances(positive_ideal_solution)
    to_worst = row_distances(negative_ideal_solution)

    print(f"positive_separation : {to_best} \n negative_separation : {to_worst} \n")

    return to_best, to_worst

# Calculate relative closeness to the ideal solution
def calculate_relative_closeness(positive_separation, negative_separation):
    """Relative closeness of every alternative to the ideal solution.

    Computes ``negative / (positive + negative)`` element-wise. When both
    separations of an alternative are zero (it coincides with the positive
    and the negative ideal at once, e.g. a single alternative or identical
    rows) the ratio is undefined; such entries are reported as 0.0 instead
    of NaN.

    Args:
        positive_separation: distances to the positive ideal solution.
        negative_separation: distances to the negative ideal solution.

    Returns:
        Float array of closeness values in [0, 1]; it is also printed.
    """
    positive_separation = np.asarray(positive_separation, dtype=float)
    negative_separation = np.asarray(negative_separation, dtype=float)
    total_separation = positive_separation + negative_separation

    # Divide only where the denominator is non-zero; the pre-filled zeros
    # remain in place for the undefined 0/0 entries.
    relative_closeness = np.divide(
        negative_separation,
        total_separation,
        out=np.zeros_like(total_separation),
        where=total_separation != 0,
    )

    print(f"relative_closeness :\n {relative_closeness} \n")

    return relative_closeness

def topsis_ranking(decision_matrix, clause_scores):
    """Run the TOPSIS steps and map each clause to its relative closeness.

    The decision matrix is normalised, the ideal and negative-ideal solutions
    and the separations from them are computed, and the resulting closeness
    value of row ``i`` is assigned to the ``i``-th key of ``clause_scores``.

    NOTE(review): this pairing assumes the matrix rows were appended in the
    same order, and in the same number, as the keys were inserted - verify
    against the caller.

    Returns:
        dict mapping clause text to its TOPSIS relative-closeness score.
    """
    normalized = normalize_matrix(decision_matrix)
    ideal_best, ideal_worst = calculate_ideal_negative_ideal(normalized)
    distance_best, distance_worst = calculate_separation_measures(normalized, ideal_best, ideal_worst)
    closeness = calculate_relative_closeness(distance_best, distance_worst)

    # Pair the i-th clause key with the i-th closeness value
    return {
        clause: closeness[position]
        for position, clause in enumerate(clause_scores.keys())
    }



# # Perform TOPSIS ranking
# topsis_rank, relative_closeness = topsis_ranking(scores_matrix)

# # Print the ranked clauses along with their scores
# sentences = stanza_sentence_tokenize(story)
# all_clauses = []
# for sentence in sentences:
#     clauses = sentence_tokenize(sentence)
#     all_clauses.extend(clauses)

# for rank in topsis_rank:
#     clause = all_clauses[rank]
#     score = relative_closeness[rank]
#     print(f"Clause: {clause}\nScore: {score}\n")

# Score every clause of the example story: the call yields the list of
# per-clause feature rows and the clause -> overall-score dict.
scores_matrix, clause_scores = calculate_clause_scores_arabic(story, title)
scores_matrix = np.array(scores_matrix)

print(f"scores_matrix:{scores_matrix}")
topsis_scores = topsis_ranking(scores_matrix, clause_scores)

print(f"topsis_scores :{topsis_scores}")

# Print each sentence, then print its clauses in order of their score in TOPSIS
sentences = stanza_sentence_tokenize(story)
for sentence in sentences:
    print(f"----------------------------------------------------")
    print(f"Sentence: {sentence}")
    clauses = sentence_tokenize(sentence)

    # Pair each clause with its TOPSIS score (0 when the clause was never
    # scored) and sort best-first. A dedicated name is used so the
    # clause_scores dict returned above is not overwritten by a list, which
    # would break any re-run of topsis_ranking(scores_matrix, clause_scores).
    ranked_clauses = sorted(
        ((clause, topsis_scores.get(clause, 0)) for clause in clauses),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # Print the clauses in order of their TOPSIS scores
    for clause, score in ranked_clauses:
        print(f"  Clause: {clause}\n  TOPSIS Score: {score}\n")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json:   0%|   …

Downloading https://huggingface.co/stanfordnlp/stanza-ar/resolve/v1.6.0/models/tokenize/padt.pt:   0%|        …

Downloading https://huggingface.co/stanfordnlp/stanza-ar/resolve/v1.6.0/models/mwt/padt.pt:   0%|          | 0…

Downloading https://huggingface.co/stanfordnlp/stanza-ar/resolve/v1.6.0/models/pos/padt_charlm.pt:   0%|      …

Downloading https://huggingface.co/stanfordnlp/stanza-ar/resolve/v1.6.0/models/lemma/padt_nocharlm.pt:   0%|  …

Downloading https://huggingface.co/stanfordnlp/stanza-ar/resolve/v1.6.0/models/depparse/padt_charlm.pt:   0%| …

Downloading https://huggingface.co/stanfordnlp/stanza-ar/resolve/v1.6.0/models/ner/aqmar_charlm.pt:   0%|     …

Downloading https://huggingface.co/stanfordnlp/stanza-ar/resolve/v1.6.0/models/forward_charlm/ccwiki.pt:   0%|…

Downloading https://huggingface.co/stanfordnlp/stanza-ar/resolve/v1.6.0/models/pretrain/fasttextwiki.pt:   0%|…

Downloading https://huggingface.co/stanfordnlp/stanza-ar/resolve/v1.6.0/models/pretrain/conll17.pt:   0%|     …

Downloading https://huggingface.co/stanfordnlp/stanza-ar/resolve/v1.6.0/models/backward_charlm/ccwiki.pt:   0%…

INFO:stanza:Loading these models for language: ar (Arabic):
| Processor | Package       |
-----------------------------
| tokenize  | padt          |
| mwt       | padt          |
| pos       | padt_charlm   |
| lemma     | padt_nocharlm |
| depparse  | padt_charlm   |
| ner       | aqmar_charlm  |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Loading: lemma
INFO:stanza:Loading: depparse
INFO:stanza:Loading: ner
INFO:stanza:Done loading processors!


config.json:   0%|          | 0.00/842 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/86.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/305k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Sentence: وجد المزارع سعد ذات يوم كلبا على وشك الموت من الجوع، فأخذ سعد الكلب إلى بيته واعتنى به وبعد فترة طويلة وجد في الكلب الذي كان يسميه "ريكس" أفضل صديق له.
  Clause: وجد المزارع سعد ذات يوم كلبا على وشك الموت من الجوع،
  SVO Score: 1
  Title Word Score: 1
  NER Score: 1
  Sentiment Label: negative
  Sentiment Score: 0.7375519871711731
  Noun Score: 0.01596638655462185
  Dissimilarity Score: 11.198273026185321
  POS Score: 1
  Normalized Length Score: 0
  Overall Score: 15.951791399911116

  Clause: فأخذ سعد الكلب إلى بيته واعتنى به وبعد فترة طويلة وجد في الكلب الذي كان يسميه "ريكس" أفضل صديق له.
  SVO Score: 1
  Title Word Score: 1
  NER Score: 2
  Sentiment Label: positive
  Sentiment Score: 0.563670814037323
  Noun Score: 0.025490196078431376
  Dissimilarity Score: 11.323685866427804
  POS Score: 7
  Normalized Length Score: 1
  Overall Score: 23.912846876543558

Sentence: كان المزارع والكلب يقضيان اليوم معا وتعلم الكلب لغة صاحبه وبدأ يفهم كل حركاته وإشاراته وكان المزارع أيضا ي

In [None]:
!pip install topsispy
from topsispy import topsis
import numpy as np
import stanza
from pyarabic.araby import sentence_tokenize, strip_tashkeel
import nltk
from nltk.stem.isri import ISRIStemmer
from transformers import pipeline
from collections import Counter
from nltk.tokenize import word_tokenize
import numpy as np


# Initialize the Arabic stemmer (NLTK ISRI); stem_tokens reads this module-level instance
stemmer = ISRIStemmer()
# Download the NLTK 'punkt' data - presumably required by word_tokenize, which
# calculate_word_probability calls
nltk.download('punkt')

# Initialize Stanza pipeline for Arabic once; the functions below reuse this
# module-level object (directly or as their `nlp` argument)
nlp = stanza.Pipeline(lang='ar', processors='tokenize,mwt,pos,lemma,depparse,ner')


def stem_tokens(tokens):
    """Return the stem of every token, in input order.

    Uses the module-level ``stemmer`` instance.
    """
    stems = []
    for token in tokens:
        stems.append(stemmer.stem(token))
    return stems

def calculate_word_probability(story):
    """Map every stem occurring in ``story`` to its relative frequency.

    The story is split with ``word_tokenize``, each token is stemmed with
    ``stem_tokens``, and the count of every distinct stem is divided by the
    total number of tokens.
    """
    stems = stem_tokens(word_tokenize(story))
    token_total = len(stems)

    probabilities = {}
    for stem, occurrences in Counter(stems).items():
        probabilities[stem] = occurrences / token_total
    return probabilities

# Calculate noun score for a clause
def calculate_noun_score(clause, word_probabilities, nlp):
    """Average story-level probability of the nouns found in ``clause``.

    Args:
        clause: clause text to analyse.
        word_probabilities: mapping of stem -> relative frequency in the story.
        nlp: callable that parses text into a document exposing
            ``sentences`` -> ``words`` with ``text`` and ``upos`` attributes.

    Returns:
        ``(stemmed_nouns, noun_score)``. When the clause holds no noun the
        result is ``([], 0)``. Stems missing from ``word_probabilities``
        contribute 0 to the average.
    """
    clause_doc = nlp(clause)
    # Compare against the NOUN tag exactly: a startswith('N') prefix test also
    # accepts NUM, so numerals were being scored as nouns.
    nouns = [word.text for sentence in clause_doc.sentences for word in sentence.words if word.upos == 'NOUN']

    if not nouns:
        return nouns, 0

    stemmed_nouns = stem_tokens(nouns)
    noun_score = sum(word_probabilities.get(noun, 0) for noun in stemmed_nouns) / len(stemmed_nouns)
    return stemmed_nouns, noun_score


def calculate_title_word_score(clause, title, nlp):
    """Count the distinct clause stems that also appear among the title stems.

    Both texts are parsed with ``nlp`` and only words tagged NOUN, X or ADJ
    are kept. X is included because, in the authors' tests, Stanza assigned
    it to most nouns and adjectives it failed to classify. Duplicate stems
    within the clause count once.
    """
    kept_tags = ['NOUN', 'X', 'ADJ']

    def kept_stems(text):
        # Set of stems of the NOUN/X/ADJ words in `text`.
        parsed = nlp(text)
        selected = [word.text for sent in parsed.sentences for word in sent.words if word.upos in kept_tags]
        return set(stem_tokens(selected))

    title_stems = kept_stems(title)
    clause_stems = kept_stems(clause)

    return len(clause_stems & title_stems)


def stanza_sentence_tokenize(text):
    """Split ``text`` into sentence strings with the module-level Stanza pipeline."""
    return [segment.text for segment in nlp(text).sentences]

def calculate_dissimilarity_matrix(story):
    """Pairwise dissimilarity between the clauses of ``story``.

    The story is split with ``sentence_tokenize`` and each clause is reduced
    to the set of stems of its whitespace-separated tokens. Entry ``[i][j]``
    is the number of stems found in exactly one of the two clauses divided by
    the number of stems found in either (the Jaccard distance). The diagonal
    and every pair involving a blank clause stay 0.

    Returns:
        A list of ``n`` lists of length ``n``, ``n`` being the clause count.
    """
    clauses = sentence_tokenize(story)
    stem_sets = [set(stem_tokens(clause.split())) for clause in clauses]
    is_blank = [not clause.strip() for clause in clauses]
    size = len(clauses)
    matrix = [[0] * size for _ in range(size)]

    for row in range(size):
        if is_blank[row]:
            continue
        for col in range(size):
            if col == row or is_blank[col]:
                continue
            union_size = len(stem_sets[row] | stem_sets[col])
            if union_size:
                # Symmetric difference = stems unique to either clause
                matrix[row][col] = len(stem_sets[row] ^ stem_sets[col]) / union_size

    return matrix

def calculate_normalized_clause_length(clauses):
    """Flag every clause as at-least-average length (1) or shorter (0).

    Length is the number of whitespace-separated tokens. The average is taken
    over the given clauses; an empty input yields an empty list.
    """
    word_counts = list(map(lambda text: len(text.split()), clauses))

    if word_counts:
        mean_count = sum(word_counts) / len(word_counts)
    else:
        mean_count = 1

    flags = []
    for count in word_counts:
        flags.append(int(count >= mean_count))
    return flags

def calculate_pos_scores(clauses, nlp):
    """Count the ADJ- and VERB-tagged words of every clause.

    Returns one count per input clause, in order; a blank clause is scored 0
    without being parsed.
    """
    counted_tags = {'ADJ', 'VERB'}

    def count_tagged(text):
        # Number of words in `text` whose UPOS tag is ADJ or VERB.
        total = 0
        for sent in nlp(text).sentences:
            for word in sent.words:
                if word.upos in counted_tags:
                    total += 1
        return total

    return [count_tagged(clause) if clause.strip() else 0 for clause in clauses]

# Process and score clauses in sentences
def process_and_score_clauses(sentences, title, nlp, sentiment_model, word_probabilities, title_stems, story):
    """Score every non-blank clause of the given sentences on eight features.

    Each sentence is split into clauses with ``sentence_tokenize``. For every
    clause the features below are computed, printed and stored:
    SVO completeness, title-word overlap, named-entity count, sentiment
    confidence, noun weight, dissimilarity to the other clauses of the story,
    ADJ/VERB count and the length flag.

    Args:
        sentences: sentence strings of the story.
        title: story title, forwarded to ``calculate_title_word_score``.
        nlp: callable parsing text into a document with ``sentences``,
            ``words`` (``deprel``) and ``ents`` (``text``, ``type``).
        sentiment_model: callable returning a list whose first item is a dict
            with ``'label'`` and ``'score'`` keys.
        word_probabilities: mapping of stem -> relative frequency.
        title_stems: kept for interface compatibility; not used here.
        story: full story text, used to build the dissimilarity matrix.

    Returns:
        ``(scores_matrix, clause_scores)``. ``scores_matrix`` holds one row
        ``[svo, title, ner, sentiment, noun, dissimilarity, pos, length]`` per
        scored clause; ``clause_scores`` maps clause text to the sum of that
        row. Row ``i`` always belongs to the ``i``-th key of ``clause_scores``.
    """
    scores_matrix = []
    clause_scores = {}

    # The story-level dissimilarity matrix does not depend on the sentence
    # being processed, so build it (and its row sums) once.
    dissimilarity_matrix = calculate_dissimilarity_matrix(story)
    dissimilarity_row_sums = [sum(row) for row in dissimilarity_matrix]

    # The matrix rows follow sentence_tokenize(story). Index them by clause
    # text so a clause is matched to its own row; its position inside the
    # current sentence is only correct for the first sentence.
    story_clause_rows = {}
    for row_index, story_clause in enumerate(sentence_tokenize(story)):
        story_clause_rows.setdefault(story_clause.strip(), row_index)

    meaningful_ner_labels = ['PER', 'ORG', 'LOC']

    for sentence in sentences:
        print(f"Sentence: {sentence}")
        clauses = sentence_tokenize(sentence)
        pos_scores = calculate_pos_scores(clauses, nlp)
        normalized_lengths = calculate_normalized_clause_length(clauses)

        for i, clause in enumerate(clauses):
            if not clause.strip():
                continue
            if clause in clause_scores:
                # Same clause text already scored: a second matrix row would have
                # no key of its own and would shift every later row/key pairing.
                continue

            clause_doc = nlp(clause)
            title_word_score = calculate_title_word_score(clause, title, nlp)

            # Sentiment Analysis: neutral clauses score 0, others use the model confidence
            sentiment_result = sentiment_model(clause)[0]
            sentiment_label = sentiment_result['label']
            sentiment_score = sentiment_result['score'] if sentiment_label != 'neutral' else 0

            # Nouns weight
            _, noun_score = calculate_noun_score(clause, word_probabilities, nlp)

            # SVO Analysis using Dependency Parsing
            # NOTE(review): has_verb is set by any word whose relation is 'root',
            # whatever its part of speech - confirm this is the intended test.
            has_subject, has_verb, has_object = False, False, False
            for sent in clause_doc.sentences:
                for word in sent.words:
                    if word.deprel == 'nsubj':
                        has_subject = True
                    elif word.deprel == 'root':
                        has_verb = True
                    elif word.deprel in ['obj', 'iobj']:
                        has_object = True
            clause_svo_score = 1 if has_subject and has_verb and has_object else 0

            # NER Score: number of PER/ORG/LOC entities in the clause
            ner_score = sum(
                ent.type in meaningful_ner_labels
                for sent in clause_doc.sentences
                for ent in sent.ents
            )

            # Dissimilarity: row sum of this clause's own row in the story matrix.
            # A clause that the story-level split does not contain gets 0.
            story_row = story_clause_rows.get(clause.strip())
            dissimilarity_score = dissimilarity_row_sums[story_row] if story_row is not None else 0

            # POS and Length Scores are computed per sentence, so index by i
            pos_score = pos_scores[i]
            normalized_length = normalized_lengths[i]

            #overall score
            overall_score = (clause_svo_score + title_word_score + ner_score + sentiment_score
                             + noun_score + dissimilarity_score + pos_score + normalized_length)

            clause_scores[clause] = overall_score  # Store the overall score for each clause
            # Append the scores to the matrix
            scores_matrix.append([clause_svo_score, title_word_score, ner_score, sentiment_score,
                                  noun_score, dissimilarity_score, pos_score, normalized_length])

            # Print the scores for each clause
            print(f"  Clause: {clause}")
            print(f"  SVO Score: {clause_svo_score}")
            print(f"  Title Word Score: {title_word_score}")
            print(f"  NER Score: {ner_score}")
            print(f"  Sentiment Label: {sentiment_label}")
            print(f"  Sentiment Score: {sentiment_score}")
            print(f"  Noun Score: {noun_score}")
            print(f"  Dissimilarity Score: {dissimilarity_score}")
            print(f"  POS Score: {pos_score}")
            print(f"  Normalized Length Score: {normalized_length}")
            print(f"  Overall Score: {overall_score}\n")

    return scores_matrix, clause_scores

def calculate_clause_scores_arabic(story, title):
    """Run the clause-scoring pipeline on one story.

    Diacritics are stripped from both inputs, the title words are stemmed,
    stem probabilities are computed over the story, the CAMeL-Lab sentiment
    model is loaded, and the story is split into sentences. All of that is
    handed to ``process_and_score_clauses``.

    Returns:
        Whatever ``process_and_score_clauses`` returns, i.e. the pair
        ``(scores_matrix, clause_scores)``.
    """
    # Remove diacritics
    story, title = strip_tashkeel(story), strip_tashkeel(title)

    # Stems of every word in the title
    title_words = []
    for title_sentence in nlp(title).sentences:
        for word in title_sentence.words:
            title_words.append(word.text)
    title_stems = set(stem_tokens(title_words))

    # Relative frequency of every stem in the story
    word_probabilities = calculate_word_probability(story)

    # Arabic sentiment classifier
    sentiment_model = pipeline(
        'sentiment-analysis',
        model='CAMeL-Lab/bert-base-arabic-camelbert-mix-sentiment',
    )

    # Sentence segmentation of the story
    sentences = stanza_sentence_tokenize(story)

    # Score every clause; the result is the (matrix, per-clause totals) pair
    return process_and_score_clauses(
        sentences, title, nlp, sentiment_model, word_probabilities, title_stems, story
    )

# Example usage: a sample Arabic story and its title, passed to
# calculate_clause_scores_arabic() further down in this cell.
story = """
جلس رجل أعمى على رصيف في أحد الشوارع، ووضع الرجل الأعمى قبعته أمامه، وبجانب الرجل الأعمى لوحة مكتوب عليها: "أنا رجل أعمى، أرجوكم ساعدوني"، فمر رجل إعلانات بالشارع الذي يجلس فيه الأعمى، فوجد رجل الاعلانات أن قبعة الأعمى لا تحتوي سوى على القليل من المال، فوضع رجل الاعلانات بعض النقود في القبعة، ثم -ودون أن يستأذن الأعمى- أخذ رجل الإعلانات اللوحة التي بجانبه وكتب عليها عبارةً أخرى، ثم أعاد رجل الإعلانات اللوحة إلى مكانها بجانب الأعمى وغادر.

بدأ الأعمى يلاحظ أن قبعته امتلأت بالنقود، فعرف أن السبب هو ما فعله ذلك الرجل بلوحته، فسأل أحد المارة عما كتب على اللوحة، فكانت الآتي: "إنّنا في فصل الربيع، ولكنني لا أستطيع رؤية جماله!".

"""
title = "الإعلان والأعمى"


# normalize the decision matrix
def normalize_matrix(matrix):
    """Vector-normalize every column of a decision matrix.

    Each column is divided by its Euclidean norm. A column whose norm is
    zero (all entries zero) is left as zeros instead of being divided.

    Args:
        matrix: 2-D array-like of criterion scores, one row per alternative
            and one column per criterion. Lists and integer arrays are
            accepted.

    Returns:
        A float ``numpy.ndarray`` with the same shape as ``matrix``.
    """
    # Force a float array: with an integer input ``np.zeros_like`` would
    # create an integer result and silently truncate every ratio to 0.
    matrix = np.asarray(matrix, dtype=float)
    denominators = np.sqrt(np.sum(matrix**2, axis=0))

    # Divide only where the column norm is non-zero; the remaining entries
    # keep the 0 they were initialised with in ``out``.
    normalized_matrix = np.divide(
        matrix,
        denominators,
        out=np.zeros_like(matrix),
        where=denominators != 0,
    )

    print(f"\nnormalized_matrix :\n {normalized_matrix} \n")
    return normalized_matrix

# Calculate ideal and negative-ideal solutions
def calculate_ideal_negative_ideal(normalized_matrix):
    """Pick the best and the worst value of every criterion.

    Args:
        normalized_matrix: 2-D array of scores, one column per criterion.

    Returns:
        ``(positive_ideal_solution, negative_ideal_solution)``: the
        column-wise maxima and the column-wise minima.
    """
    # The maximum of each column is taken as the ideal value and the
    # minimum as the negative-ideal value.
    column_axis = 0
    positive_ideal_solution = np.max(normalized_matrix, axis=column_axis)
    negative_ideal_solution = np.min(normalized_matrix, axis=column_axis)

    print(f"positive_ideal_solution : {positive_ideal_solution} \n negative_ideal_solution : {negative_ideal_solution} \n")

    ideal_pair = (positive_ideal_solution, negative_ideal_solution)
    return ideal_pair

# Calculate separation measures for each alternative
def calculate_separation_measures(normalized_matrix, positive_ideal_solution, negative_ideal_solution):
    """Measure how far each alternative lies from the two reference points.

    Args:
        normalized_matrix: 2-D array, one row per alternative.
        positive_ideal_solution: Reference row holding the ideal values.
        negative_ideal_solution: Reference row holding the negative-ideal values.

    Returns:
        ``(positive_separation, negative_separation)``: for every row, the
        Euclidean distance to the positive and to the negative ideal.
    """
    def euclidean_distance_to(reference):
        # Row-wise distance between each alternative and the reference row
        offsets = normalized_matrix - reference
        return np.sqrt(np.sum(offsets**2, axis=1))

    positive_separation = euclidean_distance_to(positive_ideal_solution)
    negative_separation = euclidean_distance_to(negative_ideal_solution)

    print(f"positive_separation : {positive_separation} \n negative_separation : {negative_separation} \n")

    return positive_separation, negative_separation

# Calculate relative closeness to the ideal solution
def calculate_relative_closeness(positive_separation, negative_separation):
    """Compute each alternative's relative closeness to the ideal solution.

    The closeness is ``negative / (positive + negative)``: 1 when the
    alternative coincides with the ideal, 0 when it coincides with the
    negative ideal.

    Args:
        positive_separation: Distances to the positive ideal solution.
        negative_separation: Distances to the negative ideal solution.

    Returns:
        Float array of closeness values. An alternative whose two distances
        are both zero (e.g. a single alternative, or all rows identical)
        gets 0.0 instead of NaN.
    """
    positive_separation = np.asarray(positive_separation, dtype=float)
    negative_separation = np.asarray(negative_separation, dtype=float)
    total_separation = positive_separation + negative_separation

    # Skip the division where the denominator is zero so that 0/0 does not
    # produce NaN (which would make any later sorting unreliable).
    relative_closeness = np.divide(
        negative_separation,
        total_separation,
        out=np.zeros_like(total_separation),
        where=total_separation != 0,
    )

    print(f"relative_closeness :\n {relative_closeness} \n")

    return relative_closeness

# Perform TOPSIS ranking
def topsis_ranking(decision_matrix, clause_scores):
    """Rank clauses with unweighted TOPSIS.

    Args:
        decision_matrix: 2-D array-like of criterion scores, one row per
            clause. A plain list of lists is accepted.
        clause_scores: Dict keyed by clause text. Only the keys and their
            order are used: the i-th key is paired with row i of
            ``decision_matrix``.

    Returns:
        Dict mapping each clause to its relative closeness to the ideal
        solution. Empty when ``decision_matrix`` has no entries.
    """
    # The scoring step builds the matrix as a list of lists that mixes
    # integer counts and floats; the helpers below need a float ndarray.
    decision_matrix = np.asarray(decision_matrix, dtype=float)
    if decision_matrix.size == 0:
        return {}

    normalized_matrix = normalize_matrix(decision_matrix)
    positive_ideal_solution, negative_ideal_solution = calculate_ideal_negative_ideal(normalized_matrix)
    positive_separation, negative_separation = calculate_separation_measures(normalized_matrix, positive_ideal_solution, negative_ideal_solution)
    relative_closeness = calculate_relative_closeness(positive_separation, negative_separation)

    # Map every clause to the TOPSIS score of the matching matrix row
    return {clause: relative_closeness[i] for i, clause in enumerate(clause_scores.keys())}



# # Perform TOPSIS ranking
# topsis_rank, relative_closeness = topsis_ranking(scores_matrix)

# # Print the ranked clauses along with their scores
# sentences = stanza_sentence_tokenize(story)
# all_clauses = []
# for sentence in sentences:
#     clauses = sentence_tokenize(sentence)
#     all_clauses.extend(clauses)

# for rank in topsis_rank:
#     clause = all_clauses[rank]
#     score = relative_closeness[rank]
#     print(f"Clause: {clause}\nScore: {score}\n")

scores_matrix, clause_scores = calculate_clause_scores_arabic(story, title)
scores_matrix = np.asarray(scores_matrix, dtype=float)

# Vector-normalize every criterion column. A criterion that is zero for every
# clause has a zero norm and dividing by it would fill the column with NaN,
# so such columns are divided by 1 instead and stay all-zero.
column_norms = np.linalg.norm(scores_matrix, axis=0)
column_norms[column_norms == 0] = 1.0
normalized_data = scores_matrix / column_norms

# Every criterion gets the same weight
num_criteria = normalized_data.shape[1]
weights = [1 / num_criteria] * num_criteria
# Define the significance of each criterion (1 for profit, -1 for cost)
significance = [1] * num_criteria
# Use the topsis function
# NOTE(review): `topsis` is not defined in this part of the notebook;
# presumably it is imported or defined in an earlier cell - confirm.
result = topsis(normalized_data, weights, significance)

# Print the result
print(result)





[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json:   0%|   …

INFO:stanza:Loading these models for language: ar (Arabic):
| Processor | Package       |
-----------------------------
| tokenize  | padt          |
| mwt       | padt          |
| pos       | padt_charlm   |
| lemma     | padt_nocharlm |
| depparse  | padt_charlm   |
| ner       | aqmar_charlm  |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Loading: lemma
INFO:stanza:Loading: depparse
INFO:stanza:Loading: ner
INFO:stanza:Done loading processors!


Sentence: جلس رجل أعمى على رصيف في أحد الشوارع، ووضع الرجل الأعمى قبعته أمامه، وبجانب الرجل الأعمى لوحة مكتوب عليها: "أنا رجل أعمى، أرجوكم ساعدوني"، فمر رجل إعلانات بالشارع الذي يجلس فيه الأعمى، فوجد رجل الاعلانات أن قبعة الأعمى لا تحتوي سوى على القليل من المال، فوضع رجل الاعلانات بعض النقود في القبعة، ثم -ودون أن يستأذن الأعمى- أخذ رجل الإعلانات اللوحة التي بجانبه وكتب عليها عبارة أخرى، ثم أعاد رجل الإعلانات اللوحة إلى مكانها بجانب الأعمى وغادر.
  Clause: جلس رجل أعمى على رصيف في أحد الشوارع،
  SVO Score: 0
  Title Word Score: 1
  NER Score: 0
  Sentiment Label: neutral
  Sentiment Score: 0
  Noun Score: 0.028925619834710745
  Dissimilarity Score: 13.539006826506826
  POS Score: 2
  Normalized Length Score: 1
  Overall Score: 17.567932446341537

  Clause: ووضع الرجل الأعمى قبعته أمامه،
  SVO Score: 1
  Title Word Score: 1
  NER Score: 0
  Sentiment Label: neutral
  Sentiment Score: 0
  Noun Score: 0.05371900826446281
  Dissimilarity Score: 13.486368310052521
  POS Score: 2
  Normalize

  normalized_data = scores_matrix / np.linalg.norm(scores_matrix, axis=0)


# 26 Nov meeting notes

In [None]:
#************************************************************************************************************************************************************

#************************************************************************************************************************************************************
import stanza
from pyarabic.araby import sentence_tokenize, strip_tashkeel
import nltk
from nltk.stem.isri import ISRIStemmer
from transformers import pipeline
from collections import Counter
from nltk.tokenize import word_tokenize
import numpy as np


# Tokenizer data for nltk (presumably what word_tokenize relies on)
nltk.download('punkt')

# Initialize the Arabic stemmer
stemmer = ISRIStemmer()

# Initialize Stanza pipeline for Arabic: tokenization, multi-word tokens,
# POS tags, lemmas, dependency parse and named entities
nlp = stanza.Pipeline(
    lang='ar',
    processors='tokenize,mwt,pos,lemma,depparse,ner',
)


# Stem a list of tokens
def stem_tokens(tokens):
    """Return the stem of every token, in the same order as the input."""
    return list(map(stemmer.stem, tokens))

# Calculate word probabilities in a story
def calculate_word_probability(story):
    """Estimate how frequent each stem is within the story.

    Args:
        story: Full text of the story.

    Returns:
        Dict mapping each stem to ``occurrences / total number of tokens``.
        Empty when the story yields no tokens.
    """
    stems = stem_tokens(word_tokenize(story))
    stem_total = len(stems)

    probabilities = {}
    for stem, occurrences in Counter(stems).items():
        probabilities[stem] = occurrences / stem_total
    return probabilities

# Calculate noun score for a clause
def calculate_noun_score(clause, word_probabilities, nlp):
    """Average the story-level probability of the nouns found in a clause.

    Args:
        clause: Clause text.
        word_probabilities: Dict mapping stem -> probability; stems missing
            from the dict count as 0.
        nlp: Callable that parses text into a document exposing
            ``sentences[*].words[*]`` with ``text`` and ``upos``.

    Returns:
        ``(stemmed_nouns, noun_score)``. When the clause contains no noun the
        result is ``([], 0)``.
    """
    clause_doc = nlp(clause)
    # NOTE(review): the 'N' prefix matches the UPOS tags NOUN and NUM but not
    # PROPN - confirm that numerals are meant to be counted here.
    nouns = [word.text for sentence in clause_doc.sentences for word in sentence.words if word.upos.startswith('N')]

    # Nothing to stem or average; also avoids dividing by zero below.
    if not nouns:
        return nouns, 0

    stemmed_nouns = stem_tokens(nouns)
    noun_score = sum(word_probabilities.get(noun, 0) for noun in stemmed_nouns) / len(stemmed_nouns)
    return stemmed_nouns, noun_score

# Calculate score based on the presence of title words in the clause.
def calculate_title_word_score(clause, title, nlp):
    """Count the distinct content-word stems a clause shares with the title.

    Args:
        clause: Clause text.
        title: Story title.
        nlp: Callable that parses text into a document exposing
            ``sentences[*].words[*]`` with ``text`` and ``upos``.

    Returns:
        Number of distinct stems that occur in both the clause and the title.
    """
    # Type X is included because, in the authors' tests, Stanza assigned it to
    # most nouns and adjectives it failed to classify.
    content_tags = ['NOUN', 'X', 'ADJ']

    def content_stems(text):
        # Distinct stems of the nouns / adjectives / X-tagged words in `text`
        doc = nlp(text)
        return set(stem_tokens([word.text for sent in doc.sentences for word in sent.words if word.upos in content_tags]))

    title_stems = content_stems(title)
    clause_stems = content_stems(clause)

    return len(clause_stems & title_stems)

# Tokenize text into sentences using Stanza
def stanza_sentence_tokenize(text):
    """Return the text of every sentence Stanza finds in ``text``, in order."""
    return [parsed_sentence.text for parsed_sentence in nlp(text).sentences]

def calculate_dissimilarity_matrix(story):
    """Build the pairwise dissimilarity matrix of the story's clauses.

    The story is split into clauses with ``sentence_tokenize``; every clause
    is reduced to the set of its whitespace-token stems. The dissimilarity of
    two clauses is the number of stems that occur in only one of them divided
    by the number of stems that occur in either.

    Args:
        story: Full text of the story.

    Returns:
        ``n x n`` list of lists (``n`` = number of clauses). The diagonal and
        every entry involving a blank clause are 0.
    """
    clauses = sentence_tokenize(story)
    # Build each clause's stem set once instead of once per pair.
    stem_sets = [set(stem_tokens(clause.split())) for clause in clauses]
    n = len(stem_sets)
    dissimilarity_matrix = [[0 for _ in range(n)] for _ in range(n)]

    for i in range(n):
        # The measure is symmetric, so each unordered pair is computed once
        # and written to both cells.
        for j in range(i + 1, n):
            set_i, set_j = stem_sets[i], stem_sets[j]
            # A blank clause has no tokens, hence an empty stem set; it keeps
            # a dissimilarity of 0 with everything.
            if not set_i or not set_j:
                continue
            dissimilarity = len(set_i ^ set_j) / len(set_i | set_j)
            dissimilarity_matrix[i][j] = dissimilarity
            dissimilarity_matrix[j][i] = dissimilarity

    return dissimilarity_matrix

# Calculate normalized clause length
def calculate_normalized_clause_length(clauses):
    """Express each clause's word count as a fraction of the longest clause.

    Args:
        clauses: List of clause strings.

    Returns:
        List of floats in ``[0, 1]``, one per clause, in input order. An
        empty input gives an empty list; if every clause is blank all values
        are 0.0.
    """
    clause_lengths = [len(clause.split()) for clause in clauses]
    # `or 1` covers both an empty list and the all-blank case (max == 0),
    # which would otherwise divide by zero.
    max_clause_length = max(clause_lengths, default=0) or 1
    return [length / max_clause_length for length in clause_lengths]

# Calculate POS scores for clauses
def calculate_pos_scores(clauses, nlp):
    """Count the adjectives and verbs in each clause.

    Args:
        clauses: List of clause strings.
        nlp: Callable that parses text into a document exposing
            ``sentences[*].words[*].upos``.

    Returns:
        List with one count per clause, in input order; blank clauses are
        not parsed and score 0.
    """
    relevant_pos_tags = {'ADJ', 'VERB'}

    def count_relevant_words(clause):
        if not clause.strip():
            return 0
        parsed_words = (word for sentence in nlp(clause).sentences for word in sentence.words)
        return sum(1 for word in parsed_words if word.upos in relevant_pos_tags)

    return [count_relevant_words(clause) for clause in clauses]

# Process and score clauses in sentences
def process_and_score_clauses(sentences, title, nlp, sentiment_model, word_probabilities, title_stems, story):
    """Score every non-blank clause of every sentence on eight criteria.

    Args:
        sentences: Sentence strings in story order.
        title: Story title, forwarded to ``calculate_title_word_score``.
        nlp: Callable that parses text into a document exposing
            ``sentences[*].words`` (with ``deprel``) and ``sentences[*].ents``
            (with ``type``).
        sentiment_model: Callable returning a list whose first item is a dict
            with the keys ``'label'`` and ``'score'``.
        word_probabilities: Dict stem -> probability, forwarded to
            ``calculate_noun_score``.
        title_stems: Not used; kept so existing callers keep working.
        story: Not used; kept so existing callers keep working. Dissimilarity
            is computed over the clauses of ``sentences`` so that every
            clause is matched with its own value.

    Returns:
        ``(scores_matrix, clause_scores)``:
        ``scores_matrix`` is a list of rows
        ``[svo, ner, sentiment, title_word, noun, dissimilarity, pos,
        normalized_length]``, one row per distinct clause in first-seen order;
        ``clause_scores`` maps clause text to the sum of those eight values,
        with keys in the same order as the rows.
    """
    meaningful_ner_labels = ['PER', 'ORG', 'LOC']
    scores_matrix = []
    clause_scores = {}

    # Split every sentence up front: a clause's dissimilarity is measured
    # against all other clauses of the story, so the full list is needed
    # before any clause can be scored. This is also done once instead of
    # once per sentence.
    clauses_by_sentence = [sentence_tokenize(sentence) for sentence in sentences]
    dissimilarity_scores = _sum_clause_dissimilarities(
        [clause for clauses in clauses_by_sentence for clause in clauses]
    )

    story_index = 0  # position of the current clause among all clauses
    for sentence, clauses in zip(sentences, clauses_by_sentence):
        print(f"Sentence: {sentence}")
        pos_scores = calculate_pos_scores(clauses, nlp)
        normalized_lengths = calculate_normalized_clause_length(clauses)

        for i, clause in enumerate(clauses):
            # Story-wide index for the dissimilarity lookup; `i` is only
            # valid for the per-sentence lists (POS and length).
            dissimilarity_score = dissimilarity_scores[story_index]
            story_index += 1
            if not clause.strip():
                continue

            clause_doc = nlp(clause)
            title_word_score = calculate_title_word_score(clause, title, nlp)

            # Sentiment Analysis: the model's confidence counts only when the
            # clause is not neutral
            sentiment_result = sentiment_model(clause)[0]
            sentiment_label = sentiment_result['label']
            sentiment_score = sentiment_result['score'] if sentiment_label != 'neutral' else 0

            # Nouns weight
            _, noun_score = calculate_noun_score(clause, word_probabilities, nlp)

            # SVO Analysis using Dependency Parsing
            # NOTE(review): 'root' is used as the verb indicator; a dependency
            # tree presumably always has a root, which would make has_verb
            # always true - confirm whether a VERB check was intended.
            deprels = {word.deprel for sent in clause_doc.sentences for word in sent.words}
            has_subject = 'nsubj' in deprels
            has_verb = 'root' in deprels
            has_object = 'obj' in deprels or 'iobj' in deprels
            clause_svo_score = 1 if has_subject and has_verb and has_object else 0

            # NER Score: number of person / organisation / location entities
            ner_score = sum(ent.type in meaningful_ner_labels for sent in clause_doc.sentences for ent in sent.ents)

            # POS and Length Scores (per-sentence lists)
            pos_score = pos_scores[i]
            normalized_length = normalized_lengths[i]

            #overall score
            overall_score= clause_svo_score + title_word_score + ner_score + sentiment_score + noun_score + dissimilarity_score + pos_score + normalized_length

            # clause_scores is keyed by clause text, so a repeated clause has
            # a single key. Only the first occurrence gets a matrix row; this
            # keeps row i and key i referring to the same clause, which
            # topsis_ranking relies on.
            if clause not in clause_scores:
                clause_scores[clause] = overall_score  # Store the overall score for each clause
                # Append the scores to the matrix
                scores_matrix.append([clause_svo_score,ner_score,sentiment_score,title_word_score,
                                    noun_score,dissimilarity_score,pos_score, normalized_length])

            # Print the scores for each clause
            print(f"  Clause: {clause}")
            print(f"  SVO Score: {clause_svo_score}")
            print(f"  Title Word Score: {title_word_score}")
            print(f"  NER Score: {ner_score}")
            print(f"  Sentiment Label: {sentiment_label}")
            print(f"  Sentiment Score: {sentiment_score}")
            print(f"  Noun Score: {noun_score}")
            print(f"  Dissimilarity Score: {dissimilarity_score}")
            print(f"  POS Score: {pos_score}")
            print(f"  Normalized Length Score: {normalized_length}")
            print(f"  Overall Score: {overall_score}\n")

    return scores_matrix, clause_scores


# Sum, for every clause, its dissimilarity to all other clauses
def _sum_clause_dissimilarities(clauses):
    """Return one summed dissimilarity per clause, in input order.

    Two clauses are compared through the sets of their whitespace-token
    stems: stems found in only one of them divided by stems found in either.
    Blank clauses contribute nothing and score 0.
    """
    stem_sets = [set(stem_tokens(clause.split())) for clause in clauses]
    totals = []
    for i, stems_i in enumerate(stem_sets):
        total = 0
        for j, stems_j in enumerate(stem_sets):
            if i == j or not stems_i or not stems_j:
                continue
            total += len(stems_i ^ stems_j) / len(stems_i | stems_j)
        totals.append(total)
    return totals

# Segment a story into sentences using Stanza and process each sentence's clauses
def calculate_clause_scores_arabic(story, title):
    """Score every clause of an Arabic story.

    The story is split into sentences with Stanza and the sentences are
    handed to ``process_and_score_clauses`` together with the supporting
    models. The inputs are used as given (no diacritic stripping here).

    Args:
        story: Full text of the story.
        title: Title of the story.

    Returns:
        The result of ``process_and_score_clauses`` unchanged, i.e. the
        ``(scores_matrix, clause_scores)`` pair.
    """
    # Stems of the words that make up the title
    title_stems = set(
        stem_tokens([word.text for sent in nlp(title).sentences for word in sent.words])
    )

    # Relative frequency of every stem in the story
    word_probabilities = calculate_word_probability(story)

    # Sentiment classifier applied to each clause
    sentiment_model = pipeline('sentiment-analysis', model='CAMeL-Lab/bert-base-arabic-camelbert-mix-sentiment')

    # Sentence segmentation happens first, then clause-level scoring
    return process_and_score_clauses(
        stanza_sentence_tokenize(story),
        title,
        nlp,
        sentiment_model,
        word_probabilities,
        title_stems,
        story,
    )

# Example usage: a sample Arabic story and its title, scored and ranked by
# the code at the end of this cell.
story = """
كان يا مكان في قديم الزمان، كان في غابة طاووس وفيل يتبادلان الكلام، وكل من الفيل والطاووس يظن أنه الأفضل من الآخر؛ فالطاووس يعتقد أنه هو الأفضل مفتخرا بجمال ريشه. بينما الفيل فيرى أنه هو الأحسن مغرورا بقوة جسمه وطول خرطومه.

ظل الفيل والطاووس يناقشان الأمر دون أن يستسلم أحدهما للآخر إلى أن أحسا بالجوع، فتوجه الفيل والطاووس معا إلى حظيرة قريبة من الغابة، ودخل الفيل والطاووس إليها عبر ثقب في الحائط. فأخذ الفيل والطاووس يأكلان بفرح وسرور ما طاب ولذ من الأكل.

وبعد انتهاء الفيل والطاووس من الأكل والشرب، تابع الفيل والطاووس جدالهما دون توقف، إلى أن فاجأهما صاحبا الحظيرة اللذان كانا يحملان بيدهما بندقية، فقال أحدهما للآخر: سوف نقتل الفيل لخطورته، ونترك الطاووس يعيش في الحظيرة لجمال ريشه. ثم رد الرجل الثاني قائلا: أنت على صواب.

سمع الفيل والطاووس ما قاله الرجلان، فنظر الفيل والطاووس إلى بعضهما نظرة الوداع. وقتذاك صوب الرجلان بندقيتهما باتجاه الفيل وأطلقا النار عليه في آن واحد دون إصابته لحسن حظه، ففر الفيل باتجاه الغابة.
"""

title = " الفيل الطاووس"

# Remove diacritics from the story once, before scoring. The same `story`
# value is tokenized again when the results are printed, so the clause
# strings used as score keys and the ones looked up later come from
# identical text.
story = strip_tashkeel(story)

# normalize the decision matrix
def normalize_matrix(matrix):
    """Vector-normalize every column of a decision matrix.

    Each column is divided by its Euclidean norm plus a small epsilon, so an
    all-zero column stays zero instead of producing a division by zero.

    Args:
        matrix: 2-D array-like of criterion scores, one row per alternative
            and one column per criterion. Lists and integer arrays are
            accepted.

    Returns:
        A float ``numpy.ndarray`` with the same shape as ``matrix``.
    """
    epsilon = 1e-10  # Small constant to prevent division by zero
    # Force a float array: with an integer input the result buffer would be
    # integer as well and every ratio would be truncated to 0.
    matrix = np.asarray(matrix, dtype=float)
    denominators = np.sqrt(np.sum(matrix**2, axis=0))

    # Broadcasting divides each column by its own denominator.
    return matrix / (denominators + epsilon)

# Calculate ideal and negative-ideal solutions
def calculate_ideal_negative_ideal(normalized_matrix):
    """Pick the best and the worst value of every criterion.

    Args:
        normalized_matrix: 2-D array of scores, one column per criterion.

    Returns:
        ``(positive_ideal_solution, negative_ideal_solution)``: the
        column-wise maxima and the column-wise minima.
    """
    # The maximum of each column is taken as the ideal value and the
    # minimum as the negative-ideal value.
    column_axis = 0
    positive_ideal_solution = np.max(normalized_matrix, axis=column_axis)
    negative_ideal_solution = np.min(normalized_matrix, axis=column_axis)

    print(f"positive_ideal_solution : {positive_ideal_solution} \n negative_ideal_solution : {negative_ideal_solution} \n")

    ideal_pair = (positive_ideal_solution, negative_ideal_solution)
    return ideal_pair

# Calculate separation measures for each alternative
def calculate_separation_measures(normalized_matrix, positive_ideal_solution, negative_ideal_solution):
    """Measure how far each alternative lies from the two reference points.

    Args:
        normalized_matrix: 2-D array, one row per alternative.
        positive_ideal_solution: Reference row holding the ideal values.
        negative_ideal_solution: Reference row holding the negative-ideal values.

    Returns:
        ``(positive_separation, negative_separation)``: for every row, the
        Euclidean distance to the positive and to the negative ideal.
    """
    def euclidean_distance_to(reference):
        # Row-wise distance between each alternative and the reference row
        offsets = normalized_matrix - reference
        return np.sqrt(np.sum(offsets**2, axis=1))

    positive_separation = euclidean_distance_to(positive_ideal_solution)
    negative_separation = euclidean_distance_to(negative_ideal_solution)

    print(f"positive_separation : {positive_separation} \n negative_separation : {negative_separation} \n")

    return positive_separation, negative_separation

# Calculate relative closeness to the ideal solution
def calculate_relative_closeness(positive_separation, negative_separation):
    """Compute each alternative's relative closeness to the ideal solution.

    The closeness is ``negative / (positive + negative)``: 1 when the
    alternative coincides with the ideal, 0 when it coincides with the
    negative ideal.

    Args:
        positive_separation: Distances to the positive ideal solution.
        negative_separation: Distances to the negative ideal solution.

    Returns:
        Float array of closeness values. An alternative whose two distances
        are both zero (e.g. a single alternative, or all rows identical)
        gets 0.0 instead of NaN.
    """
    positive_separation = np.asarray(positive_separation, dtype=float)
    negative_separation = np.asarray(negative_separation, dtype=float)
    total_separation = positive_separation + negative_separation

    # Skip the division where the denominator is zero so that 0/0 does not
    # produce NaN (which would make the later sorting unreliable).
    relative_closeness = np.divide(
        negative_separation,
        total_separation,
        out=np.zeros_like(total_separation),
        where=total_separation != 0,
    )

    print(f"relative_closeness :\n {relative_closeness} \n")

    return relative_closeness

# Perform TOPSIS ranking
def topsis_ranking(decision_matrix, clause_scores, weights=None):
    """Rank clauses with weighted TOPSIS.

    Args:
        decision_matrix: 2-D array-like of criterion scores, one row per
            clause and one column per criterion. A plain list of lists is
            accepted.
        clause_scores: Dict keyed by clause text. Only the keys and their
            order are used: the i-th key is paired with row i of
            ``decision_matrix``.
        weights: Optional sequence with one weight per criterion column.
            Defaults to 1/9 for every criterion except dissimilarity_score
            and pos_score (columns 5 and 6), which get 2/9. The weights are
            applied as given; they are not re-normalized.

    Returns:
        Dict mapping each clause to its relative closeness to the ideal
        solution. Empty when ``decision_matrix`` has no entries.

    Raises:
        ValueError: If the number of weights differs from the number of
            criterion columns.
    """
    # The scoring step builds the matrix as a list of lists that mixes
    # integer counts and floats; the helpers below need a float ndarray.
    decision_matrix = np.asarray(decision_matrix, dtype=float)
    if decision_matrix.size == 0:
        return {}

    # Normalize the decision matrix
    normalized_matrix = normalize_matrix(decision_matrix)

    # Apply weights to the criteria
    if weights is None:
        # 2/9 to the most important dissimilarity_score,pos_score
        weights = [1/9, 1/9, 1/9, 1/9, 1/9, 2/9, 2/9, 1/9]
    weights = np.asarray(weights, dtype=float)
    num_criteria = normalized_matrix.shape[-1]
    if weights.shape != (num_criteria,):
        raise ValueError(
            f"expected {num_criteria} weights (one per criterion), got shape {weights.shape}"
        )
    weighted_matrix = normalized_matrix * weights

    # Calculate ideal and negative-ideal solutions
    positive_ideal_solution, negative_ideal_solution = calculate_ideal_negative_ideal(weighted_matrix)

    # Calculate separation measures
    positive_separation, negative_separation = calculate_separation_measures(weighted_matrix, positive_ideal_solution, negative_ideal_solution)

    # Calculate relative closeness
    relative_closeness = calculate_relative_closeness(positive_separation, negative_separation)

    # Map relative closeness to clauses
    return {clause: relative_closeness[i] for i, clause in enumerate(clause_scores.keys())}



scores_matrix, clause_scores = calculate_clause_scores_arabic(story, title)
scores_matrix = np.array(scores_matrix)

topsis_scores = topsis_ranking(scores_matrix, clause_scores)

# Print sentences with their clauses and TOPSIS scores
sentences = stanza_sentence_tokenize(story)
for sentence in sentences:
    print("----------------------------------------------------")
    print(f"Sentence: {sentence}")
    clauses = sentence_tokenize(sentence)

    # Pair every clause with its TOPSIS score (0 when the clause has none,
    # e.g. a blank clause). A separate name is used so the `clause_scores`
    # dict returned by the scorer is not overwritten and the ranking call
    # above can be re-run.
    scored_clauses = [(clause, topsis_scores.get(clause, 0)) for clause in clauses]

    # Print the clauses in order of their TOPSIS scores, highest first
    for clause, score in sorted(scored_clauses, key=lambda pair: pair[1], reverse=True):
        print(f"  Clause: {clause}\n  TOPSIS Score: {score}\n")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json:   0%|   …

INFO:stanza:Loading these models for language: ar (Arabic):
| Processor | Package       |
-----------------------------
| tokenize  | padt          |
| mwt       | padt          |
| pos       | padt_charlm   |
| lemma     | padt_nocharlm |
| depparse  | padt_charlm   |
| ner       | aqmar_charlm  |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Loading: lemma
INFO:stanza:Loading: depparse
INFO:stanza:Loading: ner
INFO:stanza:Done loading processors!


Sentence: كان يا مكان في قديم الزمان، كان في غابة طاووس وفيل يتبادلان الكلام، وكل من الفيل والطاووس يظن أنه الأفضل من الآخر؛ فالطاووس يعتقد أنه هو الأفضل مفتخرا بجمال ريشه. بينما الفيل فيرى أنه هو الأحسن مغرورا بقوة جسمه وطول خرطومه.
  Clause: كان يا مكان في قديم الزمان،
  SVO Score: 0
  Title Word Score: 0
  NER Score: 0
  Sentiment Label: positive
  Sentiment Score: 0.558352530002594
  Noun Score: 0.008771929824561403
  Dissimilarity Score: 18.575757575757574
  POS Score: 2
  Normalized Length Score: 0.35294117647058826
  Overall Score: 21.495823212055317

  Clause: كان في غابة طاووس وفيل يتبادلان الكلام،
  SVO Score: 0
  Title Word Score: 1
  NER Score: 1
  Sentiment Label: neutral
  Sentiment Score: 0
  Noun Score: 0.005847953216374269
  Dissimilarity Score: 17.878987782083755
  POS Score: 2
  Normalized Length Score: 0.4117647058823529
  Overall Score: 22.29660044118248

  Clause: وكل من الفيل والطاووس يظن أنه الأفضل من الآخر؛ فالطاووس يعتقد أنه هو الأفضل مفتخرا بجمال ريشه.
  SVO 