In [1]:
import nltk
import re
import heapq  
import pickle
import pandas as pd
import numpy as np
from string import punctuation
from nltk.corpus import stopwords
punctuation = punctuation + '\n'
from nltk.stem.isri import ISRIStemmer
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import RidgeClassifier

In [2]:
categories = ['Finance', 'Medical', 'Politic', 'Religion', 'Sports', 'Tech']

In [59]:
def nltk_summarizer(input_text, number_of_sentence):
    stopWords = set(nltk.corpus.stopwords.words("arabic") + nltk.corpus.stopwords.words("english"))
    word_frequencies = {}  
    for word in nltk.word_tokenize(input_text):  
        if word not in stopWords:
            if word not in punctuation:
                if word not in word_frequencies.keys():
                    word_frequencies[word] = 1
                else:
                    word_frequencies[word] += 1

    maximum_frequncy = max(word_frequencies.values())

    for word in word_frequencies.keys():  
        word_frequencies[word] = (word_frequencies[word]/maximum_frequncy)

    sentence_list = nltk.sent_tokenize(input_text)
    sentence_scores = {}  
    for sent in sentence_list:  
        for word in nltk.word_tokenize(sent.lower()):
            if word in word_frequencies.keys():
                if len(sent.split(' ')) < 30:
                    if sent not in sentence_scores.keys():
                        sentence_scores[sent] = word_frequencies[word]
                    else:
                        sentence_scores[sent] += word_frequencies[word]

    summary_sentences = heapq.nlargest(number_of_sentence, sentence_scores, key=sentence_scores.get)

    summary = ' '.join(summary_sentences)  
    return summary

In [4]:
def delete_links(input_text):
    pettern  = r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))'''
    out_text = re.sub(pettern, ' ', input_text)
    return out_text

In [5]:
def replace_letters(input_text):
    replace = {"أ": "ا","ة": "ه","إ": "ا","آ": "ا","": ""}
    replace = dict((re.escape(k), v) for k, v in replace.items()) 
    pattern = re.compile("|".join(replace.keys()))
    out_text = pattern.sub(lambda m: replace[re.escape(m.group(0))], input_text)
    return out_text

In [6]:
# def clean_text(input_text):
#     replace = r'[/(){}\[\]|@âÂ,;\?\'\"\*…؟–’،!&\+-:؛-]'
#     out_text = re.sub(replace, " ", input_text)
#     words = nltk.word_tokenize(out_text)
#     words = [word for word in words if word.isalpha()]
#     out_text = ' '.join(words)
#     return out_text

In [26]:
def remove_vowelization(input_text):
    vowelization = re.compile(""" ّ|َ|ً|ُ|ٌ|ِ|ٍ|ْ|ـ""", re.VERBOSE)
    out_text = re.sub(vowelization, '', input_text)
    return out_text

In [8]:
def delete_stopwords(input_text):
    stop_words = set(nltk.corpus.stopwords.words("arabic") + nltk.corpus.stopwords.words("english"))
    tokenizer = nltk.tokenize.WhitespaceTokenizer()
    tokens = tokenizer.tokenize(input_text)
    wnl = nltk.WordNetLemmatizer()
    lemmatizedTokens =[wnl.lemmatize(t) for t in tokens]
    out_text = [w for w in lemmatizedTokens if not w in stop_words]
    out_text = ' '.join(out_text)
    return out_text

In [9]:
# def stem_text(input_text):
#     st = ISRIStemmer()
#     tokenizer = nltk.tokenize.WhitespaceTokenizer()
#     tokens = tokenizer.tokenize(input_text)
#     out_text = [st.stem(w) for w in tokens]
#     out_text = ' '.join(out_text)
#     return out_text

In [14]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sarahshehri/nltk_data...


True

In [16]:
import nltk
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/sarahshehri/nltk_data...


True

In [121]:
import os

# Define the directory path for reading articles
input_folder_path = '/Users/sarahshehri/Downloads/NEWS/Cleaned-Data/Tech'

# Define the directory path for writing summarized text
output_folder_path = '/Users/sarahshehri/Downloads/NEWS/HHHFFF'

# Check if the input directory exists
if os.path.exists(input_folder_path):
    # Check if the output directory exists, if not create it
    if not os.path.exists(output_folder_path):
        os.makedirs(output_folder_path)

    # List all files in the input directory
    files = os.listdir(input_folder_path)

    # Iterate over each file in the directory
    for file_name in files:
        file_path = os.path.join(input_folder_path, file_name)
        if file_path.endswith(".txt"):  # Consider only text files
            # Open the article file and read its contents
            with open(file_path, 'r', encoding='utf-8') as article_file:
                article_text = article_file.read()
                
                # Summarize the article text
                summarized_text = text_prepare(article_text)
                
                # Define the file path to store the summarized text
                summarized_file_path = os.path.join(output_folder_path, f'summarized_{file_name}')
                
                # Write the summarized text to the file
                with open(summarized_file_path, 'w', encoding='utf-8') as summarized_file:
                    summarized_file.write(summarized_text)
                    
                print(f"Summarized text for {file_name} has been saved to:", summarized_file_path)

else:
    print("Input directory not found:", input_folder_path)

Summarized text for 3644.txt has been saved to: /Users/sarahshehri/Downloads/NEWS/HHHFFF/summarized_3644.txt
Summarized text for 5235.txt has been saved to: /Users/sarahshehri/Downloads/NEWS/HHHFFF/summarized_5235.txt
Summarized text for 1053.txt has been saved to: /Users/sarahshehri/Downloads/NEWS/HHHFFF/summarized_1053.txt
Summarized text for 1735.txt has been saved to: /Users/sarahshehri/Downloads/NEWS/HHHFFF/summarized_1735.txt
Summarized text for 5553.txt has been saved to: /Users/sarahshehri/Downloads/NEWS/HHHFFF/summarized_5553.txt
Summarized text for 4895.txt has been saved to: /Users/sarahshehri/Downloads/NEWS/HHHFFF/summarized_4895.txt
Summarized text for 3122.txt has been saved to: /Users/sarahshehri/Downloads/NEWS/HHHFFF/summarized_3122.txt
Summarized text for 5547.txt has been saved to: /Users/sarahshehri/Downloads/NEWS/HHHFFF/summarized_5547.txt
Summarized text for 4881.txt has been saved to: /Users/sarahshehri/Downloads/NEWS/HHHFFF/summarized_4881.txt
Summarized text for

In [110]:
def text_prepare(input_text):
    out_text = delete_links(input_text)
#    out_text = clean_text(out_text)
    out_text = delete_stopwords(out_text)
    out_text = replace_letters(out_text)
    out_text = remove_vowelization(out_text)
 #   out_text = stem_text(out_text)

    return out_text

In [123]:
text = "دبي   الخليج:اعلنت مؤسسه الجليله عن فتح باب تقديم الطلبات للدوره الثانيه من برنامجي منح البحث الطبي والزماله البحثيه اللذين يهدفان الي الارتقاء بامكانات قطاع الرعايه الصحيه في دوله الامارات من خلال دعم علماء وباحثي الطب الحيوي الموهوبين من ذوي الكفاءه العاليه  وقال البروفيسور سهام الدين كلداري استاذ الكيمياء الحيويه والبيولوجيا الجزيئيه ورئيس لجنه الاستشارات العلميه في مؤسسه الجليله: بينما نمضي قدما في تطوير مركز ابحاث مؤسسه الجليله الذي يعد اول مركز ابحاث مستقل متعدد التخصصات في دوله الامارات فاننا نتطلع لدعم المع المواهب المحليه في مجال ابحاث الطب الحيوي وتمكينهم من الدراسه والتدريب في ارقي مؤسسات التعليم الطبي بما يساعدهم علي تحقيق التطور علي المستوين الشخصي والاكاديمي من جانبه قال الدكتور عبدالكريم سلطان العلماء الرئيس التنفيذي لمؤسسه الجليله: قدمنا خلال العام الماضي منحا بحثيه لسته عشر باحثا طموحا بعد عمليه اختيار ومراجعه موسعه قام بها خبراء من اشهر المؤسسات الطبيه العالميه ونحن سعداء بتقديم هذه المنح لمساعدتهم علي تكريس جهودهم وقتهم في سبيل اجراء ابحاث طبيه مبتكره تعود بالنفع علي قطاع الرعايه الصحيه محليا واقليميا  ومن خلال دعم الباحثين الموهوبين تواصل مؤسسه الجليله المضي قدما في تحقيق رسالتها الراميه الي تحسين حياه الافراد في دوله الامارات عبر ارساء الاسس لبيئه داعمه لابحاث الطب الحيوي وتعزيز امكاناتنا الطبيه المحليه بما يسهم في توفير علاجات متخصصه للمرضي داخل الدوله دون الحاجه للسفر للخارج"

In [124]:
from sumy.summarizers.lsa import LsaSummarizer as Summarizer
from sumy.nlp.tokenizers import Tokenizer
from sumy.parsers.plaintext import PlaintextParser

def lsa_summarizer(input_text, number_of_sentence):
    summarizer = Summarizer()
    parser = PlaintextParser.from_string(input_text, Tokenizer("arabic"))
    summary = summarizer(parser.document, number_of_sentence)
    return summary

# Example usage:
number_of_sentence = 2

lsa_summary = lsa_summarizer(text, number_of_sentence)
lsa_summary_text = ""
for sentence in lsa_summary:
    lsa_summary_text += str(sentence)

print("\nLSA Summary:")
print(lsa_summary_text)

# Convert the summary tuple to a string
lsa_summary_text = ' '.join([str(sentence) for sentence in lsa_summary])

# Split the summarized text into words and count the number of words
summarized_words = lsa_summary_text.split()
num_summarized_words = len(summarized_words)
print("Number of words in summarized text:", num_summarized_words)

# Split the original text into words and count the number of words
original_words = text.split()
num_original_words = len(original_words)
print("Number of words in original text:", num_original_words)
print(text)


LSA Summary:
دبي   الخليج:اعلنت مؤسسه الجليله عن فتح باب تقديم الطلبات للدوره الثانيه من برنامجي منح البحث الطبي والزماله البحثيه اللذين يهدفان الي الارتقاء بامكانات قطاع الرعايه الصحيه في دوله الامارات من خلال دعم علماء وباحثي الطب الحيوي الموهوبين من ذوي الكفاءه العاليه  وقال البروفيسور سهام الدين كلداري استاذ الكيمياء الحيويه والبيولوجيا الجزيئيه ورئيس لجنه الاستشارات العلميه في مؤسسه الجليله:قدمنا خلال العام الماضي منحا بحثيه لسته عشر باحثا طموحا بعد عمليه اختيار ومراجعه موسعه قام بها خبراء من اشهر المؤسسات الطبيه العالميه ونحن سعداء بتقديم هذه المنح لمساعدتهم علي تكريس جهودهم وقتهم في سبيل اجراء ابحاث طبيه مبتكره تعود بالنفع علي قطاع الرعايه الصحيه محليا واقليميا  ومن خلال دعم الباحثين الموهوبين تواصل مؤسسه الجليله المضي قدما في تحقيق رسالتها الراميه الي تحسين حياه الافراد في دوله الامارات عبر ارساء الاسس لبيئه داعمه لابحاث الطب الحيوي وتعزيز امكاناتنا الطبيه المحليه بما يسهم في توفير علاجات متخصصه للمرضي داخل الدوله دون الحاجه للسفر للخارج
Number of words in summarized text: 151

In [122]:
from sumy.summarizers.lsa import LsaSummarizer as Summarizer
from sumy.nlp.tokenizers import Tokenizer
from sumy.parsers.plaintext import PlaintextParser
import os

# Define the directory path for reading articles
input_folder_path = '/Users/sarahshehri/Downloads/NEWS/HHHFFF'

# Define the directory path for writing summarized text
output_folder_path = '/Users/sarahshehri/Downloads/NEWS/Summariz-folders/TechSummarizing'

# Example usage:
number_of_sentence = 2

# Check if the input directory exists
if os.path.exists(input_folder_path):
    # Check if the output directory exists, if not create it
    if not os.path.exists(output_folder_path):
        os.makedirs(output_folder_path)

    # List all files in the input directory
    files = os.listdir(input_folder_path)

    # Iterate over each file in the directory
    for file_name in files:
        file_path = os.path.join(input_folder_path, file_name)
        if file_path.endswith(".txt"):  # Consider only text files
            # Open the article file and read its contents
            with open(file_path, 'r', encoding='utf-8') as article_file:
                article_text = article_file.read()
                
                # Summarize the article text
                summary = lsa_summarizer(article_text, number_of_sentence)
                summarized_text = ""
                for sentence in summary:
                    summarized_text += str(sentence) + ' '

                # Define the file path to store the summarized text
                summarized_file_path = os.path.join(output_folder_path, f'summarized_{file_name}')
                
                # Write the summarized text to the file
                with open(summarized_file_path, 'w', encoding='utf-8') as summarized_file:
                    summarized_file.write(summarized_text)
                    
                print(f"Summarized text for {file_name} has been saved to:", summarized_file_path)

else:
    print("Input directory not found:", input_folder_path)

Summarized text for summarized_2987.txt has been saved to: /Users/sarahshehri/Downloads/NEWS/Summariz-folders/TechSummarizing/summarized_summarized_2987.txt
Summarized text for summarized_3441.txt has been saved to: /Users/sarahshehri/Downloads/NEWS/Summariz-folders/TechSummarizing/summarized_summarized_3441.txt
Summarized text for summarized_5030.txt has been saved to: /Users/sarahshehri/Downloads/NEWS/Summariz-folders/TechSummarizing/summarized_summarized_5030.txt
Summarized text for summarized_1256.txt has been saved to: /Users/sarahshehri/Downloads/NEWS/Summariz-folders/TechSummarizing/summarized_summarized_1256.txt
Summarized text for summarized_0148.txt has been saved to: /Users/sarahshehri/Downloads/NEWS/Summariz-folders/TechSummarizing/summarized_summarized_0148.txt
Summarized text for summarized_1530.txt has been saved to: /Users/sarahshehri/Downloads/NEWS/Summariz-folders/TechSummarizing/summarized_summarized_1530.txt
Summarized text for summarized_5756.txt has been saved to:

  warn(message % (words_count, sentences_count))


Summarized text for summarized_5719.txt has been saved to: /Users/sarahshehri/Downloads/NEWS/Summariz-folders/TechSummarizing/summarized_summarized_5719.txt
Summarized text for summarized_2076.txt has been saved to: /Users/sarahshehri/Downloads/NEWS/Summariz-folders/TechSummarizing/summarized_summarized_2076.txt
Summarized text for summarized_2710.txt has been saved to: /Users/sarahshehri/Downloads/NEWS/Summariz-folders/TechSummarizing/summarized_summarized_2710.txt
Summarized text for summarized_4361.txt has been saved to: /Users/sarahshehri/Downloads/NEWS/Summariz-folders/TechSummarizing/summarized_summarized_4361.txt
Summarized text for summarized_0107.txt has been saved to: /Users/sarahshehri/Downloads/NEWS/Summariz-folders/TechSummarizing/summarized_summarized_0107.txt
Summarized text for summarized_1219.txt has been saved to: /Users/sarahshehri/Downloads/NEWS/Summariz-folders/TechSummarizing/summarized_summarized_1219.txt
Summarized text for summarized_0113.txt has been saved to:

In [50]:
!pip install arabic_reshaper python-bidi

Collecting arabic_reshaper
  Downloading arabic_reshaper-3.0.0-py3-none-any.whl.metadata (12 kB)
Collecting python-bidi
  Downloading python_bidi-0.4.2-py2.py3-none-any.whl.metadata (4.6 kB)
Downloading arabic_reshaper-3.0.0-py3-none-any.whl (20 kB)
Downloading python_bidi-0.4.2-py2.py3-none-any.whl (30 kB)
Installing collected packages: arabic_reshaper, python-bidi
Successfully installed arabic_reshaper-3.0.0 python-bidi-0.4.2
