In [1]:
import csv
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from urllib.parse import urljoin
import time
import random

def extract_links(html):
    """Return the ``href`` of every clickable-card anchor found in ``html``.

    The markup is parsed with BeautifulSoup's ``html.parser``. Only ``<a>``
    tags that carry the ``u-clickable-card__link`` class and have an ``href``
    attribute are collected. The hrefs are returned in the order
    ``find_all`` yields them and exactly as written in the page, so they may
    be relative paths.
    """
    parsed_page = BeautifulSoup(html, 'html.parser')
    hrefs = []
    for anchor in parsed_page.find_all('a', class_='u-clickable-card__link', href=True):
        hrefs.append(anchor['href'])
    return hrefs

def click_see_more(driver, max_clicks=5):
    """Press the page's "show more" button up to ``max_clicks`` times.

    Each round waits up to 10 seconds for the button to become clickable,
    clicks it through JavaScript, and then sleeps 5 seconds so the newly
    loaded cards can render. Any exception (including the wait timing out
    because the button is gone) is printed and ends the clicking; nothing is
    re-raised and nothing is returned.
    """
    clicks_done = 0
    try:
        while clicks_done < max_clicks:
            wait = WebDriverWait(driver, 10)
            button = wait.until(
                EC.element_to_be_clickable(
                    (By.XPATH, '//button[@class="show-more-button big-margin"]')
                )
            )
            # A JavaScript click is used instead of the element's own click().
            driver.execute_script("arguments[0].click();", button)
            print("Clicked 'See More' button successfully")
            clicks_done += 1
            time.sleep(5)  # Give the page time to load the extra cards
    except Exception as e:
        print(f"Error: Could not find or click the 'See More' button: {e}")

def fetch_content(base_url, url, timeout=10):
    """Download one article page and return the text of its ``<p>`` tags.

    Args:
        base_url: URL that ``url`` is resolved against with ``urljoin``.
        url: Absolute or relative link to the article.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block forever on a stalled
            connection, which would freeze the whole scrape.

    Returns:
        The paragraph texts joined with newlines, or ``None`` when the page
        could not be fetched or parsed (the reason is printed and the link is
        skipped).
    """
    try:
        full_url = urljoin(base_url, url)
        response = requests.get(full_url, timeout=timeout)
        response.raise_for_status()  # Turn HTTP error statuses into exceptions
        soup = BeautifulSoup(response.text, 'html.parser')
        return '\n'.join(p.get_text() for p in soup.find_all('p'))
    except Exception as e:
        # Best-effort scraping: one bad link must not abort the whole run.
        print(f"Skipping {url}: {e}")
        return None


def score_ai_relevance(content):
    """Assign a pseudo-random AI-relevance score based on keyword hits.

    The number of distinct entries of ``ai_keywords`` that occur as
    substrings of ``content`` selects a band, and the score is drawn
    uniformly at random from that band:

    * more than 2 hits -> 7..10
    * 1 or 2 hits      -> 3..6
    * no hit           -> 0..2

    The original chain tested ``> 2``, ``< 2 and > 0`` and ``== 0``, so a
    text with exactly two hits matched no branch and was scored 0. Two hits
    now fall into the medium band.

    Args:
        content: Article text. ``None`` or an empty string is treated as a
            text with no keyword hits.

    Returns:
        An integer between 0 and 10. The value is random within its band, so
        repeated calls on the same text can return different scores.
    """
    ai_keywords = [
        'ذكاء اصطناعي', 'الذكاء الاصطناعي', 'للذكاء الاصطناعي', 'تعلم الآلة', 'تعلم الآلي', 'تعلم عميق', 'شبكة عصبية',
        'خوارزمية', 'تحليل البيانات', 'البيانات الضخمة', 'روبوتات', 'تشات جي بي تي', 'تقنية', 'برمجة الكمبيوتر',
        'الذكاء الصناعي', 'تعلم الآلات', 'الشبكات العصبية', 'تحليل البيانات الكبيرة', 'الذكاء الاصطناعي المتقدم',
         'الذكاء' ,'برمجة الشبكات العصبية', 'تقنيات الذكاء الاصطناعي', 'التعلم الآلي',
    ]

    text_lower = (content or '').lower()
    # Keywords overlap (several entries contain 'الذكاء'), so one phrase in
    # the text can account for more than one hit.
    nbr = sum(1 for keyword in ai_keywords if keyword in text_lower)

    if nbr > 2:
        return random.randint(7, 10)
    if nbr > 0:
        return random.randint(3, 6)
    return random.randint(0, 2)


def scrape_links(url, max_clicks=5, append=False):
    """Scrape article links from ``url`` and write their text and score to CSV.

    A Chrome WebDriver loads the page, the "show more" button is clicked up to
    ``max_clicks`` times, and every article link found is fetched, scored with
    ``score_ai_relevance`` and written to ``articles.csv`` as a
    ``Text``/``Score`` row.

    Args:
        url: Listing page to scrape; also the base for relative article links.
        max_clicks: Maximum number of "show more" clicks.
        append: When True, rows are appended to ``articles.csv`` and no header
            is written; otherwise the file is overwritten and starts with a
            header row.
    """
    # Initialize WebDriver
    driver = webdriver.Chrome()  # You may need to adjust this based on your WebDriver setup

    try:
        driver.get(url)
        time.sleep(2)  # Wait for page to load

        # Links visible before expanding the page
        links = extract_links(driver.page_source)

        # Click "See More" button to get additional links
        click_see_more(driver, max_clicks)

        # Links visible after expanding the page
        links.extend(extract_links(driver.page_source))

        # Both snapshots are kept, so a link present in both appears twice
        # in the combined list. Drop repeats (keeping first-seen order) so
        # no article is fetched and written more than once.
        links = list(dict.fromkeys(links))

        # Open the CSV file in append mode if specified, otherwise in write mode
        file_mode = 'a' if append else 'w'
        with open(r'articles.csv', file_mode, newline='', encoding='utf-8') as csvfile:
            fieldnames = ['Text', 'Score']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            if not append:
                writer.writeheader()  # Write header only if not appending

            for link in links:
                content = fetch_content(url, link)
                if content is not None:
                    # Generate a score for the content
                    score = score_ai_relevance(content)
                    print(f"Text Score: {score}")
                    print("Text from", link)
                    print(content)
                    print("-" * 50)
                    writer.writerow({'Text': content, 'Score': score})

    finally:
        driver.quit()  # Close the WebDriver even if scraping failed

# Example usage: scrape the tech section first, then the home page.
# The first call opens articles.csv in write mode (header included); the
# second call appends its rows to that same file.
url1 = "https://www.aljazeera.net/tech/"
max_clicks1 = 7
scrape_links(url=url1, max_clicks=max_clicks1, append=False)

url2 = "https://www.aljazeera.net/"
max_clicks2 = 5
scrape_links(url=url2, max_clicks=max_clicks2, append=True)


Clicked 'See More' button successfully
Clicked 'See More' button successfully
Clicked 'See More' button successfully
Clicked 'See More' button successfully
Clicked 'See More' button successfully
Clicked 'See More' button successfully
Clicked 'See More' button successfully
Text Score: 10
Text from /tech/2024/5/26/%d8%a7%d8%ae%d8%aa%d9%8a%d8%a7%d8%b1%d9%8a%d8%a9-%d8%a3%d9%88-%d9%87%d9%88%d8%a7%d9%8a%d8%a9-%d9%87%d9%83%d8%b0%d8%a7-%d9%8a%d8%aa%d8%b5%d9%88%d8%b1-%d8%a5%d9%8a%d9%84%d9%88%d9%86
في مؤتمر "فيفا تك 2024" الذي عقد في باريس، رسم إيلون ماسك صورة مستقبلية مثيرة لمجتمع جديد تنتهي فيه الوظائف التقليدية بفضل تطور الذكاء الاصطناعي والروبوتات.
وأوضح ماسك عبر مداخلة بالفيديو، أن الذكاء الاصطناعي قد يلغي الحاجة إلى العمل البشري مما يقودنا إلى تأسيس مجتمع يسوده الوفرة والحرية، وفقا لشبكة "سي إن إن".
وقدم إيلون ماسك رؤيته حول هذا التأثير الهائل للذكاء الاصطناعي خلال كلمته في مؤتمر "فيفا تك"، وصرح قائلا: "ربما لن يحظى أي منا بأي وظيفة"، متصورا مستقبلا تصبح فيه الوظائف اختيارية. وذكر أن البشر

In [1]:
import csv
import re
import string
from nltk.tokenize import word_tokenize
import stanza
import pandas as pd

# Initialize Stanza pipeline for lemmatization.
# stanza.download is called on every run of this cell before the Arabic
# pipeline (tokenize, mwt, pos, lemma processors) is built.
stanza.download('ar')
nlp = stanza.Pipeline(lang='ar', processors='tokenize,mwt,pos,lemma')

# Read the stop words file: every line of list.txt becomes one entry of the
# stop-word set used to filter lemmas later on.
with open('list.txt', 'r', encoding='utf-8') as file:
    stop_words = set(file.read().splitlines())

# Functions for text preprocessing
def normalize_arabic(text):
    """Collapse Arabic letter variants onto one canonical form each.

    The substitutions are applied in the order listed below: the alef
    variants become a bare alef, alef maqsura becomes ya, the hamza carriers
    (waw and ya forms) become a stand-alone hamza, ta marbuta becomes ha,
    and gaf becomes kaf. All other characters are left unchanged.
    """
    substitutions = (
        ("[إأآا]", "ا"),
        ("ى", "ي"),
        ("ؤ", "ء"),
        ("ئ", "ء"),
        ("ة", "ه"),
        ("گ", "ك"),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

def remove_diacritics(text):
    """Strip Arabic short-vowel marks and the tatweel from ``text``.

    The pattern is written in verbose mode, so the whitespace and the
    ``#`` comments inside it are ignored; only the listed marks are matched
    and each match is replaced with the empty string.
    """
    diacritics_pattern = re.compile("""
                                     ّ    | # Tashdid
                                     َ    | # Fatha
                                     ً    | # Tanwin Fath
                                     ُ    | # Damma
                                     ٌ    | # Tanwin Damm
                                     ِ    | # Kasra
                                     ٍ    | # Tanwin Kasr
                                     ْ    | # Sukun
                                     ـ     # Tatwil/Kashida
                                 """, re.VERBOSE)
    return diacritics_pattern.sub('', text)

def remove_punctuations(text):
    """Delete Arabic and ASCII punctuation characters from ``text``.

    The removal set is the hand-written list of Arabic/typographic marks
    below plus everything in ``string.punctuation``; every other character
    is kept as is.
    """
    arabic_marks = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
    unwanted = arabic_marks + string.punctuation
    # Mapping a character to None in a translation table deletes it.
    deletion_table = str.maketrans(dict.fromkeys(unwanted))
    return text.translate(deletion_table)

def remove_repeating_char(text):
    """Collapse every run of one repeated character down to a single copy.

    ``.`` does not match a newline, so consecutive newlines are left as they
    are; any other character repeated back-to-back is reduced to one.
    """
    repeated_run = re.compile(r'(.)\1+')
    return repeated_run.sub(r'\1', text)

# Load the CSV file into a DataFrame
df = pd.read_csv('articles.csv')

# Process each document in the DataFrame
lemmatized_texts = []
# An empty Text cell is read back from the CSV as NaN (a float), which would
# crash the regex-based cleaners, so missing texts are treated as ''.
for raw_text in df['Text'].fillna('').astype(str):
    cleaned_text = normalize_arabic(raw_text)
    cleaned_text = remove_punctuations(cleaned_text)
    cleaned_text = remove_repeating_char(cleaned_text)

    # Nothing left to lemmatize: keep row alignment with an empty result.
    if not cleaned_text.strip():
        lemmatized_texts.append('')
        continue

    # Lemmatization (fall back to the surface form if a lemma is missing,
    # so remove_diacritics never receives None)
    doc = nlp(cleaned_text)
    lemmatized_tokens = [word.lemma or word.text for sentence in doc.sentences for word in sentence.words]

    # Remove diacritics from lemmatized tokens
    lemmatized_tokens_without_diacritics = [remove_diacritics(token) for token in lemmatized_tokens]

    # Remove stop words
    # NOTE(review): stop_words is used exactly as read from list.txt while the
    # text has been through normalize_arabic - confirm the stop-word list is
    # written in the same normalized spelling, otherwise entries will not match.
    lemmatized_tokens_without_stopwords = [word for word in lemmatized_tokens_without_diacritics if word not in stop_words]

    # Join tokens back to a single string
    lemmatized_text = ' '.join(lemmatized_tokens_without_stopwords)
    lemmatized_texts.append(lemmatized_text)

# Add the lemmatized text to the DataFrame
df['Lemmatized_Text'] = lemmatized_texts

# Write the result back so the message below is true; previously the new
# column existed only in memory and articles.csv was never updated.
df.to_csv('articles.csv', index=False, encoding='utf-8')

print("Text preprocessing completed and updated in articles.csv")


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-05-26 19:26:36 INFO: Downloaded file to C:\Users\IMANE\stanza_resources\resources.json
2024-05-26 19:26:36 INFO: Downloading default packages for language: ar (Arabic) ...
2024-05-26 19:26:39 INFO: File exists: C:\Users\IMANE\stanza_resources\ar\default.zip
2024-05-26 19:26:45 INFO: Finished downloading models and saved to C:\Users\IMANE\stanza_resources
2024-05-26 19:26:45 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-05-26 19:26:49 INFO: Downloaded file to C:\Users\IMANE\stanza_resources\resources.json
2024-05-26 19:26:50 INFO: Loading these models for language: ar (Arabic):
| Processor | Package       |
-----------------------------
| tokenize  | padt          |
| mwt       | padt          |
| pos       | padt_charlm   |
| lemma     | padt_nocharlm |

2024-05-26 19:26:50 INFO: Using device: cpu
2024-05-26 19:26:50 INFO: Loading: tokenize
2024-05-26 19:26:53 INFO: Loading: mwt
2024-05-26 19:26:53 INFO: Loading: pos
2024-05-26 19:26:54 INFO: Loading: lemma
2024-05-26 19:26:54 INFO: Done loading processors!


Text preprocessing completed and updated in articles.csv


In [5]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences



# Prepare input and output data
# Relies on `df` (with its 'Lemmatized_Text' column) created by the
# preprocessing cell earlier in the notebook.
texts = df['Lemmatized_Text'].astype(str).tolist()  # Convert to list of strings
labels = df['Score'].values  # Use the score as the label for regression

# Tokenization and padding
max_words = 10000  # passed to Tokenizer(num_words=...) and later used as the embedding vocab size
max_len = 100  # passed to pad_sequences(maxlen=...); also the model's input length

# NOTE(review): the tokenizer is fitted on all texts before the train/test
# split below, so the vocabulary also reflects the test documents.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
data = pad_sequences(sequences, maxlen=max_len)

# Split data into training and test sets (80/20, fixed seed for repeatability)
x_train, x_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=42)

AttributeError: module 'tensorflow.core.framework.types_pb2' has no attribute 'SerializedDType'

In [6]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, GRU, LSTM, Dense, Bidirectional, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Define a function to build models
def build_model(model_type, input_length, vocab_size, embedding_dim, units):
    """Build and compile a recurrent regression model.

    The network is Embedding -> one recurrent layer selected by
    ``model_type`` -> Dense(units, relu) -> Dropout(0.5) -> Dense(1), and it
    is compiled with the Adam optimizer, mean-squared-error loss and
    mean-absolute-error as a metric.

    Args:
        model_type: One of 'RNN', 'Bidirectional RNN', 'GRU',
            'Bidirectional GRU', 'LSTM', 'Bidirectional LSTM'.
        input_length: Length of each input sequence.
        vocab_size: Size of the embedding vocabulary.
        embedding_dim: Dimension of the embedding vectors.
        units: Width of the recurrent layer and of the hidden Dense layer.

    Returns:
        The compiled Keras ``Sequential`` model.

    Raises:
        ValueError: If ``model_type`` is not one of the supported names.
            Previously an unknown name silently produced a network with no
            recurrent layer at all.
    """
    supported_types = (
        'RNN', 'Bidirectional RNN', 'GRU', 'Bidirectional GRU', 'LSTM', 'Bidirectional LSTM',
    )
    # Validate first so a typo fails fast instead of training the wrong model.
    if model_type not in supported_types:
        raise ValueError(
            f"Unknown model_type {model_type!r}; expected one of {', '.join(supported_types)}"
        )

    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=input_length))
    if model_type == 'RNN':
        model.add(SimpleRNN(units))
    elif model_type == 'Bidirectional RNN':
        model.add(Bidirectional(SimpleRNN(units)))
    elif model_type == 'GRU':
        model.add(GRU(units))
    elif model_type == 'Bidirectional GRU':
        model.add(Bidirectional(GRU(units)))
    elif model_type == 'LSTM':
        model.add(LSTM(units))
    elif model_type == 'Bidirectional LSTM':
        model.add(Bidirectional(LSTM(units)))
    model.add(Dense(units, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1))  # Output layer for regression
    model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mean_absolute_error'])
    return model

# Hyperparameters
embedding_dim = 100
units = 128
batch_size = 32
epochs = 10

# Create a dictionary to store models and their histories (keyed by model type)
models = {}
histories = {}

# List of model types to train
model_types = ['RNN', 'Bidirectional RNN', 'GRU', 'Bidirectional GRU', 'LSTM', 'Bidirectional LSTM']

# Train and evaluate each model.
# Uses max_len, max_words, x_train, y_train, x_test and y_test from the
# tokenization cell, so that cell must have run successfully first.
for model_type in model_types:
    print(f'Training {model_type} model...')
    model = build_model(model_type, max_len, max_words, embedding_dim, units)
    # Stop when val_loss has not improved for 3 epochs and keep the best weights.
    early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
    # 20% of the training data is held out as the validation set monitored above.
    history = model.fit(x_train, y_train, validation_split=0.2, epochs=epochs, batch_size=batch_size, callbacks=[early_stopping])
    
    # Store the model and its history
    models[model_type] = model
    histories[model_type] = history

    # Evaluate the model on the held-out test split
    print(f'Evaluating {model_type} model...')
    y_pred = model.predict(x_test)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    print(f'MSE for {model_type} model: {mse}')
    print(f'MAE for {model_type} model: {mae}')

AttributeError: module 'tensorflow.core.framework.types_pb2' has no attribute 'SerializedDType'

In [7]:
from nltk.translate.bleu_score import sentence_bleu
import nltk
nltk.download('punkt')

# NOTE(review): `generated_texts` and `reference_texts` are not defined
# anywhere in the visible notebook, so this cell raises NameError on a fresh
# kernel. They must be created by an earlier cell before this one can run.

# Tokenize the texts (you might need to adjust this to fit your exact text structure)
generated_texts_tokenized = [nltk.word_tokenize(text) for text in generated_texts]
# Each entry becomes a one-element list of tokenized references, which is the
# shape sentence_bleu expects for its `references` argument.
# presumably every item of reference_texts is a sequence whose first element
# is the reference string - TODO confirm against the cell that builds it
reference_texts_tokenized = [[nltk.word_tokenize(text[0])] for text in reference_texts]

# Calculate BLEU score for each generated text
bleu_scores = []
for gen_text_tokens, ref_text_tokens in zip(generated_texts_tokenized, reference_texts_tokenized):
    # ref_text_tokens is already a list of references; wrapping it in another
    # list (as before) made the "tokens" of the reference be whole lists.
    bleu_score = sentence_bleu(ref_text_tokens, gen_text_tokens)
    bleu_scores.append(bleu_score)

# Calculate average BLEU score (0.0 when there is nothing to score, instead
# of dividing by zero)
avg_bleu_score = sum(bleu_scores) / len(bleu_scores) if bleu_scores else 0.0
print("Average BLEU Score:", avg_bleu_score)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\IMANE\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


NameError: name 'generated_texts' is not defined