In [None]:
pip install beautifulsoup4


In [None]:
pip install nltk


In [26]:
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, GRU, Dense


nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")

# Function to preprocess text
def preprocess_text(text):
    # Tokenization
    tokens = word_tokenize(text)

    # Remove stopwords
    stop_words = set(stopwords.words("arabic"))
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]

    # Stemming
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in stemmed_tokens]

    # Discretization (if needed)

    return " ".join(lemmatized_tokens)

# List of URLs to scrape
urls = [
    "https://khiyam.com/news/article.php?articleID=4880",
    "https://mawdoo3.com/%D8%A3%D8%B3%D8%A6%D9%84%D8%A9_%D8%AB%D9%82%D8%A7%D9%81%D9%8A%D8%A9_%D9%85%D9%86%D9%88%D8%B9%D8%A9",
    "https://mawdoo3.com/%D8%A3%D8%B3%D8%A6%D9%84%D8%A9_%D8%B9%D8%A7%D9%85%D8%A9_%D8%B3%D9%87%D9%84%D8%A9_%D9%85%D8%B9_%D8%A7%D9%84%D8%A3%D8%AC%D9%88%D8%A8%D8%A9"
]

# Initialize an empty list to store the dataset
preprocessed_dataset = []

# Iterate over each URL
for url in urls:
    try:
        # Make the request
        response = requests.get(url)
        response.raise_for_status()  # Raise an HTTPError for bad responses

        # Check the HTTP status code
        if response.status_code == 200:
            print(f"Request for {url} was successful.")

            # Create a BeautifulSoup object
            soup = BeautifulSoup(response.text, "html.parser")

            # Extract text from <p> tags
            text_data = [paragraph.get_text() for paragraph in soup.find_all("p")]

            # Verify the extracted data
            print("Extracted Text Data:")
            print(text_data)

            # Add the preprocessed data to the dataset
            preprocessed_dataset.extend([(preprocess_text(text), 6) for text in text_data])

        else:
            print(f"Request for {url} failed with status code {response.status_code}.")

    except requests.RequestException as e:
        print(f"Error during scraping {url}: {e}")

# Display the final preprocessed dataset
print("Final Preprocessed Dataset:")
print(preprocessed_dataset)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\damia\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\damia\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\damia\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Request for https://khiyam.com/news/article.php?articleID=4880 was successful.
Extracted Text Data:
['\nحسين الدقور 21\\10\\2008\n\n', 'نجيب محفوظ: الأديب العربي الذي نال جائزة نوبل للآداب عام 1988م', '', '\nالسلام عليكم ورحمة الله ', '\n', 'كنت اتصفح ولفت نظري هذه المعلومات العامة واحببت ان أعممها لما في ذلك من فائدة وكما يقول المثل: "العلم ان لم ينفع لا يضرّ". ', 'لهذا السبب احببت ارسلها للموقع.\n', '\n', 'مع اطيب تحياتي وتمنياتي للجميع بالتوفيق\n', ' ', 'اخوكم المخلص حســــين الدقور (ابو علي)\n', '--------------------------------------\n', '\n', '1- من هو الأديب العربي الذي نال جائزة نوبل للآداب عام 1988م؟\n', 'نجيب محفوظ.\n', '2- ما المقصود بالعنفقه؟\n', 'الشعر أسفل الشفه وفوق الذقن.\n', '3- كم يبلغ العدد من الرهط؟\n', 'من 10 إلى 30 فرد\n', '4- من هي خطيبة النساء وهي ثاني إمرأة بايعت الرسول صلى الله عليه وسلم في بيعة العقبة الثانية؟\n', 'أسماء بنت يزيد بن السكن الأنصاريه.\n', '5- على أي شيء أطلق العرب اسم الفرصاد؟\n', 'التوت.\n', '6- ماذا تعني الكلمة التالية: أفلاطون؟\n', 'الشخص عر

In [27]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, GRU, Dense
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Split the dataset into training and testing sets
train_data, test_data = train_test_split(preprocessed_dataset, test_size=0.2, random_state=42)


# Convert text data to sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text for text, _ in train_data])
X_train = tokenizer.texts_to_sequences([text for text, _ in train_data])
X_test = tokenizer.texts_to_sequences([text for text, _ in test_data])

# Pad sequences to have consistent length
X_train = pad_sequences(X_train, maxlen=max_length, padding='post')
X_test = pad_sequences(X_test, maxlen=max_length, padding='post')

# Define the model
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length))
model.add(RNN/Bidirectional/GRU/LSTM(units=128, activation='relu'))
model.add(Dense(units=1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

# Evaluate the model
y_pred = model.predict(X_test)
y_pred_binary = (y_pred > 0.5).astype(int)
print("Accuracy:", accuracy_score(y_test, y_pred_binary))
print(classification_report(y_test, y_pred_binary))


NameError: name 'Tokenizer' is not defined