In [8]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [9]:
base_url = "https://www.caasimada.net/category/wararka/"
MAX_PAGES = 5  # listing pages to crawl, starting at page 1
REQUEST_TIMEOUT = 30  # seconds; requests waits indefinitely when no timeout is given
headlines = []
article_contents = []


def _is_content_paragraph(paragraph):
    """Return False for <p> tags the scraper should leave out of the article text.

    A paragraph is skipped when it carries both the ``title`` and
    ``heading-typo`` classes, or when its parent carries ``bs--gdpr-low``.
    """
    own_classes = paragraph.get('class', [])
    parent_classes = paragraph.parent.get('class', [])
    # BeautifulSoup exposes `class` as a list of individual tokens, so the
    # two-word value must be tested token by token; comparing the whole string
    # "title heading-typo" against that list can never match.
    is_heading = 'title' in own_classes and 'heading-typo' in own_classes
    return not is_heading and 'bs--gdpr-low' not in parent_classes


# A single session reuses the underlying connection for every request.
with requests.Session() as session:
    for page_number in range(1, MAX_PAGES + 1):
        # Create the URL for the current page
        url = f"{base_url}page/{page_number}/"
        try:
            response = session.get(url, timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
        except requests.RequestException as error:
            # A failed listing page ends the crawl instead of parsing an error page.
            print(f"Stopping at listing page {page_number}: {error}")
            break
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find the headline news elements on the page
        headline_elements = soup.find_all('h3', class_='entry-title td-module-title')

        # If no headline elements found, exit the loop
        if not headline_elements:
            break

        for headline in headline_elements:
            link_element = headline.find('a')
            # Skip headlines without a usable link rather than raising KeyError.
            if not link_element or not link_element.get('href'):
                continue
            article_url = link_element['href']

            try:
                article_response = session.get(article_url, timeout=REQUEST_TIMEOUT)
                article_response.raise_for_status()
            except requests.RequestException as error:
                # One broken article must not abort the crawl or be stored as content.
                print(f"Skipping {article_url}: {error}")
                continue
            article_soup = BeautifulSoup(article_response.content, 'html.parser')

            article_text = ' '.join(
                paragraph.get_text().strip()
                for paragraph in article_soup.find_all('p')
                if _is_content_paragraph(paragraph)
            )
            # Headline and content are appended together so both lists stay aligned.
            if article_text:
                headlines.append(link_element.text.strip())
                article_contents.append(article_text)

In [13]:

# Pair every scraped headline with its article text, one row per article
data = dict(Headline=headlines, Content=article_contents)
df = pd.DataFrame(data)

# Persist the table as CSV; the row index is left out of the file
df.to_csv(path_or_buf='caasimadda_news.csv', index=False)


In [29]:
import pandas as pd
import numpy as np
import re
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
# NLTK resources used further down: 'punkt' for tokenization, 'wordnet' for lemmatization
for nltk_resource in ('punkt', 'wordnet'):
    nltk.download(nltk_resource)

# Scraped articles written by the export step (adjust the path if needed)
df = pd.read_csv('caasimadda_news.csv')
# Stopword list: read from the 'words' column of stopwords_list.csv
stopwords_df = pd.read_csv('stopwords_list.csv')
stopwords = stopwords_df['words'].tolist()

# Text preprocessing: normalise, tokenize, remove stopwords, lemmatize
def preprocess(text):
    """Normalise a piece of text and return it as a space-joined token string.

    Steps: split words glued together by '.' or ',', keep only ASCII letters,
    whitespace and dashes, lowercase, delete dashes, tokenize, drop tokens
    found in the module-level ``stopwords`` list, and lemmatize what is left.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned tokens joined by single spaces; may be an empty string.
    """
    # Lookarounds replace every separator sitting between two word characters,
    # including chains such as "a.b.c", which a consuming pattern would only
    # split at every second separator.
    text = re.sub(r'(?<=\w)[.,](?=\w)', ' ', text)
    # `flags` must be passed by keyword: the fourth positional argument of
    # re.sub is `count`, so passing the flags there only capped the number of
    # removals and never applied the flags.
    # This filter also removes every digit, so no separate number-stripping
    # step is needed.
    text = re.sub(r'[^a-zA-Z\s-]', '', text, flags=re.I | re.A)
    text = text.lower()
    text = text.replace('-', '')  # Remove dashes
    text = text.strip()

    # Tokenize the text
    tokens = word_tokenize(text)

    # Remove stopwords (the text is already lowercase at this point)
    tokens = [token for token in tokens if token not in stopwords]

    # Lemmatize the tokens
    # NOTE(review): WordNet is an English lexicon while the scraped headlines
    # are not English — confirm lemmatization is actually wanted here.
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    return ' '.join(tokens)  # Return a string for further processing

# Sentence Tokenization and Preprocessing of Each Sentence
def tokenize_and_preprocess_sentences(text):
    """Split text into sentences and preprocess each one.

    Sentences whose preprocessed form is empty are dropped from BOTH returned
    lists, so position ``i`` in the preprocessed list always refers to
    position ``i`` in the original list. Filtering only the preprocessed list
    would shift the indices and make a caller pick the wrong original sentence.

    Args:
        text: The document to split.

    Returns:
        A tuple ``(original_sentences, preprocessed_sentences)`` of two
        equally long, index-aligned lists.
    """
    kept_originals = []
    kept_preprocessed = []
    for sentence in sent_tokenize(text):
        cleaned = preprocess(sentence)
        if cleaned:
            kept_originals.append(sentence)
            kept_preprocessed.append(cleaned)
    return kept_originals, kept_preprocessed

# Constructing the TextRank similarity matrix
def build_similarity_matrix(preprocessed_sentences):
    """Return the pairwise cosine similarity of TF-IDF sentence vectors.

    Args:
        preprocessed_sentences: List of cleaned sentence strings.

    Returns:
        An ``(n, n)`` array for ``n`` sentences. It is ``(0, 0)`` for empty
        input and all zeros when no sentence yields a term the vectorizer
        can index.
    """
    if not preprocessed_sentences:
        return np.zeros((0, 0))  # Return an empty matrix if there are no sentences
    try:
        # The sparse TF-IDF matrix is passed on as is; no dense copy is needed.
        sentence_vectors = TfidfVectorizer().fit_transform(preprocessed_sentences)
    except ValueError:
        # Raised for an empty vocabulary, e.g. when every token is too short
        # for the default token pattern. With no shared terms there is no
        # similarity to report, so every pair scores zero instead of the
        # whole run crashing.
        sentence_count = len(preprocessed_sentences)
        return np.zeros((sentence_count, sentence_count))
    return cosine_similarity(sentence_vectors)

# TextRank Algorithm
def text_rank(similarity_matrix, damping=0.85, max_iter=100, tol=1e-6):
    """Score sentences with the weighted PageRank iteration used by TextRank.

    Each sentence spreads its score over the other sentences in proportion to
    their similarity. Self-similarity is ignored and every sentence's outgoing
    weights are normalised to sum to one; without that normalisation the
    scores grow on every round and the convergence check is never met.

    Args:
        similarity_matrix: Square ``(n, n)`` array of pairwise similarities.
        damping: Damping factor of the iteration.
        max_iter: Upper bound on the number of iterations.
        tol: Absolute per-score change below which iteration stops.

    Returns:
        A float array of ``n`` scores (empty when ``n`` is 0). A sentence with
        no similarity to any other sentence ends at ``1 - damping``.
    """
    # np.array copies, so the caller's matrix is not modified below.
    weights = np.array(similarity_matrix, dtype=float)
    sentence_count = weights.shape[0]
    if sentence_count == 0:
        return np.array([])  # Return an empty array if the similarity matrix is empty

    np.fill_diagonal(weights, 0.0)  # a sentence must not vote for itself
    out_weight = weights.sum(axis=1, keepdims=True)
    out_weight[out_weight == 0] = 1.0  # all-zero rows stay all-zero after dividing
    transition = weights / out_weight

    scores = np.ones(sentence_count)  # Initial score for each sentence
    for _ in range(max_iter):
        updated = (1 - damping) + damping * (transition.T @ scores)
        converged = np.allclose(scores, updated, rtol=0.0, atol=tol)
        scores = updated
        if converged:
            break
    return scores

# Generate summaries
def generate_summary(content, num_sentences=3):
    """Build an extractive summary from the highest-scoring sentences.

    Args:
        content: Article text. Anything that is not a ``str`` (for example the
            NaN pandas uses for a missing cell) produces an empty summary.
        num_sentences: Maximum number of sentences in the summary.

    Returns:
        The selected original sentences joined by spaces, highest score first,
        or ``''`` when nothing can be summarised.
    """
    if not isinstance(content, str) or num_sentences <= 0:
        return ''  # Return an empty summary for invalid content
    original_sentences, preprocessed_sentences = tokenize_and_preprocess_sentences(content)
    if not preprocessed_sentences:
        return ''  # Return an empty summary if there are no preprocessed sentences
    similarity_matrix = build_similarity_matrix(preprocessed_sentences)
    sentence_scores = text_rank(similarity_matrix)
    if len(sentence_scores) == 0:
        return ''  # Return an empty summary if there are no sentence scores
    # A stable sort makes ties deterministic: equal scores keep document order.
    ranked_sentence_indices = np.argsort(-sentence_scores, kind='stable')
    top_sentence_indices = ranked_sentence_indices[:num_sentences]
    return ' '.join(original_sentences[index] for index in top_sentence_indices)

# Applying the summarization
# Summarise every article; the result goes into a new 'Summary' column
summaries = df['Content'].apply(generate_summary)
df['Summary'] = summaries

# Preview the first rows: headline next to its summary
preview = df[['Headline', 'Summary']].head()
print(preview)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


                                            Headline  \
0  Muxuu R/wasaare ku-xigeenka kusoo arkay xerada...   
1  Xil. Abiib oo laga gudbiyey dacwad culus iyo x...   
2  Maxaan ka naqaanaa ninka Trump u noqonaya ku-x...   
3  Galmudug oo sheegtay danta laga lahaa hubkii l...   
4  Sidee ayey ku timid guusha uu ku faanay R/W Xa...   

                                             Summary  
0  Muqdisho (Caasimada Online) – Ra’iisul wasaare...  
1  Maxamed Cabdi Afyare, oo ah sharci-yaqaan ayaa...  
2  Markaas ka dib, Mr Vance ayaa bartiisa X ee ho...  
3  Dhuusamareeb (Caasimada Online) –Dowlad Gobole...  
4  Ra’iisul Wasaaraha oo shalay ka qeyb-galay mun...  


In [22]:
# Write the summarised table to disk. The row index is written as the first,
# unnamed column here, unlike the earlier export, which passes index=False.
df.to_csv('summarized1.csv', index=True)