# Scraping Misbar

- The goal of this notebook is to scrape fact-checked news related to Sudan from the fact-checking website Misbar.
- The URLs of the required news items were pre-collected from the website using the Instant Data Scraper extension.

## Imported libraries

In [2]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import time
import numpy as np
import json
import re

## Loading the file that contains the news URLs

In [22]:
# Read the pre-collected list of Misbar article URLs.
urls_df = pd.read_csv("misbar_urls.csv")

# The row count of the URL table is reported as the number of news items.
num_news = len(urls_df)
print("number of news:", num_news)

urls_df.head()

number of news: 63


Unnamed: 0,news_url,image_url
0,https://www.misbar.com/factcheck/2025/08/28/%D...,https://assets.misbar.com/styles/large_425x240...
1,https://www.misbar.com/factcheck/2025/08/26/%D...,https://assets.misbar.com/styles/small_scale_1...
2,https://www.misbar.com/factcheck/2025/08/23/%D...,https://assets.misbar.com/styles/large_425x240...
3,https://www.misbar.com/factcheck/2025/08/19/%D...,https://assets.misbar.com/styles/small_scale_1...
4,https://www.misbar.com/factcheck/2025/08/07/%D...,https://assets.misbar.com/styles/large_425x240...


## Assign an ID number to each news item

In [23]:
# Sequential 1-based identifiers, one per row of the URL table.
news_id = np.arange(num_news) + 1
# Attach the ids and move them to the front of the column order.
urls_df = urls_df.assign(news_id=news_id)[["news_id", "news_url", "image_url"]]
urls_df.head()

Unnamed: 0,news_id,news_url,image_url
0,1,https://www.misbar.com/factcheck/2025/08/28/%D...,https://assets.misbar.com/styles/large_425x240...
1,2,https://www.misbar.com/factcheck/2025/08/26/%D...,https://assets.misbar.com/styles/small_scale_1...
2,3,https://www.misbar.com/factcheck/2025/08/23/%D...,https://assets.misbar.com/styles/large_425x240...
3,4,https://www.misbar.com/factcheck/2025/08/19/%D...,https://assets.misbar.com/styles/small_scale_1...
4,5,https://www.misbar.com/factcheck/2025/08/07/%D...,https://assets.misbar.com/styles/large_425x240...


In [24]:
# Overwrite misbar_urls.csv with the version that carries the news_id column.
urls_df.to_csv(path_or_buf="misbar_urls.csv", index=False)

## Load the url file

In [3]:
# Reload the URL table (including news_id) from disk.
urls_df = pd.read_csv(filepath_or_buffer="misbar_urls.csv")

## Scraping Misbar

Define a function that scrapes a single news item.

In [14]:

def scrape_news(news_id, url, error_log_file, timeout=30):
    """Download one Misbar page and extract its fact-check fields.

    Parameters
    ----------
    news_id
        Identifier copied into every returned record and into log lines.
    url : str
        Address of the page to download.
    error_log_file
        Writable text file-like object. A line ``[<KIND>_ERROR] <news_id>``
        is written whenever the page is skipped.
    timeout : float, optional
        Seconds handed to ``requests.get`` so that a stalled connection
        cannot block the whole scraping run. Defaults to 30.

    Returns
    -------
    tuple
        ``(news_info, accounts_list)`` on success. ``news_info`` is a dict
        with the keys ``news_id``, ``title``, ``description``,
        ``claim_reviewed``, ``type``, ``label`` and ``date``;
        ``accounts_list`` is a (possibly empty) list of
        ``{'news_id': ..., 'accounts': <href>}`` dicts.

        ``(None, None)`` when the page is skipped: the request failed or
        returned an HTTP error status, or the title, label or publication
        date could not be found.

    Notes
    -----
    ``description``, ``claim_reviewed`` and ``type`` are optional; they keep
    the placeholder ``'undefined'`` when the page does not provide them.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Without this an HTTP error page would be parsed like an article and
        # misreported as a missing title.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'lxml')

        news_info = {
            'news_id': news_id,
            'title': 'undefined',
            'description': 'undefined',
            'claim_reviewed': 'undefined',
            'type': 'undefined',
            'label': 'undefined',
            'date': 'undefined'
        }
        accounts_list = []

        # TITLE (required)
        title_tag = soup.find('div', class_="blog--article_title")
        if title_tag is None:
            error_log_file.write(f"[TITLE_ERROR] {news_id}\n")
            return None, None  # Skip news
        news_info['title'] = title_tag.text.strip()

        # DESCRIPTION (optional, absence is not logged)
        story_div = soup.find('div', class_="deep-dive--article_story")
        text_div = None
        if story_div is not None:
            text_div = story_div.find('div', class_="section-text")
        if text_div is not None:
            news_info['description'] = text_div.text.strip()

        # LABEL (required)
        label_tag = soup.find('div', class_="deep-dive--article_classification")
        if label_tag is None:
            error_log_file.write(f"[LABEL_ERROR] {news_id}\n")
            return None, None  # Skip news
        news_info['label'] = label_tag.text.strip()

        # JSON meta (ld+json): publication date is required, claim is optional
        meta_info = {}
        ld_script = soup.find('script', type='application/ld+json')
        if ld_script is not None:
            try:
                parsed_meta = json.loads(ld_script.text)
            except ValueError:
                parsed_meta = None
            if isinstance(parsed_meta, dict):
                meta_info = parsed_meta

        published = meta_info.get("datePublished")
        # Keep only the leading YYYY-MM-DD part of the timestamp.
        news_info['date'] = published[:10] if isinstance(published, str) else ''
        news_info['claim_reviewed'] = meta_info.get("claimReviewed", 'undefined')
        if not news_info['date']:
            error_log_file.write(f"[DATE_ERROR] {news_id}\n")
            return None, None  # Skip news

        # SUBSECTION (type): optional, read from the first {...} literal of the
        # dataLayer script after converting its single quotes to JSON quotes.
        layer_script = soup.find('script', id="dataLayerScript")
        layer_match = None
        if layer_script is not None:
            layer_match = re.search(r"\{([^}]*)\}", layer_script.text)
        if layer_match is not None:
            try:
                data = json.loads('{' + layer_match.group(1).replace("'", '"') + '}')
            except ValueError:
                data = {}
            news_info['type'] = data.get("subsection", 'undefined')

        # ACCOUNTS: optional; href=True skips anchors without a link so one
        # bare <a> no longer discards the remaining links of the page.
        accounts_div = soup.find('div', class_="deep-dive--article_posted-on")
        if accounts_div is not None:
            for anchor in accounts_div.find_all('a', href=True):
                accounts_list.append({'news_id': news_id, 'accounts': anchor['href']})

        return news_info, accounts_list

    except Exception as e:
        # Network failures, HTTP errors and unexpected parsing problems.
        error_log_file.write(f"[GENERAL_ERROR] {news_id}: {str(e)}\n")
        return None, None  # Skip news on any major error


Scrape every news item listed in `urls_df`.

In [15]:
all_news_info = []
all_accounts = []

# Pause between consecutive requests, in seconds, to stay polite to the site.
REQUEST_DELAY_SECONDS = 5

# "w" starts a fresh log on every run: the result lists above are reset each
# time this cell runs, so entries left over from earlier runs would not match
# the records collected here.
with open("error_log.txt", "w", encoding="utf-8") as error_log_file:
    for row_index, row in urls_df.iterrows():
        news_info, accounts_list = scrape_news(row['news_id'], row["news_url"], error_log_file)

        if news_info is not None and accounts_list is not None:
            all_news_info.append(news_info)
            all_accounts.extend(accounts_list)
            print(f"Done from scraping news with id = {row['news_id']}\n")

        # Wait after every request, including skipped pages, so a streak of
        # failures does not turn into a burst of back-to-back requests.
        time.sleep(REQUEST_DELAY_SECONDS)


Done from scraping news with id = 1

Done from scraping news with id = 2

Done from scraping news with id = 3

Done from scraping news with id = 4

Done from scraping news with id = 5

Done from scraping news with id = 6

Done from scraping news with id = 7

Done from scraping news with id = 8

Done from scraping news with id = 9

Done from scraping news with id = 10

Done from scraping news with id = 11

Done from scraping news with id = 12

Done from scraping news with id = 13

Done from scraping news with id = 14

Done from scraping news with id = 15

Done from scraping news with id = 16

Done from scraping news with id = 17

Done from scraping news with id = 18

Done from scraping news with id = 19

Done from scraping news with id = 20

Done from scraping news with id = 21

Done from scraping news with id = 22

Done from scraping news with id = 23

Done from scraping news with id = 24

Done from scraping news with id = 25

Done from scraping news with id = 26

Done from scraping ne

In [19]:
# Tabulate the scraped records: one frame for the news items, one for the links.
news_df = pd.DataFrame(data=all_news_info)
accounts_df = pd.DataFrame(data=all_accounts)

print("number of scraped news = ", len(news_df))
print("number of accounts collected = ", len(accounts_df))

number of scraped news =  59
number of accounts collected =  314


In [21]:
# Preview the first rows of news_df
news_df.head(n=5)

Unnamed: 0,news_id,title,description,claim_reviewed,type,label,date
0,1,الفيديو قديم وليس لمظاهرة سياسية نُظمت خلال اح...,مقطع فيديو نشرته حسابات وصفحات على مواقع التوا...,فيديو لحشود في السودان خرجت تحتفل بالمولد النب...,سياسة,مضلل,2025-08-28
1,2,الفيديو ليس لمرتزق كولومبي يقاتل في صفوف الدعم...,مقطع فيديو متداول عبر حسابات وصفحات على مواقع ...,مقطع فيديو متداول على أنه يُظهر مرتزقًا كولومب...,أخبار,مضلل,2025-08-26
2,3,الفيديو ليس لمرتزقة كولومبيين يشاركون في معارك...,مقطع فيديو نشرته حسابات وصفحات على مواقع التوا...,فيديو يُظهر مرتزقة من دولة كولومبيا يقاتلون مع...,أخبار,مضلل,2025-08-23
3,4,الفيديو ليس لمرتزقة كولومبيين يقاتلون في صفوف ...,مقطع فيديو متداول، حديثًا، ادعي ناشروه أنه لمر...,مرتزقة كولومبيون يقاتلون في صفوف الدعم السريع ...,أخبار,مضلل,2025-08-19
4,5,الصورة من استهداف إسرائيلي في مطار صنعاء وليست...,صورة متداولة حديثًا على موقعي فيسبوك وإكس، ادع...,صورة توثق تدمير سلاح الجو السوداني طائرة عسكري...,أخبار,مضلل,2025-08-07


In [24]:
# Preview the first rows of accounts_df
accounts_df.head(n=5)

Unnamed: 0,news_id,accounts
0,1,https://www.facebook.com/reel/1303881981474509
1,1,https://x.com/Dr_AmeenMakki/status/19600442747...
2,1,https://www.facebook.com/100066322586674/video...
3,1,https://www.facebook.com/reel/613876958265329
4,1,https://www.facebook.com/reel/760527776622792


In [27]:
# function returns number of undefined values in a specific column

def count_undefined(column):
    """Count the entries of ``column`` equal to the placeholder 'undefined'.

    Parameters
    ----------
    column
        A pandas Series or NumPy array that supports ``== 'undefined'``
        element-wise.

    Returns
    -------
    int
        Number of elements equal to the string ``'undefined'``.
    """
    is_undefined = (column == 'undefined')
    # Count the True entries of the mask. Summing the output of np.where would
    # add up the positions of the matches instead of counting them.
    return int(np.count_nonzero(is_undefined))

In [34]:
# Report, column by column, how many placeholder values remain in news_df.
columns_names = news_df.columns
for column in columns_names:
    undefined_total = count_undefined(news_df[column])
    print(f'number of undefined values in {column} column = ', undefined_total)

number of undefined values in news_id column =  0
number of undefined values in title column =  0
number of undefined values in description column =  0
number of undefined values in claim_reviewed column =  0
number of undefined values in type column =  0
number of undefined values in label column =  0
number of undefined values in date column =  0


## Saving news info and accounts

In [36]:
# Write both result tables to CSV; "utf-8-sig" adds a BOM to the files.
for frame, csv_path in (
    (news_df, "news_info_Misbar.csv"),
    (accounts_df, "accounts_Misbar.csv"),
):
    frame.to_csv(csv_path, encoding="utf-8-sig", index=False)

## Investigate the news items recorded in the error log file

In [43]:
# Parse error_log.txt into an (n_errors, 2) string array: [error tag, news_id].
# np.loadtxt is avoided because it returns a 1-D array for a one-line log and
# raises on "[GENERAL_ERROR] <id>: <message>" lines, whose free-text message
# adds extra space-separated fields.
err_rows = []
with open("error_log.txt", encoding="utf-8") as error_log:
    for log_line in error_log:
        tokens = log_line.split()
        if len(tokens) < 2:
            continue
        tag, raw_id = tokens[0], tokens[1].rstrip(":")
        # Skip continuation lines of multi-line messages and malformed lines.
        if tag.startswith("[") and tag.endswith("_ERROR]") and raw_id.isdigit():
            err_rows.append([tag, raw_id])
# reshape keeps the array two-dimensional even when the log is empty.
err_arr = np.array(err_rows, dtype=str).reshape(-1, 2)

In [54]:
# Column 0 of err_arr is read as the error tag, column 1 as the news_id.
err_tags, err_ids = err_arr[:, 0], err_arr[:, 1]
err_news_id = err_ids.astype(int)
err_labels = np.unique(err_tags)
print(f'types of errors are : {err_labels}')
print(f'news_id with errors : {err_news_id}')

types of errors are : ['[LABEL_ERROR]']
news_id with errors : [30 38 47 53]


In [62]:
# Look up the URLs of the news items whose ids appear in the error log
condtion = urls_df['news_id'].isin(err_news_id)
err_df = urls_df.loc[condtion]
num_err_news = condtion.sum()

for i in range(num_err_news):
    failed_row = err_df.iloc[i]
    print('news_id', failed_row['news_id'])
    print('url : ', failed_row['news_url'])

news_id 30
url :  https://www.misbar.com/editorial/2025/03/16/%D8%A3%D8%AB%D8%B1-%D8%A7%D9%84%D8%AA%D8%B6%D9%84%D9%8A%D9%84-%D8%A7%D9%84%D9%85%D8%B9%D9%84%D9%88%D9%85%D8%A7%D8%AA%D9%8A-%D9%88%D8%A7%D9%84%D8%AA%D8%B2%D9%8A%D9%8A%D9%81-%D8%A7%D9%84%D8%B9%D9%85%D9%8A%D9%82-%D8%B9%D9%84%D9%89-%D8%AD%D9%8A%D8%A7%D8%A9-%D8%A7%D9%84%D8%B3%D9%88%D8%AF%D8%A7%D9%86%D9%8A%D9%8A%D9%86-%D8%A3%D8%AB%D9%86%D8%A7%D8%A1-%D8%A7%D9%84%D8%AD%D8%B1%D8%A8
news_id 38
url :  https://www.misbar.com/editorial/2025/01/29/%D8%A3%D8%A8%D8%B1%D8%B2-%D8%A7%D9%84%D8%A7%D8%AF%D8%B9%D8%A7%D8%A1%D8%A7%D8%AA-%D8%A7%D9%84%D9%85%D8%B6%D9%84%D9%84%D8%A9-%D8%B9%D9%86-%D8%A7%D9%84%D8%A7%D8%B4%D8%AA%D8%A8%D8%A7%D9%83%D8%A7%D8%AA-%D8%A7%D9%84%D8%AC%D8%A7%D8%B1%D9%8A%D8%A9-%D9%81%D9%8A-%D8%A7%D9%84%D8%B3%D9%88%D8%AF%D8%A7%D9%86
news_id 47
url :  https://www.misbar.com/editorial/2023/05/01/%D9%85%D8%A4%D8%B4%D8%B1-%D9%85%D8%B3%D8%A8%D8%A7%D8%B1-%D9%84%D8%A3%D8%A8%D8%B1%D8%B2-%D8%A7%D9%84%D8%A3%D8%AE%D8%A8%D8%A7%D8%B1-%D8%A7%D9%84

These are editorial articles about the situation in Sudan rather than fact-checks of a specific news claim, which is why they have no label.

In [64]:
# Store the skipped items in their own CSV file
err_df.to_csv(path_or_buf="news_no_labels.csv", index=False)

Done.