# Kijkcijfers Project
Naam: Jens Demeyer
Studentennummer: 202398958
Vak: Machine Learning
Datum: [Datum van indienen]

## Deel 1: Scrapen van data

Testverzoek met de Python-bibliotheek `requests` om te controleren of de API werkt en om te bekijken in welk formaat de JSON staat.

In [1]:
import requests
from datetime import datetime
import json

# Step 1: build the API URL for a fixed sample date (YYYYMMDD).
vandaag = 20180127
api_url = f"https://api.cim.be/api/cim_tv_public_results_daily_views?dateDiff={vandaag}&reportType=north"

# Step 2: fetch the data. ``data`` stays None unless the request succeeds, so
# the save step below never touches an unbound name.
data = None
try:
    # A timeout keeps the cell from hanging forever on a stalled connection.
    response = requests.get(api_url, timeout=30)

    if response.status_code == 200:
        # Step 3: parse the JSON payload.
        data = response.json()
        print("data:", data)
    else:
        print(f"Fout bij het ophalen van de gegevens: {response.status_code}")
except requests.exceptions.RequestException as e:
    print(f"Fout bij het ophalen van de gegevens: {e}")

# Step 4: only when a payload was actually received, keep a pretty-printed
# copy in memory and write it to disk for inspection.
if data is not None:
    response_json = json.dumps(data, indent=4)

    with open('data.json', 'w') as file:
        json.dump(data, file, indent=4)

data: {'@context': '/api/contexts/CimTvPublicResultsDailyView', '@id': '/api/cim_tv_public_results_daily_views', '@type': 'hydra:Collection', 'hydra:member': [{'@id': '/api/cim_tv_public_results_daily_views/44150', '@type': 'CimTvPublicResultsDailyView', 'id': 44150, 'reportType': 'north', 'dateImport': '2018-01-29T00:00:00.000000', 'dateResult': '2018-01-27T00:00:00.000000', 'ranking': '1', 'description': 'HET 7 UUR-JOURNAAL', 'category': None, 'channel': 'EEN', 'dateDiff': '2018-01-27T00:00:00.000000', 'startTime': '19:00:04', 'rLength': '00:31:57', 'ratePerc': None, 'rateInK': '803.889', 'shr': None, 'rateInKAll': None, 'live': 0}, {'@id': '/api/cim_tv_public_results_daily_views/44151', '@type': 'CimTvPublicResultsDailyView', 'id': 44151, 'reportType': 'north', 'dateImport': '2018-01-29T00:00:00.000000', 'dateResult': '2018-01-27T00:00:00.000000', 'ranking': '2', 'description': 'FC DE KAMPIOENEN', 'category': None, 'channel': 'EEN', 'dateDiff': '2018-01-27T00:00:00.000000', 'startTi

In [25]:
import requests
import csv
from datetime import datetime

def get_viewership_data(date_yyyymmdd):
    """Fetch the 'north' daily viewership records for one date.

    Args:
        date_yyyymmdd: Date to query, formatted as YYYYMMDD.

    Returns:
        The list stored under the ``'hydra:member'`` key of the JSON
        response, or an empty list when the key is missing, the server
        answers with a non-200 status, the request fails, or the body is
        not valid JSON.
    """
    api_url = f"https://api.cim.be/api/cim_tv_public_results_daily_views?dateDiff={date_yyyymmdd}&reportType=north"

    try:
        # Without a timeout a stalled connection would block indefinitely.
        response = requests.get(api_url, timeout=30)
        if response.status_code == 200:
            data = response.json()
            return data.get('hydra:member', [])
        else:
            print(f"Fout bij ophalen data voor {date_yyyymmdd}: HTTP {response.status_code}")
            return []
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers an invalid JSON body on requests versions whose
        # JSON decode error is not a RequestException subclass.
        print(f"Fout bij ophalen data voor {date_yyyymmdd}: {e}")
        return []

def save_to_csv(data, filename='kijkcijfers.csv'):
    """Write the selected fields of each record to a new CSV file.

    The file is opened in write mode, so existing content is replaced.
    ``dateDiff`` values that are ISO timestamps are shortened to YYYY-MM-DD;
    values that cannot be parsed are written unchanged.

    Args:
        data: Iterable of dict records; fields absent from a record are
            written as empty cells.
        filename: Path of the CSV file to create.

    Returns:
        None. Prints a notice and writes nothing when ``data`` is empty.
    """
    if not data:
        print("Geen data om op te slaan")
        return

    # Only these fields are kept, in this column order.
    selected_fields = ['dateDiff', 'ranking', 'description', 'channel', 'startTime', 'rLength', 'rateInK']

    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=selected_fields)
        writer.writeheader()

        for item in data:
            row = {field: item.get(field) for field in selected_fields}

            raw_date = row['dateDiff']
            if raw_date:
                # The API returns e.g. '2018-01-27T00:00:00.000000' without a
                # trailing 'Z'. Requiring the 'Z' made every parse fail, so
                # strip an optional one and parse the ISO timestamp instead.
                iso_text = str(raw_date)
                if iso_text.endswith('Z'):
                    iso_text = iso_text[:-1]
                try:
                    row['dateDiff'] = datetime.fromisoformat(iso_text).strftime('%Y-%m-%d')
                except ValueError:
                    # Unrecognised format: keep the raw value rather than
                    # dropping the row.
                    pass

            writer.writerow(row)

    print(f"Data opgeslagen in {filename} ({len(data)} records)")

# Fetch the ratings for one day and store them as CSV.
date_str = '20250518'  # Format: YYYYMMDD
data = get_viewership_data(date_str)

# Handle the "nothing returned" case first; otherwise write the file.
if not data:
    print("Geen data gevonden voor deze datum")
else:
    save_to_csv(data)

Data opgeslagen in kijkcijfers.csv (20 records)


In [6]:
import requests
import csv
from datetime import datetime, timedelta
import time
from tqdm import tqdm
import os

def get_viewership_data(date_yyyymmdd, max_retries=3):
    """Fetch the 'north' daily viewership records for one date, with retries.

    Args:
        date_yyyymmdd: Date to query, formatted as YYYYMMDD.
        max_retries: Maximum number of attempts when the request raises.

    Returns:
        The list stored under the ``'hydra:member'`` key of the JSON
        response, or an empty list when the key is missing, the server
        answers with a non-200 status, or every attempt raised.
    """
    api_url = f"https://api.cim.be/api/cim_tv_public_results_daily_views?dateDiff={date_yyyymmdd}&reportType=north"

    for attempt in range(max_retries):
        try:
            response = requests.get(api_url, timeout=30)
            if response.status_code == 200:
                return response.json().get('hydra:member', [])
            # Report the status so a skipped day is visible in the output
            # instead of silently leaving a gap in the CSV.
            print(f"\nFout voor {date_yyyymmdd}: HTTP {response.status_code}")
            return []
        except Exception as e:
            # Deliberately broad: one bad day must not abort a long scrape.
            if attempt == max_retries - 1:
                print(f"\nFout voor {date_yyyymmdd}: {str(e)[:100]}...")
                # No retry follows the last attempt, so skip the back-off sleep.
                break
            # Exponential back-off before the next attempt: 1s, 2s, 4s, ...
            time.sleep(2 ** attempt)
    return []

def append_to_csv(data, filename='kijkcijfers.csv'):
    """Append the selected fields of each record to a CSV file.

    No duplicate checking is performed. A header row is written only when
    the file does not exist yet or is still empty. ``dateDiff`` values that
    are ISO timestamps are shortened to YYYY-MM-DD; values that cannot be
    parsed are written unchanged.

    Args:
        data: Iterable of dict records; fields absent from a record are
            written as empty cells. Nothing is written when it is empty.
        filename: Path of the CSV file to append to.
    """
    if not data:
        return

    fields = ['dateDiff', 'ranking', 'description', 'channel', 'startTime', 'rLength', 'rateInK']
    # An existing but empty file still needs its header row.
    needs_header = not os.path.isfile(filename) or os.path.getsize(filename) == 0

    with open(filename, 'a', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fields)
        if needs_header:
            writer.writeheader()

        for item in data:
            row = {field: item.get(field) for field in fields}

            raw_date = row['dateDiff']
            if raw_date:
                # The API returns e.g. '2018-01-27T00:00:00.000000' without a
                # trailing 'Z'. Requiring the 'Z' made every parse fail, so
                # strip an optional one and parse the ISO timestamp instead.
                iso_text = str(raw_date)
                if iso_text.endswith('Z'):
                    iso_text = iso_text[:-1]
                try:
                    row['dateDiff'] = datetime.fromisoformat(iso_text).strftime('%Y-%m-%d')
                except ValueError:
                    # Unrecognised format: keep the raw value rather than
                    # dropping the row.
                    pass

            writer.writerow(row)

def scrape_with_progress(start_date, end_date, filename='kijkcijfers.csv'):
    """Scrape every day in a period and append the results to a CSV file.

    A progress bar advances once per day. Each day is fetched with
    ``get_viewership_data`` and appended with ``append_to_csv``.

    Args:
        start_date: First day to scrape (inclusive), as a ``datetime``.
        end_date: Last day to scrape (inclusive), as a ``datetime``.
        filename: CSV file the records are appended to; defaults to the
            same file name that ``append_to_csv`` uses by default.
    """
    # Both end points are included, hence the + 1.
    date_range = [start_date + timedelta(days=x) for x in range((end_date - start_date).days + 1)]

    with tqdm(total=len(date_range), desc="Data scrapen", unit="dag") as pbar:
        for date in date_range:
            date_str = date.strftime('%Y%m%d')
            data = get_viewership_data(date_str)
            append_to_csv(data, filename)
            pbar.update(1)
            time.sleep(0.5)  # Mild rate limiting between requests

    print(f"\nData succesvol toegevoegd aan {filename}")

# Example usage: scrape every day from 2019-11-01 through 2025-05-20.
if __name__ == "__main__":
    # Arguments are (start_date, end_date), both inclusive.
    scrape_with_progress(datetime(2019, 11, 1), datetime(2025, 5, 20))

Data scrapen: 100%|██████████| 2028/2028 [29:59<00:00,  1.13dag/s]  


Data succesvol toegevoegd aan kijkcijfers.csv



