# Kijkcijfers Project
Naam: Jens Demeyer
Studentennummer: 202398958 Vak: Machine Learning
Datum: [Datum van indienen]

## Deel 1: Scrapen van data

Testrequest (met de Python-bibliotheek `requests`) om te controleren of de API werkt en om te bekijken in welk formaat de JSON wordt teruggegeven.

In [1]:
import requests
from datetime import datetime
import json

# Step 1: build the API URL.
# NOTE: the date is hard-coded (YYYYMMDD), it is not derived from today's date.
vandaag = 20180127
api_url = f"https://api.cim.be/api/cim_tv_public_results_daily_views?dateDiff={vandaag}&reportType=north"

# Step 2: fetch the data.
# `data` stays None when the request fails, so the save step below is skipped
# instead of raising NameError on an undefined variable.
data = None
try:
    # A timeout prevents the cell from hanging forever on an unresponsive server.
    response = requests.get(api_url, timeout=30)

    if response.status_code == 200:
        # Step 3: parse the JSON payload
        data = response.json()
        print("data:", data)
    else:
        print(f"Fout bij het ophalen van de gegevens: {response.status_code}")
except (requests.exceptions.RequestException, ValueError) as e:
    # ValueError covers a body that is not valid JSON.
    print(f"Fout bij het ophalen van de gegevens: {e}")

# Step 4: persist the payload, but only when something was actually retrieved.
if data is not None:
    response_json = json.dumps(data, indent=4)

    with open('data.json', 'w', encoding='utf-8') as file:
        file.write(response_json)

data: {'@context': '/api/contexts/CimTvPublicResultsDailyView', '@id': '/api/cim_tv_public_results_daily_views', '@type': 'hydra:Collection', 'hydra:member': [{'@id': '/api/cim_tv_public_results_daily_views/44150', '@type': 'CimTvPublicResultsDailyView', 'id': 44150, 'reportType': 'north', 'dateImport': '2018-01-29T00:00:00.000000', 'dateResult': '2018-01-27T00:00:00.000000', 'ranking': '1', 'description': 'HET 7 UUR-JOURNAAL', 'category': None, 'channel': 'EEN', 'dateDiff': '2018-01-27T00:00:00.000000', 'startTime': '19:00:04', 'rLength': '00:31:57', 'ratePerc': None, 'rateInK': '803.889', 'shr': None, 'rateInKAll': None, 'live': 0}, {'@id': '/api/cim_tv_public_results_daily_views/44151', '@type': 'CimTvPublicResultsDailyView', 'id': 44151, 'reportType': 'north', 'dateImport': '2018-01-29T00:00:00.000000', 'dateResult': '2018-01-27T00:00:00.000000', 'ranking': '2', 'description': 'FC DE KAMPIOENEN', 'category': None, 'channel': 'EEN', 'dateDiff': '2018-01-27T00:00:00.000000', 'startTi

In [25]:
import requests
import csv
from datetime import datetime

def get_viewership_data(date_yyyymmdd, timeout=30):
    """Fetch the CIM daily viewing figures (north report) for one date.

    Args:
        date_yyyymmdd: Date to query, formatted as YYYYMMDD.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the call indefinitely.

    Returns:
        The list stored under the ``hydra:member`` key of the JSON response.
        An empty list is returned when the key is missing, when the HTTP
        status is not 200, or when the request or JSON decoding fails.
    """
    api_url = f"https://api.cim.be/api/cim_tv_public_results_daily_views?dateDiff={date_yyyymmdd}&reportType=north"

    try:
        response = requests.get(api_url, timeout=timeout)
        if response.status_code == 200:
            data = response.json()
            return data.get('hydra:member', [])
        else:
            print(f"Fout bij ophalen data voor {date_yyyymmdd}: HTTP {response.status_code}")
            return []
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a response body that is not valid JSON.
        print(f"Fout bij ophalen data voor {date_yyyymmdd}: {e}")
        return []

def save_to_csv(data, filename='kijkcijfers.csv'):
    """Write the selected viewing-figure fields of each record to a CSV file.

    The file is overwritten. ``dateDiff`` is shortened to ``YYYY-MM-DD``.

    Args:
        data: Iterable of dict records as returned by the API.
        filename: Path of the CSV file to create.

    Returns:
        None. Prints a message and writes nothing when ``data`` is empty.
    """
    if not data:
        print("Geen data om op te slaan")
        return

    # Only these fields are kept; everything else in a record is dropped.
    selected_fields = ['dateDiff', 'ranking', 'description', 'channel', 'startTime', 'rLength', 'rateInK']

    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=selected_fields)
        writer.writeheader()

        for item in data:
            row = {field: item.get(field) for field in selected_fields}

            raw_date = row['dateDiff']
            if raw_date:
                try:
                    # The API sends e.g. '2018-01-27T00:00:00.000000' without a
                    # trailing 'Z'; strip an optional 'Z' so both variants parse.
                    parsed = datetime.strptime(str(raw_date).rstrip('Z'), '%Y-%m-%dT%H:%M:%S.%f')
                    row['dateDiff'] = parsed.strftime('%Y-%m-%d')
                except ValueError:
                    # Unknown timestamp layout: keep the raw value rather than lose the row.
                    pass

            writer.writerow(row)

    print(f"Data opgeslagen in {filename} ({len(data)} records)")

# Fetch one day of viewing figures and store them, or report that none were found.
date_str = '20250518'  # Format: YYYYMMDD
data = get_viewership_data(date_str)

if not data:
    print("Geen data gevonden voor deze datum")
else:
    save_to_csv(data)

Data opgeslagen in kijkcijfers.csv (20 records)


In [23]:
import requests
import csv
from datetime import datetime, timedelta
import time
import os
from tqdm import tqdm

# Shared HTTP session, so connections to the API can be reused between requests.
# NOTE: requests does not honour a `timeout` attribute set on a Session; the
# timeout has to be passed with each request (get_viewership_data does so with
# `timeout=30`). The former `session.timeout = 30` had no effect and was dropped.
session = requests.Session()

def get_viewership_data(date_yyyymmdd, max_retries=3):
    """Fetch the CIM daily viewing figures for one date, retrying on failure.

    Args:
        date_yyyymmdd: Date to query, formatted as YYYYMMDD.
        max_retries: Maximum number of attempts before giving up.

    Returns:
        The list stored under ``hydra:member`` in the JSON response, or an
        empty list when the server answers 404 or every attempt fails.
    """
    api_url = f"https://api.cim.be/api/cim_tv_public_results_daily_views?dateDiff={date_yyyymmdd}&reportType=north"

    last_error = None
    for attempt in range(max_retries):
        try:
            response = session.get(api_url, timeout=30)
            if response.status_code == 200:
                data = response.json()
                return data.get('hydra:member', [])
            if response.status_code == 404:
                # Treated as "no data for this date": retrying is pointless.
                return []
            # Any other status (e.g. 5xx) is retried with the same backoff as
            # a network error instead of hammering the server immediately.
            last_error = f"HTTP {response.status_code}"
        except Exception as e:
            # Deliberately broad: a long scraping run should not abort on a
            # single bad response; the failure is reported after the last try.
            last_error = str(e)

        # Exponential backoff between attempts; no sleep after the final one.
        if attempt < max_retries - 1:
            time.sleep(2 ** attempt)

    if last_error is not None:
        print(f"\nFout voor {date_yyyymmdd}: {last_error[:100]}...")
    return []

def append_to_csv(data, filename='kijkcijfers.csv'):
    """Append records to a CSV file, skipping rows that are already present.

    A row counts as a duplicate when its ``dateDiff``, ``description``,
    ``channel`` and ``startTime`` match a row already in the file or a row
    written earlier in the same call. ``dateDiff`` is stored as ``YYYY-MM-DD``.

    Args:
        data: Iterable of dict records as returned by the API.
        filename: Path of the CSV file; created with a header when missing.

    Returns:
        None. Nothing is written when ``data`` is empty.
    """
    if not data:
        return

    fields = ['dateDiff', 'ranking', 'description', 'channel', 'startTime', 'rLength', 'rateInK']
    key_fields = ('dateDiff', 'description', 'channel', 'startTime')

    def _short_date(value):
        # The API sends e.g. '2018-01-27T00:00:00.000000' without a trailing
        # 'Z'; strip an optional 'Z' so both variants parse. Values in any
        # other layout (including an already shortened date) are kept as-is.
        if not value:
            return value
        try:
            return datetime.strptime(str(value).rstrip('Z'), '%Y-%m-%dT%H:%M:%S.%f').strftime('%Y-%m-%d')
        except ValueError:
            return value

    def _key(record):
        # csv writes None as '' and reads everything back as str, so normalise
        # both sides the same way before comparing.
        values = dict(record)
        values['dateDiff'] = _short_date(values.get('dateDiff'))
        return tuple('' if values.get(name) is None else str(values.get(name)) for name in key_fields)

    # Read the existing keys once per call instead of re-reading the whole
    # file for every candidate row.
    seen = set()
    file_exists = os.path.isfile(filename)
    if file_exists:
        with open(filename, 'r', newline='', encoding='utf-8') as existing:
            seen = {_key(existing_row) for existing_row in csv.DictReader(existing)}

    # A header is needed for a new file and for an existing but empty one.
    needs_header = not file_exists or os.path.getsize(filename) == 0

    with open(filename, 'a', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fields)

        if needs_header:
            writer.writeheader()

        for item in data:
            row = {field: item.get(field) for field in fields}
            row['dateDiff'] = _short_date(row['dateDiff'])

            key = _key(row)
            if key in seen:
                continue
            writer.writerow(row)
            # Remember the key so repeats inside this batch are skipped too.
            seen.add(key)

def is_duplicate(row, filename):
    """Check whether a record already exists in the CSV file.

    Two records are considered equal when ``dateDiff``, ``description``,
    ``channel`` and ``startTime`` all match.

    Args:
        row: Dict holding the candidate record.
        filename: Path of the CSV file to search.

    Returns:
        True when a matching row is found; False otherwise, including when
        the file does not exist.
    """
    if not os.path.isfile(filename):
        return False

    key_fields = ('dateDiff', 'description', 'channel', 'startTime')

    def _key(record):
        # csv stores None as '' and yields only strings when reading, so
        # normalise both sides; otherwise a None field would never match.
        return tuple('' if record.get(name) is None else str(record.get(name)) for name in key_fields)

    wanted = _key(row)
    with open(filename, 'r', newline='', encoding='utf-8') as csvfile:
        return any(_key(existing_row) == wanted for existing_row in csv.DictReader(csvfile))

def scrape_with_progress(start_date, end_date):
    """Scrape every day from start_date to end_date (inclusive) with a progress bar.

    Each day's records are fetched with get_viewership_data and handed to
    append_to_csv; a confirmation line is printed once all days are done.
    """
    # An end date before the start date yields zero days, never a negative total.
    total_days = max((end_date - start_date).days + 1, 0)

    with tqdm(total=total_days, desc="Data scrapen", unit="dag") as progress:
        for offset in range(total_days):
            day = start_date + timedelta(days=offset)
            records = get_viewership_data(day.strftime('%Y%m%d'))
            append_to_csv(records)
            progress.update(1)

    print("\nData succesvol toegevoegd aan kijkcijfers.csv")

# Example usage:
if __name__ == "__main__":
    # Period to scrape, both dates inclusive.
    start, end = datetime(2024, 10, 1), datetime(2025, 5, 20)
    scrape_with_progress(start_date=start, end_date=end)

Data scrapen: 100%|██████████| 232/232 [01:36<00:00,  2.40dag/s]


Data succesvol toegevoegd aan kijkcijfers.csv





In [28]:
import requests
import csv
from datetime import datetime, timedelta
from tqdm import tqdm
import os

def get_viewership_data(date_yyyymmdd):
    """Fetch the viewing figures for one specific date.

    Returns the list under ``hydra:member`` of the JSON response, or an empty
    list when the status is not 200 or any exception occurs (the exception
    text, truncated to 100 characters, is printed).
    """
    try:
        url = f"https://api.cim.be/api/cim_tv_public_results_daily_views?dateDiff={date_yyyymmdd}&reportType=north"
        reply = requests.get(url, timeout=10)
        # Guard clause: anything but HTTP 200 counts as "no data".
        if reply.status_code != 200:
            return []
        payload = reply.json()
        return payload.get('hydra:member', [])
    except Exception as e:
        print(f"Fout voor {date_yyyymmdd}: {str(e)[:100]}...")
        return []

def save_data(data, filename='kijkcijfers_test.csv'):
    """Append records to a CSV file, writing the header when the file is new.

    ``dateDiff`` is stored as ``YYYY-MM-DD``. Rows whose non-empty ``dateDiff``
    cannot be parsed are skipped.

    Args:
        data: Iterable of dict records as returned by the API.
        filename: Path of the CSV file to append to.

    Returns:
        None. Nothing is written when ``data`` is empty.
    """
    if not data:
        return

    fields = ['dateDiff', 'ranking', 'description', 'channel', 'startTime', 'rLength', 'rateInK']
    file_exists = os.path.exists(filename)

    with open(filename, 'a', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fields)
        if not file_exists:
            writer.writeheader()

        for item in data:
            row = {name: item.get(name) for name in fields}
            if row['dateDiff']:
                try:
                    # The API sends e.g. '2018-01-27T00:00:00.000000' without a
                    # trailing 'Z'. Requiring the 'Z' made every such row fail
                    # to parse and be skipped, so strip it when present.
                    parsed = datetime.strptime(str(row['dateDiff']).rstrip('Z'), '%Y-%m-%dT%H:%M:%S.%f')
                    row['dateDiff'] = parsed.strftime('%Y-%m-%d')
                except ValueError:
                    # Genuinely unparsable timestamp: leave this row out.
                    continue
            writer.writerow(row)

def scrape_period(start_date, end_date):
    """Scrape and store the data for every day from start_date to end_date inclusive."""
    span = (end_date - start_date).days + 1
    for offset in tqdm(range(span), desc="Data scrapen"):
        stamp = (start_date + timedelta(days=offset)).strftime('%Y%m%d')
        records = get_viewership_data(stamp)
        save_data(records)

# Example usage
if __name__ == "__main__":
    # Scrape from 1 October 2016 through 20 May 2025.
    scrape_period(datetime(2016, 10, 1), datetime(2025, 5, 20))

Data scrapen:  14%|█▍        | 456/3154 [02:52<2:26:54,  3.27s/it]

Fout voor 20171230: HTTPSConnectionPool(host='api.cim.be', port=443): Read timed out. (read timeout=10)...


Data scrapen:  34%|███▎      | 1063/3154 [06:32<12:52,  2.71it/s] 


KeyboardInterrupt: 