# Scraping Stations Traffic Status from RATP.fr

In [34]:
from bs4 import BeautifulSoup as bs
from dotenv import load_dotenv

import pandas as pd
import requests
import boto3
import os

In [28]:
# Load the variables declared in a local .env file into the process
# environment so the os.getenv calls in the next cell can read them.
load_dotenv()

True

In [3]:
# AWS settings come from the environment; each one is None when its variable is unset.
AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_BUCKET_NAME = (
    os.environ.get(setting_name)
    for setting_name in ('AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY', 'AWS_BUCKET_NAME')
)

In [35]:
# Always pass a timeout: without one, requests waits indefinitely for the
# server and the cell (and any "Run All") can hang forever.
ratp = requests.get("https://www.ratp.fr/", timeout=10)

# Parse the raw response bytes with the standard-library HTML parser.
soup = bs(ratp.content, 'html.parser')

In [37]:
# Show only the beginning of the document: printing the whole page would
# flood the notebook output and bloat the saved file.
print(str(soup)[:1500])

<!DOCTYPE html>
</body></html>


#### Update: (14/09/2023)

As the output above shows, the RATP website now serves a Cloudflare captcha to prevent web scraping. The captcha was not there when this notebook was written, so the cells below show the scraping code as it worked at that time; re-running them today finds no matching buttons.

In [38]:
# Every line is rendered as a <button class="border-line"> whose aria-label
# carries the text we want.
transportations_btns = soup.find_all('button', class_='border-line')

# Use .get() so a button without an aria-label is skipped instead of raising
# KeyError and aborting the whole collection.
perturbations = [
    label
    for label in (btn.get('aria-label') for btn in transportations_btns)
    if label
]

In [6]:
# Inspect the aria-label strings collected from the buttons.
perturbations

[]

## Putting it together in a function

In [7]:
def scrape_stations_status(url="https://www.ratp.fr/", timeout=10):
    """Scrape the traffic status shown on the RATP home page.

    Fetches ``url``, collects the ``aria-label`` of every ``<button>`` that has
    the ``border-line`` class and splits each label on ``', '`` into a name and
    a status.

    Parameters
    ----------
    url : str, optional
        Page to scrape. Defaults to the RATP home page.
    timeout : float, optional
        Seconds to wait for the server before giving up.

    Returns
    -------
    pandas.DataFrame
        One row per labelled button, with the columns ``name`` (surrounding
        whitespace stripped) and ``status``. The frame is empty when no
        matching button is found; ``status`` is ``None`` for a label that
        contains no ``', '`` separator.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails, times out, or returns an HTTP error status.
    """
    # A timeout keeps the call from blocking forever on an unresponsive server.
    response = requests.get(url, timeout=timeout)
    # Surface HTTP errors instead of silently parsing an error page into an
    # empty result.
    response.raise_for_status()

    soup = bs(response.content, 'html.parser')
    buttons = soup.find_all('button', class_='border-line')

    rows = []
    for btn in buttons:
        # .get() avoids a KeyError on buttons that carry no aria-label.
        label = btn.get('aria-label')
        if not label:
            continue

        # Only the first two comma-separated fields are kept: name and status.
        fields = label.split(', ')[:2]
        name = fields[0].strip()
        status = fields[1] if len(fields) > 1 else None
        rows.append([name, status])

    return pd.DataFrame(rows, columns=['name', 'status'])

In [8]:
# Run the scraper and preview the first rows of the resulting DataFrame.
df = scrape_stations_status()
df.head()

Unnamed: 0,name,status
0,RER A,trafic normal travaux prévus
1,RER B,trafic perturbé travaux en cours
2,RER C,trafic perturbé travaux en cours
3,RER D,trafic perturbé travaux en cours
4,RER E,trafic perturbé travaux en cours


## Data Exploration

In [9]:
# Distinct values of the 'name' column in the scraped data.
df['name'].unique()

array(['RER A', 'RER B', 'RER C', 'RER D', 'RER E', 'METRO 1', 'METRO 2',
       'METRO 3', 'METRO 3B', 'METRO 4', 'METRO 5', 'METRO 6', 'METRO 7',
       'METRO 7B', 'METRO 8', 'METRO 9', 'METRO 10', 'METRO 11',
       'METRO 12', 'METRO 13', 'METRO 14', 'METRO ORV', 'TRAM T1',
       'TRAM T2', 'TRAM T3A', 'TRAM T3B', 'TRAM T4', 'TRAM T5', 'TRAM T6',
       'TRAM T7', 'TRAM T8', 'TRAM T9', 'TRAM T10', 'TRAM T11',
       'TRAM T13', 'TRANSILIEN H', 'TRANSILIEN J', 'TRANSILIEN K',
       'TRANSILIEN L', 'TRANSILIEN N', 'TRANSILIEN P', 'TRANSILIEN R',
       'TRANSILIEN U'], dtype=object)

In [10]:
# Distinct values of the 'status' column in the scraped data.
df['status'].unique()

array(['trafic normal travaux prévus', 'trafic perturbé travaux en cours',
       'trafic normal'], dtype=object)

In [11]:
# to_csv does not create missing parent folders, so make sure 'data/' exists
# before writing; exist_ok keeps the cell safe to re-run.
os.makedirs('data', exist_ok=True)
df.to_csv('data/traffic_status.csv', index=False)

## Exporting the data to an Amazon S3 bucket

In [77]:
def export_data_to_s3(data, bucket_name=None, file_name="ratp_stations_traffic_status.csv"):
    """Upload a DataFrame to an S3 bucket as a CSV object.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to serialise; it is written without the index.
    bucket_name : str, optional
        Target bucket. When omitted, the ``AWS_BUCKET_NAME`` environment
        variable is used (the same setting ``load_data`` reads from), falling
        back to the project's original bucket if that variable is unset.
    file_name : str, optional
        Object key to write inside the bucket.

    Raises
    ------
    ValueError
        If ``data`` is empty. Uploading it would replace the stored CSV with a
        header-only file.
    """
    # Guard first: a failed scrape yields an empty frame, and pushing it would
    # overwrite the previously stored data.
    if data.empty:
        raise ValueError("Refusing to upload an empty DataFrame to S3.")

    if bucket_name is None:
        # Keep the write target in sync with the bucket load_data reads from.
        bucket_name = os.getenv('AWS_BUCKET_NAME') or "omdena-paris-ratp-stations-status-useast1"

    s3 = boto3.client('s3')
    csv_data = data.to_csv(index=False)

    s3.put_object(Body=csv_data, Bucket=bucket_name, Key=file_name)

    print("Dataframe is saved as CSV in S3 bucket.")

In [78]:
# Upload the scraped DataFrame to S3 as a CSV object.
export_data_to_s3(df)

Dataframe is saved as CSV in S3 bucket.


## Loading the DataFrame from the S3 bucket

In [19]:
def load_data():
    """Read ``ratp_stations_traffic_status.csv`` from S3 into a DataFrame.

    The S3 resource is created with the module-level ``AWS_ACCESS_KEY_ID`` and
    ``AWS_SECRET_ACCESS_KEY``, and the object is fetched from the bucket named
    by ``AWS_BUCKET_NAME``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV object.
    """
    resource = boto3.resource(
        's3',
        aws_access_key_id=AWS_ACCESS_KEY_ID,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    )
    csv_object = resource.Bucket(AWS_BUCKET_NAME).Object('ratp_stations_traffic_status.csv')

    # The response body is a file-like stream that read_csv consumes directly.
    return pd.read_csv(csv_object.get()['Body'])

In [20]:
# Read the CSV back from S3 and display it.
df2 = load_data()
df2

Unnamed: 0,name,status
