# Weather data scraping with Selenium

In [None]:
# all necessary imports

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.expected_conditions import presence_of_element_located
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains

from selenium.webdriver import Firefox

import pandas as pd

import time

import os

In [None]:
# if there is no csv in the current folder, scrape the website and create one
if not os.path.isfile("2019_paris_weather.csv"):

    # declare a dictionary with all required columns
    weather_data = {
        "day": [],
        "time": [],
        "temperature": [],
        "weather": []
    }

    # use firefox as the driver
    with Firefox() as driver:

        # implicitly wait 10 seconds for any elements to be accessible
        driver.implicitly_wait(10)

        # go on the weather website, for every month of 2019,
        # and accept all cookies, only for the first load
        first_time = True
        for month in range(1, 13):
            driver.get(f"https://www.timeanddate.com/weather/france/paris/historic?month={month}&year=2019")
            if first_time:
                # use find_element(By.XPATH, ...) like the rest of this cell:
                # the find_element_by_* helpers no longer exist in recent Selenium releases
                driver.find_element(By.XPATH, "/html/body/div[3]/div/div/div/div[2]/div[1]/button[1]").click()
                first_time = False

            # store the path to the scrolling menu that selects the days of the month
            menu = driver.find_element(By.XPATH, '//*[@id="wt-his-select"]')

            # store the days in an iterable "days"
            days = menu.find_elements(By.TAG_NAME, "option")

            # for every day of the current month ...
            for day in days:

                # click on the day
                day.click()

                # read the label of the selected day once, instead of once per table row
                day_label = day.text

                # store the table address in "table"
                # NOTE(review): the table is looked up right after the click;
                # confirm the page has refreshed it for the new day by then
                table = driver.find_element(By.XPATH, "/html/body/div[6]/main/article/div[6]/div[2]/div/table/tbody")

                # store the table entries in "entries"
                entries = table.find_elements(By.TAG_NAME, "tr")

                # for every entry of the table ...
                for entry in entries:

                    # fetch the cells of the row a single time
                    cells = entry.find_elements(By.TAG_NAME, "td")

                    # add a new line to weather_data, with all required data:
                    # the first five characters of the row header go in "time",
                    # the second and third cells in "temperature" and "weather"
                    weather_data["day"].append(day_label)
                    weather_data["time"].append(entry.find_element(By.TAG_NAME, "th").text[:5])
                    weather_data["temperature"].append(cells[1].text)
                    weather_data["weather"].append(cells[2].text)

    # create a pandas dataframe
    weather_df = pd.DataFrame.from_dict(weather_data)

    # save the dataframe as a csv file
    weather_df.to_csv("2019_paris_weather.csv", index=False)
    print("csv generation")

else:

    # read the csv and store the data in weather_df
    weather_df = pd.read_csv("2019_paris_weather.csv")
    print("the csv already exists")

In [None]:
# build a datetime "date" column from the "day" and "time" text columns
day_and_time = weather_df['day'] + " " + weather_df['time']
weather_df['date'] = pd.to_datetime(day_and_time)

# keep only the rows whose hour lies between 6 and 23 (both included),
# which drops the measurements taken from midnight up to 6AM
weather_df['hours'] = weather_df['date'].dt.hour
is_daytime = weather_df['hours'].between(6, 23)
weather_df = weather_df[is_daytime]

In [None]:
# preview the first 50 rows of weather_df
weather_df.head(50)

# Reddit scraping using the Pushshift API

In [None]:
# all necessary imports

from datetime import datetime
import pandas as pd
import requests
import json
import csv

In [None]:
def getPushshiftData(after, before, score_min):
    """Fetch one page of submissions of the "france" subreddit from Pushshift.

    Sends a GET request to the Pushshift submission search endpoint with
    ``size=100`` and ``subreddit=france``.

    Args:
        after: value sent as the ``after`` query parameter (the callers in
            this notebook pass an integer timestamp).
        before: value sent as the ``before`` query parameter.
        score_min: threshold sent as the ``score=>score_min`` filter.

    Returns:
        The value stored under the ``"data"`` key of the JSON response.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if the server does not answer in time.
        KeyError: if the JSON response has no ``"data"`` key.
    """
    # let requests build and escape the query string (">" becomes "%3E")
    params = {
        "size": 100,
        "after": after,
        "before": before,
        "subreddit": "france",
        "score": f">{score_min}",
    }

    # a timeout keeps the scraping loop from hanging forever on a stalled connection
    r = requests.get(
        "https://api.pushshift.io/reddit/search/submission/",
        params=params,
        timeout=30,
    )

    # fail with an explicit HTTP error rather than an obscure JSON/KeyError later on
    r.raise_for_status()

    return r.json()['data']


def collectSubData(subm, dict_data):
    """Append the fields kept from one submission to the columns of dict_data.

    Args:
        subm: mapping describing a submission; its "created_utc",
            "num_comments" and "score" entries are read and converted to int.
        dict_data: dict of lists with the keys "created_utc", "# comments"
            and "score"; one value is appended to each list.

    Raises:
        KeyError: if subm lacks one of the three fields.
        ValueError, TypeError: if a field cannot be converted to int.
        In every case dict_data is left untouched.
    """
    # read and convert every field BEFORE touching dict_data, so that a bad
    # submission cannot leave the columns with different lengths
    created_utc = int(subm["created_utc"])
    num_comments = int(subm["num_comments"])
    score = int(subm["score"])

    dict_data["created_utc"].append(created_utc)
    dict_data["# comments"].append(num_comments)
    dict_data["score"].append(score)

In [None]:
# reuse the csv when it is already in the current folder,
# otherwise query the API and create it
if os.path.isfile("2019_best_france_post.csv"):

    # load the previously saved submissions
    reddit_df = pd.read_csv("2019_best_france_post.csv")
    print("the csv already exists")

else:

    # one empty column per field kept from each submission
    dict_data = {column: [] for column in ("created_utc", "# comments", "score")}

    # minimal score of the submissions, and bounds of the search window
    # expressed as integer timestamps
    score_min = 500
    before = int(datetime(2020, 1, 1).timestamp())
    after = int(datetime(2019, 1, 1).timestamp())
    print(after)

    # first page of results
    data = getPushshiftData(after, before, score_min)

    # the api only shows 100 submissions at a time, so keep requesting
    # pages until an empty one comes back
    while data:
        for submission in data:
            collectSubData(submission, dict_data)
        print(len(data))

        # the next request starts from the creation time of the last submission received
        after = data[-1]['created_utc']

        # pause 10 seconds between two requests
        time.sleep(10)
        print(after)
        data = getPushshiftData(after, before, score_min)

    # turn the collected columns into a pandas dataframe
    reddit_df = pd.DataFrame.from_dict(dict_data)

    # write the dataframe to the csv file
    print("csv generation")
    reddit_df.to_csv("2019_best_france_post.csv", index=False)

In [None]:
# preview the first 50 rows of reddit_df
reddit_df.head(50)