In [None]:
#modules installation
!pip install --upgrade pip
!apt-get update -y 
!pip install selenium 
!pip install yfinance

In [None]:
#install google chrome 
!wget https://dl.google.com/linux/linux_signing_key.pub
!sudo apt-key add linux_signing_key.pub
!echo 'deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main' >> /etc/apt/sources.list.d/google-chrome.list
!sudo apt-get -y update
!sudo apt-get install -y google-chrome-stable

#install chromedrive
!wget -O /tmp/chromedriver.zip http://chromedriver.storage.googleapis.com/`curl -sS chromedriver.storage.googleapis.com/LATEST_RELEASE`/chromedriver_linux64.zip
!unzip /tmp/chromedriver.zip chromedriver -d /usr/local/bin/

#check versions
!google-chrome --version
!chromedriver -v

In [None]:
#imports
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium import webdriver
from bs4 import BeautifulSoup as bs
import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib
import datetime
import re

In [None]:
# Chrome configuration: a headless browser with a fixed 1920x1080 window.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument("--window-size=1920,1080")
# The driver path is deliberately not passed positionally: Selenium 4.10+
# removed that argument, so webdriver.Chrome('chromedriver', options=...) fails
# there. Without it Selenium resolves `chromedriver` from PATH, which is what
# the positional 'chromedriver' did on older versions.
wd = webdriver.Chrome(options=chrome_options)

In [None]:
# Scraping data from website (licenses).
# Two tldrlegal.com searches are visited; every search result contributes one
# row of per-permission flags to `df` and one entry to `all_licenses_links`.
urls = {'comm': 'https://tldrlegal.com/search?reverse=true&can%5B%5D=52c0d009a1ddc9766c00000a',
        'noncom': 'https://tldrlegal.com/search?reverse=true&cannot%5B%5D=52c0d009a1ddc9766c00000a'}

license_records = []      # one dict per search result: {span text: 1/0, ..., 'Name': name}
all_licenses_links = {}   # license name -> absolute URL of its page

for search_url in urls.values():
    wd.get(search_url)
    # Explicit parser, consistent with the calendar-parsing cell, instead of
    # letting BeautifulSoup guess one (which also emits a warning).
    soup = bs(wd.page_source, 'lxml')

    for link in soup.find_all(class_='search-result flatbox'):
        name = link.findChild("h3").text

        record = {}
        for i in link.findChildren("span"):
            # The can/cannot marker is looked up in the span's markup (str(i)),
            # not in its visible text; the visible text becomes the column name.
            if ' can ' in str(i):
                record[i.text] = 1
            elif ' cannot ' in str(i):
                record[i.text] = 0
        record['Name'] = name
        license_records.append(record)

        all_licenses_links[name] = 'https://tldrlegal.com' + str(link.findChild("a")['href'])

# Build the frame once from all records rather than concatenating inside the loop.
# Flags a license does not list are encoded as 2.
df = pd.DataFrame(license_records).set_index('Name').fillna(2)
df.to_csv('comm_noncomm.csv')

In [None]:
#CREATING DATASET FROM COMMERCIAL AND NONCOMMERCIAL LICENSES
!mkdir comm
!mkdir noncomm

for url in all_licenses_links:
    text = ''
    wd.get(all_licenses_links[url]+'#fulltext')
    html = wd.page_source
    soup = bs(html)
    texts = soup.find_all(class_='editable')
    for sent in texts:
        text += ' ' + sent.text
     
    #Check if commercial or noncommercial
    if df.loc[url]['Commercial Use'] == 1:
        directory = 'comm'
    else:
        directory = 'noncomm'
    name = re.sub('\W+','', url )
    with open(f"{directory}/{name}.txt", 'w') as file:
        file.write(text)

In [None]:
# Recursively archive the noncomm/ and comm/ directories into the comm_noncomm_data archive.
!zip -r comm_noncomm_data noncomm comm

In [None]:
# Scraping data from website (trading data).
# Drives the calendar page to a custom date range and time zone, then keeps the
# resulting page source in `html` for the parsing cell.
url = 'https://tradingeconomics.com/poland/calendar#'

# Configure the implicit wait before the first lookup so that every
# find_element below polls for up to 10 seconds, not only the later ones.
wd.implicitly_wait(10)
wd.get(url)

# NOTE(review): judging by the class names these open the calendar dropdown and
# its custom-range editor — confirm against the live page.
wd.find_element(By.CLASS_NAME, 'btn.btn-default.dropdown-toggle.btn-calendar').click()
wd.find_element(By.CLASS_NAME, 'glyphicon.glyphicon-pencil').click()

# MAX 1000 records at one time
start_date = wd.find_element(By.ID, 'startDate')  # From
start_date.clear()
start_date.send_keys("2019-01-01")

end_date = wd.find_element(By.ID, 'endDate')  # Until
end_date.clear()
end_date.send_keys("2023-01-01")

wd.find_element(By.CLASS_NAME, 'btn.btn-success').click()

time_zone = Select(wd.find_element(By.ID, 'DropDownListTimezone'))
time_zone.select_by_visible_text('UTC +1')  # Timezone

# NOTE(review): the source is read right after the last interaction; if the page
# refreshes asynchronously this may capture it too early — verify.
html = wd.page_source

In [None]:
def try_except(success):
    """Parse the number contained in a scraped element's text.

    Every character other than digits, '.' and '-' is stripped from
    ``success.text`` before the remainder is converted to ``float``.

    Args:
        success: An object exposing a ``.text`` string, or ``None`` when the
            lookup that produced it found nothing.

    Returns:
        The parsed float, or ``np.nan`` when ``success`` has no usable
        ``.text`` or the stripped text is not a valid number.
    """
    try:
        return float(re.sub("[^0-9.-]", "", success.text))
    # Only the failures that mean "no parseable number" are mapped to NaN;
    # a bare except would also swallow KeyboardInterrupt and real bugs.
    except (AttributeError, TypeError, ValueError):
        return np.nan

In [None]:
# Parse the calendar page captured in `html` into `df`: one row per event,
# indexed by date, with <event>-actual/-previous/-consensus/-forecast columns.
# Note that `df` is rebound here, replacing the license table built earlier.
soup = bs(html, 'lxml')
df = pd.DataFrame()
temp_df = pd.DataFrame()

# Tokens 2-4 of each header's text are parsed as "<month name> <day> <year>".
# `datetime` is imported as a module, so the class must be spelled out:
# datetime.strptime does not exist and raised AttributeError.
all_dates = soup.find_all(class_='table-header')
all_dates = [
    datetime.datetime.strptime(' '.join(x.text.split()[1:4]), '%B %d %Y')
    for x in all_dates
]

# Number of event rows (<tr data-id=...>) in each <tbody>; bodies without any
# such row are dropped by filter(None, ...).
tbody_elements = soup.find(id='calendar').find_all('tbody')
tr_elements = [len(x.find_all('tr', {'data-id': True})) for x in tbody_elements]
num_events_in_day = list(filter(None, tr_elements))

# Repeat every date once per event of that day, giving one date per event row.
# NOTE(review): assumes headers and non-empty bodies line up one-to-one and that
# rows with data-id are the same rows as those with data-event — confirm.
date_to_event = np.repeat(all_dates, num_events_in_day)

data = soup.find_all("tr", {"data-event": True})
for idx, event in enumerate(data):
    data_event = event['data-event']
    actual = try_except(event.find(['a', 'span'], {"id": 'actual'}))
    previous = try_except(event.find(['a', 'span'], {"id": 'previous'}))
    consensus = try_except(event.find(['a', 'span'], {"id": 'consensus'}))
    forecast = try_except(event.find(['a', 'span'], {"id": 'forecast'}))
    # NOTE(review): temp_df is created once, outside the loop, so each row also
    # carries the latest values of all previously seen events. Kept as-is in
    # case this carry-forward is intended — confirm.
    temp_df[f"{data_event}-actual"] = [actual]
    temp_df[f"{data_event}-previous"] = [previous]
    temp_df[f"{data_event}-consensus"] = [consensus]
    temp_df[f"{data_event}-forecast"] = [forecast]
    temp_df['date'] = [date_to_event[idx]]
    df = pd.concat([df, temp_df])

df.set_index('date', inplace=True)