In [1]:
from IPython.core.display import HTML
# Notebook-wide CSS: the `*` selector applies a compact monospace font to every element.
notebook_css = r"""
<style>
    * {
        font-family: monospace;
        font-size: 12px;
        line-height: normal;
    }
</style>
"""
HTML(notebook_css)

In [3]:
import re
from datetime import datetime, timedelta
from pathlib import Path
from typing import List

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pandasql as psql
import requests
import seaborn as sns
import xmltodict
from tabulate import tabulate

from crawler_engine import crawl_url

# Crawl data

In [4]:
def format_url(event_date: datetime):
    """Build the VietinBank exchange-rate page URL for a given day.

    Args:
        event_date: Day whose rates are requested.

    Returns:
        The page URL with ``event_date`` in the ``theDate`` query parameter,
        formatted as ``dd/mm/YYYY``.
    """
    formatted_date = event_date.strftime('%d/%m/%Y')
    return "https://www.vietinbank.vn/web/home/vn/ty-gia?theDate=" + formatted_date

# Smoke-check the URL builder with the current date.
today = datetime.now()
todays_url = format_url(event_date=today)
print(todays_url)

https://www.vietinbank.vn/web/home/vn/ty-gia?theDate=18/06/2024


In [33]:
def transform_data(data: str, list_records: List[List[str]], event_date: datetime) -> None:
    """Extract one page's exchange-rate rows and append them to ``list_records``.

    Args:
        data: Raw HTML of the exchange-rate page.
        list_records: Output list, mutated in place. For every kept currency row
            one list of strings is appended:
            ``[YYYYMMDD, currency, central_rate, cash, bank_transfer, sell_price]``.
        event_date: Date associated with the page; stored as ``YYYYMMDD``.

    Returns:
        None. If the page contains no ``hor-ex-b`` table, nothing is appended.
    """
    price_table_pattern = r"<table id=\"hor-ex-b\".*?<\/tr><\/tr><\/td><\/table>"
    currency_pattern = r"<tr class=\"ex-.*?<\/tr>"

    # Flatten line breaks and tabs to spaces; without re.DOTALL `.` would not
    # cross a newline, so the table could never match on multi-line HTML.
    flattened_html = data.replace("\r", " ").replace("\n", " ").replace("\t", " ")

    # Only the first table is used, so stop at the first hit instead of
    # collecting every match.
    table_match = re.search(pattern=price_table_pattern, string=flattened_html)
    if table_match is None:
        return
    price_table = table_match.group(0)
    record_date = event_date.strftime('%Y%m%d')

    for row_html in re.findall(pattern=currency_pattern, string=price_table):
        if "&" in row_html:
            # Per the original author's note: rows containing "&" are the EUR/USD
            # quotes for notes smaller than 50/100. They are skipped.
            continue
        # Per the original author's note: rows containing "#" are the EUR/USD
        # quotes for 50/100 notes. The "#" characters are stripped before parsing.
        row_html = row_html.replace("#", "")
        row = xmltodict.parse(xml_input=row_html, encoding="utf-8")
        # The first five cells are used, the rest ignored; "," becomes "." in every cell text.
        currency, central_rate, cash, bank_transfer, sell_price, *_ = [
            cell["#text"].replace(",", ".") for cell in row["tr"]["td"]
        ]
        list_records.append([record_date, currency, central_rate, cash, bank_transfer, sell_price])
    

In [100]:
"""
    Keep the date range under 30 days per run; otherwise the IP may get banned
"""
start_date = datetime(year=2024, month=1, day=1)
end_date = datetime(year=2024, month=1, day=31)
# end_date = datetime.now()
foreign_exchange_data = await crawl_url(start_date=start_date, end_date=end_date, format_url=format_url, transform_data=transform_data,)

In [101]:
# One column per field of a crawled record, in record order.
record_columns = ["event_date", "currency", "central_rate", "cash", "bank_transfer", "sell_price"]
df_foreign_exchange_data = pd.DataFrame(data=foreign_exchange_data, columns=record_columns)

In [102]:
# Preview the first rows before any type conversion is applied.
df_foreign_exchange_data.head(n=5)

Unnamed: 0,event_date,currency,central_rate,cash,bank_transfer,sell_price
0,20240111,AUD,-,15.974,15.994,16.594
1,20240111,CAD,-,17.911,17.921,18.621
2,20240111,CHF,-,28.102,28.122,29.072
3,20240111,CNY,-,-,3.346,3.486
4,20240111,DKK,-,-,3.505,3.675


In [103]:
rate_columns = ["central_rate", "cash", "bank_transfer", "sell_price"]

# "-" placeholders are stored as "0" so the rate columns can be cast to float
# (presumably "-" means the rate is not quoted; note that 0.0 then stands for "missing").
df_foreign_exchange_data[rate_columns] = df_foreign_exchange_data[rate_columns].replace("-", "0")

# Cast the date to int and every rate to float, then list the newest dates first.
df_foreign_exchange_data = (
    df_foreign_exchange_data
    .astype({"event_date": int, **{column: float for column in rate_columns}})
    .sort_values(by="event_date", ascending=False)
)
df_foreign_exchange_data.head(n=5)

Unnamed: 0,event_date,currency,central_rate,cash,bank_transfer,sell_price
366,20240131,KRW,0.0,16.15,16.35,20.15
373,20240131,USD,23.991,24.17,24.21,24.63
371,20240131,SGD,0.0,17.695,17.705,18.505
370,20240131,SEK,0.0,0.0,2.286,2.421
369,20240131,NZD,0.0,14.752,14.762,15.342


In [104]:
# Check dtypes and non-null counts after the cast.
df_foreign_exchange_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 459 entries, 366 to 401
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   event_date     459 non-null    int64  
 1   currency       459 non-null    object 
 2   central_rate   459 non-null    float64
 3   cash           459 non-null    float64
 4   bank_transfer  459 non-null    float64
 5   sell_price     459 non-null    float64
dtypes: float64(4), int64(1), object(1)
memory usage: 25.1+ KB


In [105]:
# Persist the converted rates for the crawled window as <start>_<end>.csv.
# DataFrame.to_csv does not create missing parent directories, so make sure
# the target folder exists before writing.
output_dir = Path("./data/foreign_exchange")
output_dir.mkdir(parents=True, exist_ok=True)
output_path = output_dir / f"{start_date.strftime('%Y%m%d')}_{end_date.strftime('%Y%m%d')}.csv"
df_foreign_exchange_data.to_csv(path_or_buf=output_path, index=False)

# Visualize data

In [70]:
%%bash
# List the exported CSV files with human-readable sizes and timestamps.
ls -lah data/foreign_exchange/*.csv

-rw-r--r--  1 lap14443  staff    16K Jun 18 15:51 data/foreign_exchange/20240401_20240430.csv
-rw-r--r--  1 lap14443  staff    16K Jun 18 15:48 data/foreign_exchange/20240501_20240531.csv
-rw-r--r--  1 lap14443  staff   8.4K Jun 18 15:47 data/foreign_exchange/20240601_20240618.csv


In [123]:
# Monthly exports to combine, one CSV per crawled window.
li_files = [
    "data/foreign_exchange/20240101_20240131.csv",
    "data/foreign_exchange/20240201_20240229.csv",
    "data/foreign_exchange/20240301_20240331.csv",
    "data/foreign_exchange/20240401_20240430.csv",
    "data/foreign_exchange/20240501_20240531.csv",
    "data/foreign_exchange/20240601_20240618.csv",
]
# Read every file once, then stack the frames row-wise under a fresh 0..n-1 index.
li_data = [pd.read_csv(filepath_or_buffer=csv_path) for csv_path in li_files]
df_foreign_exchange_data = pd.concat(li_data, axis=0, ignore_index=True)

In [124]:
# print(df_foreign_exchange_data.shape)
# df_foreign_exchange_data.info()

In [125]:
# Distinct currency codes in the combined data, in alphabetical order.
currencies = sorted(df_foreign_exchange_data["currency"].unique())
print(currencies)

['AUD', 'CAD', 'CHF', 'CNY', 'DKK', 'EUR', 'GBP', 'HKD', 'JPY', 'KRW', 'LAK', 'NOK', 'NZD', 'SEK', 'SGD', 'THB', 'USD']


## USD

In [131]:
# Last 30 days
query_date = int((datetime.now() - timedelta(days=30)).strftime("%Y%m%d"))
df_usd = df_foreign_exchange_data.query(f"event_date >= {query_date} and currency == 'USD'").reset_index(drop=True) \
    .sort_values(by=["event_date"], ascending=False) \
    .reset_index(drop=True) \
    [['event_date', 'central_rate', 'cash', 'bank_transfer', 'sell_price']]
df_usd[["event_date", "bank_transfer"]].head(n=100)

Unnamed: 0,event_date,bank_transfer
0,20240618,25.251
1,20240617,25.251
2,20240614,25.229
3,20240613,25.145
4,20240612,25.213
5,20240611,25.213
6,20240610,25.143
7,20240608,25.105
8,20240607,25.105
9,20240606,25.143
