<a href="https://colab.research.google.com/github/MarylouBer/MLD_Python/blob/main/DataEngineering_Webscraping_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import sqlite3
import numpy as np
from datetime import datetime

# Set pandas options so a printed DataFrame shows every column on one wide line
pd.set_option('display.width', 200)
pd.set_option('display.max_columns', None)

# Wikipedia page scraped by extract(); only its first table body is read
url = 'https://en.wikipedia.org/wiki/List_of_most_valuable_crops_and_livestock_products'
# File name of the SQLite database that receives the transformed data
db_name = 'CropsAndLivestock.db'
# Name of the database table the transformed data is written to
table_name = 'Most_Valuable_CropsAndLivestock'
# Destination of the CSV export written by load_to_csv()
csv_path = './Most_Valuable_CropsAndLivestock_data.csv'
# Column names of the DataFrame produced by extract()
table_attribs = ['Crop_Livestock','Global_Production_USDBillion','Global_Production_MetricTons' ]

def extract(url, table_attribs):
    """Scrape the first table of the page at *url* into a DataFrame.

    Args:
        url: Address of the HTML page to download.
        table_attribs: Column names the resulting DataFrame starts with.

    Returns:
        A DataFrame with one row per table row, holding the linked title of
        the first cell ('Crop_Livestock'), the value of the second cell with
        '$' and ',' removed ('Global_Production_USDBillion') and the text of
        the third cell ('Global_Production_MetricTons'). All values are
        strings; an empty frame with the expected columns is returned when
        the table has no data rows.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        IndexError: If the page has no table body, or a data row's first
            cell contains no link.
    """
    # A timeout keeps the script from hanging forever on a stalled connection,
    # and the status check stops an error page from being parsed as data.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    tables = soup.find_all('tbody')

    records = []
    for row in tables[0].find_all('tr'):
        cells = row.find_all('td')
        # Header rows contain no <td>; rows with fewer than three cells do not
        # carry the three values read below, so both are skipped.
        if len(cells) < 3:
            continue
        records.append({
            'Crop_Livestock': cells[0].find_all('a')[0]['title'],
            'Global_Production_USDBillion':
                cells[1].contents[0].replace('$', '').replace(',', '').strip(),
            # strip() removes the trailing newline without risking the loss of
            # a digit when the cell text has no trailing character.
            'Global_Production_MetricTons': cells[2].contents[0].strip(),
        })

    # Build the frame once instead of concatenating per row (which copies the
    # whole frame on every iteration). The requested columns come first,
    # followed by any extracted column that was not requested.
    extracted_keys = ['Crop_Livestock',
                      'Global_Production_USDBillion',
                      'Global_Production_MetricTons']
    columns = list(table_attribs) + [key for key in extracted_keys
                                     if key not in table_attribs]
    return pd.DataFrame(records, columns=columns)

def transform(df, usd_to_eur=None):
    """Return a copy of *df* with numeric USD values and an added EUR column.

    Args:
        df: DataFrame with the columns 'Crop_Livestock',
            'Global_Production_USDBillion' and 'Global_Production_MetricTons'.
        usd_to_eur: Exchange rate used to convert USD to EUR. When None (the
            default) the current rate is fetched from the public
            exchangerate-api.com endpoint.

    Returns:
        A new DataFrame with the columns 'Crop_Livestock',
        'Global_Production_USDBillion' (numeric, unparsable values become
        NaN), 'Global_Production_EURBillion' and
        'Global_Production_MetricTons'. The input frame is left unmodified.

    Raises:
        requests.HTTPError: If the rate has to be fetched and the API answers
            with an error status.
    """
    if usd_to_eur is None:
        # Use a public currency API; the timeout avoids hanging on a stalled
        # connection and the status check surfaces API errors early.
        response = requests.get("https://api.exchangerate-api.com/v4/latest/USD",
                                timeout=30)
        response.raise_for_status()
        usd_to_eur = response.json()['rates']['EUR']

    usd_billion = pd.to_numeric(df['Global_Production_USDBillion'], errors='coerce')
    # assign() works on a copy, so the caller's frame is not altered.
    converted = df.assign(
        Global_Production_USDBillion=usd_billion,
        Global_Production_EURBillion=usd_billion * usd_to_eur,
    )
    return converted[['Crop_Livestock', 'Global_Production_USDBillion',
                      'Global_Production_EURBillion', 'Global_Production_MetricTons']]

def load_to_csv(df, csv_path):
    """Write *df* to the file at *csv_path* as CSV, omitting the index."""
    df.to_csv(path_or_buf=csv_path, index=False)

def load_to_db(df, sql_connection, table_name):
    """Store *df* as table *table_name*, replacing any existing table.

    The DataFrame index is not written to the database.
    """
    df.to_sql(
        name=table_name,
        con=sql_connection,
        if_exists='replace',
        index=False,
    )

def run_query(query_statement, sql_connection):
    """Execute *query_statement* and print the statement and its result.

    The query runs first; afterwards a blank line, the statement text and the
    resulting DataFrame are printed to standard output.
    """
    result_frame = pd.read_sql(sql=query_statement, con=sql_connection)
    header = "\n" + query_statement
    print(header)
    print(result_frame)

def log_progress(message, log_path='./code_log.txt'):
    """Append a timestamped *message* as one line to the log file.

    Args:
        message: Text to record.
        log_path: File the line is appended to; created if it does not exist.
            Defaults to './code_log.txt'.

    Each line has the form 'YYYY-Mon-DD-HH:MM:SS:message'.
    """
    # %b is the documented abbreviated-month directive; the previously used
    # %h is a platform-specific alias that is not available everywhere.
    timestamp_format = '%Y-%b-%d-%H:%M:%S'
    timestamp = datetime.now().strftime(timestamp_format)
    # An explicit encoding keeps the log readable regardless of the locale.
    with open(log_path, 'a', encoding='utf-8') as f:
        f.write(timestamp + ':' + message + '\n')

# Run the ETL pipeline: scrape the table, convert the values, then store the
# result as CSV and in SQLite and print a few sample queries.
log_progress('Preliminaries complete. Initiating ETL process')
df = extract(url, table_attribs)
print("\n" + 'Extracted data')
print(df)
log_progress('Data extraction complete. Initiating Transformation process')
df = transform(df)
print("\n" + 'Transformed data')
print(df)
log_progress('Data transformation complete. Initiating Loading process')
load_to_csv(df,csv_path)
log_progress('Data saved to CSV file')
# Use the configured names so the database file and table are defined in
# exactly one place.
sql_connection = sqlite3.connect(db_name)
log_progress('SQL connection initiated')
try:
    load_to_db(df, sql_connection, table_name)
    log_progress('Data loaded to Database as table. Executing queries')
    run_query('SELECT * FROM ' + table_name, sql_connection)
    run_query('SELECT AVG(Global_Production_EURBillion) FROM ' + table_name, sql_connection)
    run_query('SELECT Crop_Livestock from ' + table_name + ' LIMIT 5', sql_connection)
    log_progress ('Process complete')
finally:
    # Close the connection even when loading or a query fails.
    sql_connection.close()
    log_progress('Server connection closed')


Extracted data
    Crop_Livestock Global_Production_USDBillion Global_Production_MetricTons
0             Rice                          332                  751,885,117
1     Domestic pig                          280                  118,956,327
2           Cattle                          269                   64,568,004
3             Milk                          238                  665,596,536
4          Chicken                          192                  106,638,508
5            Maize                          191                1,126,990,585
6            Wheat                          168                  748,392,150
7          Soybean                          107                  335,613,801
8      Eggs (food)                         93.6                   74,180,272
9           Potato                         92.7                  356,952,488
10       Vegetable                         89.1                  292,920,885
11          Tomato                         87.9             