# Updating our data regularly
## Bronze layer

### Getting latest links

In [None]:
from datetime import datetime 

# URL pattern of the quarterly archives: <year>q<quarter>.zip
FSDS_URL_TEMPLATE = 'https://www.sec.gov/files/dera/data/financial-statement-data-sets/{year}q{quarter}.zip'


def completed_quarters(today, count=2):
    """Return the `count` most recently completed calendar quarters, newest first.

    Args:
        today: a date/datetime; only its `year` and `month` are read.
        count: how many quarters to return.

    Returns:
        A list of `(year, quarter)` tuples with `quarter` in 1..4. Stepping back
        from the first quarter of a year rolls over to quarter 4 of the previous
        year instead of producing quarter 0 or -1.
    """
    # Number quarters consecutively across years (year * 4 + zero-based quarter)
    # so that going back in time is plain integer arithmetic.
    newest = today.year * 4 + (today.month - 1) // 3 - 1
    return [(index // 4, index % 4 + 1) for index in range(newest, newest - count, -1)]


now = datetime.now()
current_year = now.year
current_month = now.month

# The quarter in progress is skipped; the two quarters before it are requested.
(current_data_year, current_quarter), (previous_data_year, previous_data_quarter) = completed_quarters(now)

current_needed_data = FSDS_URL_TEMPLATE.format(year=current_data_year, quarter=current_quarter)
previous_needed_data = FSDS_URL_TEMPLATE.format(year=previous_data_year, quarter=previous_data_quarter)

new_links = [current_needed_data, previous_needed_data]
new_links


['https://www.sec.gov/files/dera/data/financial-statement-data-sets/2024q2.zip',
 'https://www.sec.gov/files/dera/data/financial-statement-data-sets/2024q1.zip']

### Updating latest zip files

In [None]:
from azure.storage.blob import BlobServiceClient
import requests

import os

# The connection string is a secret: read it from the environment instead of
# storing it in the notebook. Falls back to the empty placeholder when unset.
connection_string = os.environ.get("AZURE_STORAGE_CONNECTION_STRING", "")
container_name = "testtech"
# Headers sent with every download request issued by download_to_blob.
headers = {
    "User-Agent": "jo boulement jo@gmx.at",
    "Accept-Encoding": "gzip, deflate",
}

# Initialize BlobServiceClient
blob_service_client = BlobServiceClient.from_connection_string(connection_string)

def download_to_blob(url, blob_name):
    """Download `url` and store its content as `blob_name` in the container.

    The blob's existence is checked first, so an archive that is already in
    storage is not downloaded again. The outcome is reported with `print`;
    nothing is returned.

    Args:
        url: address of the file to fetch (requested with the module-level `headers`).
        blob_name: name of the target blob inside `container_name`.
    """
    blob_client = blob_service_client.get_blob_client(container=container_name, blob=blob_name)

    # Check before downloading: fetching the whole archive only to discard it
    # wastes bandwidth and time.
    if blob_client.exists():
        print(f'file {blob_name} already exists in blob storage. Skipping upload.')
        return

    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(url, headers=headers, timeout=60)
    if response.status_code != 200:
        print(f'failed to download from {url}. Status code {response.status_code}')
        return

    blob_client.upload_blob(response.content, overwrite=True)
    print(f'file {blob_name} uploaded to blob storage')

def main():
    """Send every URL in `new_links` to blob storage.

    Each blob is named after the last path segment of its URL.
    """
    for link in new_links:
        download_to_blob(link, link.rsplit('/', 1)[-1])


main()


file 2024q2.zip already exists in blob storage. Skipping upload.
file 2024q1.zip already exists in blob storage. Skipping upload.


## Silver layer

### Updating Unzipped Files

In [None]:
from azure.storage.blob import BlobServiceClient, ContainerClient
from datetime import datetime, timedelta, timezone

# Reuses the blob_service_client created in the "Updating latest zip files" cell
# and opens a client for the container that holds the downloaded zip files.
container_client = blob_service_client.get_container_client(container_name)

def get_recent_blobs(container_client, minutes):
    """Return the blobs whose `last_modified` lies within the last `minutes` minutes.

    Args:
        container_client: object exposing `list_blobs()`.
        minutes: size of the look-back window, counted back from the current UTC time.

    Returns:
        A list of the matching blob entries, in listing order.
    """
    cutoff = datetime.now(timezone.utc) - timedelta(minutes=minutes)

    selected = []
    for entry in container_client.list_blobs():
        # Keep entries modified at or after the cutoff.
        if entry.last_modified >= cutoff:
            selected.append(entry)
    return selected

# Get blobs uploaded within the last 10 minutes
recent_blobs = get_recent_blobs(container_client, 10)

# Bare expression: the notebook displays the full property listing of every matching blob.
recent_blobs

[{'name': '2024q1.zip', 'container': 'testtech', 'snapshot': None, 'version_id': None, 'is_current_version': None, 'blob_type': <BlobType.BLOCKBLOB: 'BlockBlob'>, 'metadata': {}, 'encrypted_metadata': None, 'last_modified': datetime.datetime(2024, 7, 14, 19, 53, 9, tzinfo=datetime.timezone.utc), 'etag': '0x8DCA43E96338689', 'size': 56204702, 'content_range': None, 'append_blob_committed_block_count': None, 'is_append_blob_sealed': None, 'page_blob_sequence_number': None, 'server_encrypted': True, 'copy': {'id': None, 'source': None, 'status': None, 'progress': None, 'completion_time': None, 'status_description': None, 'incremental_copy': None, 'destination_snapshot': None}, 'content_settings': {'content_type': 'application/octet-stream', 'content_encoding': None, 'content_language': None, 'content_md5': bytearray(b'-"D\xe9\xff\xbc\xc6\xf1\xb8\x0c\xa4\x12\x8c\x14+R'), 'content_disposition': None, 'cache_control': None}, 'lease': {'status': 'unlocked', 'state': 'available', 'duration': N

In [None]:
import os

storage_account_name = "testtech"
# The account key is a secret: read it from the environment instead of storing
# it in the notebook. Falls back to the empty placeholder when unset.
storage_account_key = os.environ.get("AZURE_STORAGE_ACCOUNT_KEY", '')
container_name_source = "testtech"
container_name_dest = "newtesttech"


# Configure the spark context with the storage account key
spark.conf.set(f"fs.azure.account.key.{storage_account_name}.blob.core.windows.net", storage_account_key)


def mount_container(container):
    """Mount `container` of the storage account at `/mnt/<container>`.

    Mounting a mount point that is already in use fails, so the existing mounts
    are checked first; this makes the cell safe to re-run.

    Args:
        container: name of the blob container to mount.

    Returns:
        True when the container is already mounted, otherwise the result of
        `dbutils.fs.mount`.
    """
    mount_point = f"/mnt/{container}"
    if any(mount.mountPoint == mount_point for mount in dbutils.fs.mounts()):
        return True
    return dbutils.fs.mount(
        source=f"wasbs://{container}@{storage_account_name}.blob.core.windows.net",
        mount_point=mount_point,
        extra_configs={f"fs.azure.account.key.{storage_account_name}.blob.core.windows.net": storage_account_key},
    )


# Mount your source container
mount_container(container_name_source)

# Mount your destination container
mount_container(container_name_dest)

True

In [None]:
from azure.storage.blob import BlobServiceClient, ContainerClient
from datetime import datetime, timedelta, timezone
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import zipfile


# Builds the blob service client again from connection_string and opens a client
# for the source container (container_name_source), which holds the zip files.
blob_service_client = BlobServiceClient.from_connection_string(connection_string)
container_client = blob_service_client.get_container_client(container_name_source)

# Redefinition of the helper from the earlier "Updating Unzipped Files" cell, with the same behavior.
def get_recent_blobs(container_client, minutes):
    """List the blobs modified during the last `minutes` minutes.

    Args:
        container_client: object exposing `list_blobs()`.
        minutes: look-back window measured from the current UTC time.

    Returns:
        A list with every listed blob whose `last_modified` is at or after the
        start of the window.
    """
    oldest_allowed = datetime.now(timezone.utc) - timedelta(minutes=minutes)

    def _is_recent(blob):
        return blob.last_modified >= oldest_allowed

    return list(filter(_is_recent, container_client.list_blobs()))

# Names of the blobs modified within the last 10 minutes
recent_blobs = get_recent_blobs(container_client, 10)
recent_blob_names = set(blob.name for blob in recent_blobs)

# Create SparkSession
spark = SparkSession.builder.appName("ProcessZipFiles").getOrCreate()

# Process only recent zip files: entries of the mounted source container that
# are zip archives and whose name is among the recently modified blobs.
zip_files = [
    entry
    for entry in dbutils.fs.ls(f"/mnt/{container_name_source}")
    if entry.name.endswith('.zip') and entry.name in recent_blob_names
]
print(zip_files)

# Tab-separated members of each archive, in processing order, mapped to the
# columns that hold yyyyMMdd strings and are converted to dates.
# NOTE(review): the gold layer casts sub.changed to DateType; if that column is
# also a yyyyMMdd string it probably belongs in this list - confirm against the data.
date_columns_by_table = {
    "sub": ("filed", "period"),
    "tag": (),
    "num": ("ddate",),
    "pre": (),
}

# dbutils.fs and Spark address the mount as /mnt/..., while Python's own file
# API (zipfile) reaches the same location through the /dbfs prefix. Passing a
# /dbfs/... path to dbutils.fs would act on dbfs:/dbfs/... instead, so the
# extraction folder would never be cleaned up.
extract_dbfs_path = f'/mnt/{container_name_source}/temp_folder'
extract_local_path = f'/dbfs{extract_dbfs_path}'

for zip_file in zip_files:
    new_folder_name = zip_file.name.split('.')[0]
    dest_folder = f"/mnt/{container_name_dest}/{new_folder_name}"
    dbutils.fs.mkdirs(dest_folder)

    # Start from an empty folder so files left by an earlier (or failed) run
    # cannot be mistaken for members of this archive.
    dbutils.fs.rm(extract_dbfs_path, recurse=True)
    dbutils.fs.mkdirs(extract_dbfs_path)

    try:
        # Open the archive by its real name rather than rebuilding it from the folder name.
        with zipfile.ZipFile(f'/dbfs/mnt/{container_name_source}/{zip_file.name}', 'r') as zip_ref:
            zip_ref.extractall(extract_local_path)

        for table, date_columns in date_columns_by_table.items():
            table_df = (
                spark.read
                .option("delimiter", "\t")
                .option("header", "true")
                .csv(f'{extract_dbfs_path}/{table}.txt')
            )
            for date_column in date_columns:
                table_df = table_df.withColumn(date_column, to_date(col(date_column), "yyyyMMdd"))
            # The write is the Spark action that reads the text file, so it must
            # finish before the extraction folder is removed.
            table_df.write.mode("overwrite").parquet(f'{dest_folder}/{table}.parquet')
    finally:
        # Remove the extracted files even when a step above fails.
        dbutils.fs.rm(extract_dbfs_path, recurse=True)

# Printed once the loop above has finished, whether or not any zip file matched.
print("Data upload completed for recent zip files.")

# Unmount the containers (the destination container is mounted again in the next cell)
dbutils.fs.unmount(f"/mnt/{container_name_source}")
dbutils.fs.unmount(f"/mnt/{container_name_dest}")


[FileInfo(path='dbfs:/mnt/testtech/2024q1.zip', name='2024q1.zip', size=56204702, modificationTime=1720986789000), FileInfo(path='dbfs:/mnt/testtech/2024q2.zip', name='2024q2.zip', size=53821685, modificationTime=1720986788000)]
Data upload completed for recent zip files.
/mnt/testtech has been unmounted.
/mnt/newtesttech has been unmounted.


True

In [None]:
# Mount the destination container again for the gold layer; it was unmounted at
# the end of the silver step. The mount point is derived from container_name_dest
# so it always matches the root_directory used by the gold-layer cell.
dest_mount_point = f"/mnt/{container_name_dest}"

# Mounting onto a mount point that is already in use fails, so only mount when
# it is absent; this keeps the cell safe to re-run.
if not any(mount.mountPoint == dest_mount_point for mount in dbutils.fs.mounts()):
    dbutils.fs.mount(
      source = f"wasbs://{container_name_dest}@{storage_account_name}.blob.core.windows.net/",
      mount_point = dest_mount_point,
      extra_configs = {f"fs.azure.account.key.{storage_account_name}.blob.core.windows.net": storage_account_key}
    )

## Gold layer
### Updating database

In [None]:
# Mount path of the destination container; its entries are listed further down
# to find the quarter folders that were written recently.
root_directory = f"/mnt/{container_name_dest}"

def get_recently_updated_folders(folders, minutes):
    """Select the entries of `folders` modified within the last `minutes` minutes.

    Args:
        folders: iterable of entries exposing `modificationTime` in milliseconds
            since the epoch.
        minutes: look-back window measured from the current UTC time.

    Returns:
        A list of the entries whose `modificationTime` is at or after the start
        of the window, in their original order.
    """
    window_start = datetime.now(timezone.utc) - timedelta(minutes=minutes)
    # timestamp() is in seconds; modificationTime is in milliseconds.
    window_start_ms = window_start.timestamp() * 1000

    matches = []
    for entry in folders:
        if entry.modificationTime >= window_start_ms:
            matches.append(entry)
    return matches


# Entries directly under the destination mount that were modified in the last 10 minutes.
recently_updated_folders = get_recently_updated_folders(dbutils.fs.ls(root_directory), 10)

# Define schemas for each table
# Target schema of the Submissions table, written as (column, type, nullable) rows.
# process_quarter_folder reads only each field's name and dataType when casting.
submissions_schema = StructType([
    StructField(column, data_type, nullable)
    for column, data_type, nullable in (
        ("adsh", StringType(), False),
        ("cik", LongType(), True),
        ("name", StringType(), True),
        ("sic", IntegerType(), True),
        ("countryba", StringType(), False),
        ("stprba", StringType(), True),
        ("cityba", StringType(), False),
        ("zipba", StringType(), True),
        ("bas1", StringType(), True),
        ("bas2", StringType(), True),
        ("baph", StringType(), True),
        ("countryma", StringType(), True),
        ("stprma", StringType(), True),
        ("cityma", StringType(), True),
        ("zipma", StringType(), True),
        ("mas1", StringType(), True),
        ("mas2", StringType(), True),
        ("countryinc", StringType(), False),
        ("stprinc", StringType(), True),
        ("ein", StringType(), True),
        ("former", StringType(), True),
        ("changed", DateType(), True),
        ("afs", StringType(), True),
        ("wksi", BooleanType(), False),
        ("fye", StringType(), False),
        ("form", StringType(), False),
        ("period", DateType(), False),
        ("fy", IntegerType(), False),
        ("fp", StringType(), False),
        ("filed", DateType(), False),
        ("accepted", TimestampType(), False),
        ("prevrpt", BooleanType(), False),
        ("detail", BooleanType(), False),
        ("instance", StringType(), False),
        ("nciks", IntegerType(), False),
        ("aciks", StringType(), True),
    )
])

# Target schema of the Tags table, written as (column, type, nullable) rows.
tags_schema = StructType([
    StructField(column, data_type, nullable)
    for column, data_type, nullable in (
        ("tag", StringType(), False),
        ("version", StringType(), False),
        ("custom", IntegerType(), False),
        ("abstract", BooleanType(), False),
        ("datatype", StringType(), True),
        ("iord", StringType(), True),
        ("crdr", StringType(), True),
        ("tlabel", StringType(), True),
        ("doc", StringType(), True),
    )
])

# Target schema of the Numbers table, written as (column, type, nullable) rows.
numbers_schema = StructType([
    StructField(column, data_type, nullable)
    for column, data_type, nullable in (
        ("adsh", StringType(), True),
        ("tag", StringType(), True),
        ("version", StringType(), True),
        ("ddate", DateType(), True),
        ("qtrs", IntegerType(), True),
        ("uom", StringType(), True),
        ("coreg", StringType(), True),
        ("value", DecimalType(28,4), True),
        ("footnote", StringType(), True),
    )
])

# Target schema of the Presentations table, written as (column, type, nullable) rows.
presentations_schema = StructType([
    StructField(column, data_type, nullable)
    for column, data_type, nullable in (
        ("adsh", StringType(), False),
        ("report", IntegerType(), True),
        ("line", IntegerType(), False),
        ("stmt", StringType(), False),
        ("inpth", BooleanType(), False),
        ("rfile", StringType(), False),
        ("tag", StringType(), False),
        ("version", StringType(), False),
        ("plabel", StringType(), False),
    )
])



# Function to process files in a specific quarter folder
def process_quarter_folder(folder_path):
    """Append the parquet datasets found in `folder_path` to their tables.

    Every entry of the folder is matched by name:
    `pre.parquet/` -> Presentations, `num.parquet/` -> Numbers,
    `sub.parquet/` -> Submissions, `tag.parquet/` -> Tags. A matched dataset is
    read, each column named in the table's schema is cast to the schema's type,
    and the rows are appended to the table. Any other entry is reported and
    skipped. Rows are appended on every call, so processing the same folder
    twice adds its rows twice.

    Args:
        folder_path: path of the quarter folder to list with `dbutils.fs.ls`.
    """

    def _cast_and_append(parquet_path, schema, table_name):
        # Read one dataset, align column types with the schema, append to the table.
        frame = spark.read.parquet(parquet_path)
        for spec in schema.fields:
            frame = frame.withColumn(spec.name, col(spec.name).cast(spec.dataType))
        frame.write.mode("append").saveAsTable(table_name)

    for entry in dbutils.fs.ls(folder_path):
        dataset = entry.name
        if dataset == 'pre.parquet/':
            _cast_and_append(entry.path, presentations_schema, 'Presentations')
        elif dataset == 'num.parquet/':
            _cast_and_append(entry.path, numbers_schema, 'Numbers')
        elif dataset == 'sub.parquet/':
            _cast_and_append(entry.path, submissions_schema, 'Submissions')
        elif dataset == 'tag.parquet/':
            _cast_and_append(entry.path, tags_schema, 'Tags')
        else:
            print(f"Skipping file {entry.name} as it doesn't match any known pattern")


# Turn on the Delta schema autoMerge setting for this session before the appends below run.
spark.conf.set("spark.databricks.delta.schema.autoMerge.enabled", "true")

def process_recently_updated_quarters():
    """Load every folder in `recently_updated_folders` into the tables.

    Prints the name of each folder, then hands its path to
    `process_quarter_folder`.
    """
    for folder in recently_updated_folders:
        print(f"Processing folder: {folder.name}")
        process_quarter_folder(folder.path)


process_recently_updated_quarters()


In [None]:
# Release the destination container mount now that the tables have been loaded.
dbutils.fs.unmount(f"/mnt/{container_name_dest}")

In [None]:
%sql
-- Row count of the Numbers table, which the gold-layer cell appends to.
SELECT count(*) FROM Numbers;

count(1)
5709476
