In [6]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import missingno as msno
import datetime as dt

In [7]:
from google.cloud import storage

def list_blobs(bucket_name, prefix):
    """Print the name of each blob in ``bucket_name`` that matches ``prefix``.

    Args:
        bucket_name: Name of the Cloud Storage bucket to query.
        prefix: Object-name prefix forwarded to the listing call.

    Returns:
        None. Names are written to stdout, one per line.
    """
    client = storage.Client()
    # Consume the listing lazily so names are printed as they are yielded.
    names = (item.name for item in client.list_blobs(bucket_name, prefix=prefix))
    for name in names:
        print(name)

def download_blob(bucket_name, source_blob_name, destination_file_name):
    """Fetch one blob from a bucket and write it to a local file.

    Args:
        bucket_name: Name of the Cloud Storage bucket holding the blob.
        source_blob_name: Object name of the blob inside the bucket.
        destination_file_name: Local path the blob contents are written to.

    Returns:
        None. A confirmation line is printed after the download call returns.
    """
    target = storage.Client().bucket(bucket_name).blob(source_blob_name)
    target.download_to_filename(destination_file_name)
    print(f"Blob {source_blob_name} downloaded to {destination_file_name}.")

# Bucket and object-name prefix to inspect; list_blobs prints every matching blob name.
bucket_name = "my-project-bucket-ls"
prefix = "landing/"
list_blobs(bucket_name, prefix)


landing/D1/A.US_D1.csv
landing/D1/AAL.US_D1.csv
landing/D1/AAPL.US_D1.csv
landing/D1/ABBV.US_D1.csv
landing/D1/ABNB.US_D1.csv
landing/D1/ABT.US_D1.csv
landing/D1/ACGL.US_D1.csv
landing/D1/ACN.US_D1.csv
landing/D1/ADBE.US_D1.csv
landing/D1/ADI.US_D1.csv
landing/D1/ADM.US_D1.csv
landing/D1/ADP.US_D1.csv
landing/D1/ADSK.US_D1.csv
landing/D1/AEE.US_D1.csv
landing/D1/AEP.US_D1.csv
landing/D1/AES.US_D1.csv
landing/D1/AFL.US_D1.csv
landing/D1/AIG.US_D1.csv
landing/D1/AIZ.US_D1.csv
landing/D1/AJG.US_D1.csv
landing/D1/AKAM.US_D1.csv
landing/D1/ALB.US_D1.csv
landing/D1/ALGN.US_D1.csv
landing/D1/ALL.US_D1.csv
landing/D1/ALLE.US_D1.csv
landing/D1/AMAT.US_D1.csv
landing/D1/AMCR.US_D1.csv
landing/D1/AMD.US_D1.csv
landing/D1/AME.US_D1.csv
landing/D1/AMGN.US_D1.csv
landing/D1/AMP.US_D1.csv
landing/D1/AMT.US_D1.csv
landing/D1/AMZN.US_D1.csv
landing/D1/ANET.US_D1.csv
landing/D1/ANSS.US_D1.csv
landing/D1/AON.US_D1.csv
landing/D1/AOS.US_D1.csv
landing/D1/APA.US_D1.csv
landing/D1/APD.US_D1.csv
landing/D1/A

In [9]:
from google.cloud import storage
import pandas as pd
import os
from io import StringIO

# Module-level GCS client; clean_and_save_df and process_all_csv read this global.
storage_client = storage.Client()
# Bucket used both as the source of the CSV files and as the destination for cleaned output.
bucket_name = 'my-project-bucket-ls'

def clean_and_save_df(df, filename, bucket_name):
    """Clean one price/indicator frame and upload it to the bucket as Parquet.

    The frame is reduced to a fixed set of columns, rows containing any
    missing value are dropped, and ``datetime`` is parsed into a datetime
    dtype. The *cleaned* frame is then serialized to Parquet in memory and
    uploaded to ``cleaned/{filename}.parquet``.

    Args:
        df: Input DataFrame. Must contain every column in ``columns_to_keep``;
            extra columns are discarded. The input is not modified.
        filename: Base name (no extension) used for the destination object.
        bucket_name: Name of the destination Cloud Storage bucket.

    Returns:
        None. A confirmation line is printed after the upload call returns.

    Raises:
        KeyError: If ``df`` lacks any of the required columns.
    """
    columns_to_keep = [
        'datetime', 'open', 'high', 'low', 'close', 'volume',
        'bbands_20_upperband', 'bbands_20_middleband',
        'bbands_20_lowerband', 'dema_20', 'ema_20', 'ema_60',
        'kama_20', 'kama_60', 'ma_20', 'ma_60',
        'sma_20', 'sma_60',
        'tema_20', 'tema_60', 'adx_20', 'adx_60',
        'macd_20_40_10_macd', 'macd_60_120_30_macd',
        'rsi_20', 'rsi_60'
    ]

    # Build the cleaned frame as a new object. ``assign`` replaces the column
    # outright, so the parsed values really get a datetime dtype; assigning
    # through ``.loc[:, col]`` may keep the original object dtype.
    df_cleaned = (
        df[columns_to_keep]
        .dropna()
        .assign(datetime=lambda frame: pd.to_datetime(frame['datetime']))
    )

    # Serialize the cleaned frame. The previous code serialized ``df`` here,
    # so the uploaded file still contained the NaN rows and raw datetimes.
    parquet_bytes = df_cleaned.to_parquet(index=False)

    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(f'cleaned/{filename}.parquet')
    blob.upload_from_string(parquet_bytes, content_type='application/octet-stream')

    print(f'Cleaned data saved to cleaned/{filename}.parquet')

def process_all_csv(bucket_name, prefix):
    """Run ``clean_and_save_df`` on every ``.csv`` blob found under ``prefix``.

    Args:
        bucket_name: Bucket to read the CSV blobs from; also passed through to
            ``clean_and_save_df`` as the destination bucket.
        prefix: Object-name prefix used to list candidate blobs.

    Returns:
        None.
    """
    source_bucket = storage_client.get_bucket(bucket_name)

    for entry in source_bucket.list_blobs(prefix=prefix):
        # Anything that is not a CSV object is ignored.
        if not entry.name.endswith('.csv'):
            continue

        frame = pd.read_csv(StringIO(entry.download_as_text()))

        # "landing/D1/AAPL.US_D1.csv" -> "AAPL.US_D1"
        stem, _extension = os.path.splitext(os.path.basename(entry.name))
        clean_and_save_df(frame, stem, bucket_name)

# Process every CSV blob under the landing/ prefix of the bucket.
process_all_csv(bucket_name, 'landing/')


# Printed only once the call above has returned without raising.
print("Data clean Complete.")

Cleaned data saved to cleaned/A.US_D1.parquet
Cleaned data saved to cleaned/AAL.US_D1.parquet
Cleaned data saved to cleaned/AAPL.US_D1.parquet
Cleaned data saved to cleaned/ABBV.US_D1.parquet
Cleaned data saved to cleaned/ABNB.US_D1.parquet
Cleaned data saved to cleaned/ABT.US_D1.parquet
Cleaned data saved to cleaned/ACGL.US_D1.parquet
Cleaned data saved to cleaned/ACN.US_D1.parquet
Cleaned data saved to cleaned/ADBE.US_D1.parquet
Cleaned data saved to cleaned/ADI.US_D1.parquet
Cleaned data saved to cleaned/ADM.US_D1.parquet
Cleaned data saved to cleaned/ADP.US_D1.parquet
Cleaned data saved to cleaned/ADSK.US_D1.parquet
Cleaned data saved to cleaned/AEE.US_D1.parquet
Cleaned data saved to cleaned/AEP.US_D1.parquet
Cleaned data saved to cleaned/AES.US_D1.parquet
Cleaned data saved to cleaned/AFL.US_D1.parquet
Cleaned data saved to cleaned/AIG.US_D1.parquet
Cleaned data saved to cleaned/AIZ.US_D1.parquet
Cleaned data saved to cleaned/AJG.US_D1.parquet
Cleaned data saved to cleaned/AKAM.U

Cleaned data saved to cleaned/ES.US_D1.parquet
Cleaned data saved to cleaned/ESS.US_D1.parquet
Cleaned data saved to cleaned/ETN.US_D1.parquet
Cleaned data saved to cleaned/ETR.US_D1.parquet
Cleaned data saved to cleaned/ETSY.US_D1.parquet
Cleaned data saved to cleaned/EVRG.US_D1.parquet
Cleaned data saved to cleaned/EW.US_D1.parquet
Cleaned data saved to cleaned/EXC.US_D1.parquet
Cleaned data saved to cleaned/EXPD.US_D1.parquet
Cleaned data saved to cleaned/EXPE.US_D1.parquet
Cleaned data saved to cleaned/EXR.US_D1.parquet
Cleaned data saved to cleaned/F.US_D1.parquet
Cleaned data saved to cleaned/FANG.US_D1.parquet
Cleaned data saved to cleaned/FAST.US_D1.parquet
Cleaned data saved to cleaned/FCX.US_D1.parquet
Cleaned data saved to cleaned/FDS.US_D1.parquet
Cleaned data saved to cleaned/FDX.US_D1.parquet
Cleaned data saved to cleaned/FE.US_D1.parquet
Cleaned data saved to cleaned/FFIV.US_D1.parquet
Cleaned data saved to cleaned/FI.US_D1.parquet
Cleaned data saved to cleaned/FICO.US_D

Cleaned data saved to cleaned/NKE.US_D1.parquet
Cleaned data saved to cleaned/NOC.US_D1.parquet
Cleaned data saved to cleaned/NOW.US_D1.parquet
Cleaned data saved to cleaned/NRG.US_D1.parquet
Cleaned data saved to cleaned/NSC.US_D1.parquet
Cleaned data saved to cleaned/NTAP.US_D1.parquet
Cleaned data saved to cleaned/NTRS.US_D1.parquet
Cleaned data saved to cleaned/NUE.US_D1.parquet
Cleaned data saved to cleaned/NVDA.US_D1.parquet
Cleaned data saved to cleaned/NVR.US_D1.parquet
Cleaned data saved to cleaned/NWS.US_D1.parquet
Cleaned data saved to cleaned/NWSA.US_D1.parquet
Cleaned data saved to cleaned/NXPI.US_D1.parquet
Cleaned data saved to cleaned/O.US_D1.parquet
Cleaned data saved to cleaned/ODFL.US_D1.parquet
Cleaned data saved to cleaned/OKE.US_D1.parquet
Cleaned data saved to cleaned/OMC.US_D1.parquet
Cleaned data saved to cleaned/ON.US_D1.parquet
Cleaned data saved to cleaned/ORCL.US_D1.parquet
Cleaned data saved to cleaned/ORLY.US_D1.parquet
Cleaned data saved to cleaned/OTIS.

Cleaned data saved to cleaned/ZION.US_D1.parquet
Cleaned data saved to cleaned/ZS.US_D1.parquet
Cleaned data saved to cleaned/ZTS.US_D1.parquet
Data clean Complete.
