In [None]:
''' PREREQUISITES:
    - Download all files to be uploaded as CSV files.
    - Each file must contain exactly these columns, in this order: ['Channel', 'Country', 'DMA', 'Campaign', 'Quarter', 'Date', 'Impressions', 'Spend']
    - Files that include a 'Currency' column are not supported by this process.
'''

In [10]:
#  Imports 
import pandas as pd 

import os



In [2]:
# Folder that holds the raw (not yet cleaned) CSV files.
raw_path_files = '/Users/leazurfluh/Downloads/GCP_marketing_spend/2024_archive/'

# Ending appended to the name of every rewritten file; set to '' to keep the original name.
suffix = '_refactored'

# Path of the sub-folder, inside the raw folder, where the cleaned files are stored.
cleaned_path_files = f'{raw_path_files}cleaned_files/'

In [6]:
# Files to load for cleaning: each key is a file name without the '.csv'
# extension, and each value is the folder that file is read from.
# Edit the names below to match the files you want to process.
df_dict = dict.fromkeys(
    (
        '2024 Meta Restated_refactored',
        'AB Updated MMM H1 2024 US Data - NYT_refactored',
        'AB Updated MMM H1 2024 US Data - Snapchat_refactored',
        'AB Updated MMM H1 2024 US Data - Vox_refactored',
        'AB Updated MMM H1 2024 US Data - YouTube via MiQ_refactored',
        'AB Updated MMM H1 2024 US Data - iHeart_refactored',
    ),
    raw_path_files,
)

In [11]:

# Column names applied, by position, to every loaded file.
_EXPECTED_COLUMNS = ['Channel', 'Country', 'DMA', 'Campaign', 'Quarter', 'Date', 'Impressions', 'Spend']


def _to_float(series, missing_as_zero=False):
    """Convert a column of formatted numbers (e.g. '$1,234.50') to floats.

    Args:
        series: pandas Series holding the raw values.
        missing_as_zero: when True, missing values become 0.0 instead of NaN.

    Returns:
        A float Series with the same index as ``series``.

    Raises:
        ValueError: if a value is still not numeric after the formatting
            characters have been removed.
    """
    text = series.astype('str').str.strip()
    # regex=False makes the replacements literal on every pandas version:
    # '$' is a regex metacharacter and older releases default to regex=True.
    for symbol in ('$', ',', '"'):
        text = text.str.replace(symbol, '', regex=False)
    text = text.str.strip()
    # A value that is only a dash is the accounting placeholder for zero.
    # Dashes inside a value are left alone so negative numbers keep their sign.
    text = text.mask(text == '-', '0')
    numbers = text.astype('float')
    if missing_as_zero:
        numbers = numbers.fillna(0.0)
    return numbers


def df_cleaning(df_dict, output_dir=None, output_suffix=None):
    """Clean every CSV listed in ``df_dict`` and write the result to a new CSV.

    For each file the columns are renamed by position, 'Date' is converted to
    datetime, and 'Spend' and 'Impressions' are converted to floats. Missing
    'Impressions' become 0; missing 'Spend' stays empty.

    Args:
        df_dict: mapping of file name (without '.csv') to the folder that
            contains the file.
        output_dir: folder receiving the cleaned files. Defaults to the
            notebook-level ``cleaned_path_files``.
        output_suffix: text appended to each file name before '.csv'.
            Defaults to the notebook-level ``suffix``.

    Returns:
        None. Prints 'All done!' once every file has been written.

    Raises:
        ValueError: if a file does not have exactly the expected number of
            columns, or a Spend/Impressions value cannot be parsed.
    """
    if output_dir is None:
        output_dir = cleaned_path_files
    if output_suffix is None:
        output_suffix = suffix

    # Make sure the destination folder exists once, before processing any file.
    if not os.path.exists(output_dir):
        os.makedirs(output_dir, exist_ok=True)
        print(f"Created folder: {output_dir}")

    for key, folder in df_dict.items():
        # Load CSV as df
        source_file = os.path.join(folder, key + '.csv')
        df = pd.read_csv(source_file)

        # Rename columns (positional, so the column count must match)
        if len(df.columns) != len(_EXPECTED_COLUMNS):
            raise ValueError(
                f"{source_file}: expected {len(_EXPECTED_COLUMNS)} columns "
                f"{_EXPECTED_COLUMNS}, found {len(df.columns)}: {list(df.columns)}"
            )
        df.columns = list(_EXPECTED_COLUMNS)

        # WARNING column Date isn't cleaned to account for formats other than yyyy-mm-dd
        df['Date'] = df['Date'].astype('datetime64[ns]')

        df['Spend'] = _to_float(df['Spend'])
        df['Impressions'] = _to_float(df['Impressions'], missing_as_zero=True)

        df.to_csv(os.path.join(output_dir, key + output_suffix + '.csv'), index=False)

    print('All done!')


In [12]:
# Run the cleaning step on every file listed in df_dict.
df_cleaning(df_dict)

Created folder: /Users/leazurfluh/Downloads/GCP_marketing_spend/2024_archive/cleaned_files/
All done!
