## Running the utils file to fetch the raw data (in this case, events)

In [None]:
%run utils.py

## Importing Required Libraries

In [None]:
import numpy as np
import pandas as pd
import os
import re

### Fixing the shape of the DataFrame and creating the index column that will be used to look up channel names

In [None]:
def process_dataframe(df):
    """Reshape a raw events DataFrame and add the lookup-key columns.

    The frame is modified in place and the same object is returned.

    Steps performed:
      * ``date`` is parsed as ``YYYYMMDD`` and re-rendered as ``MM/DD/YYYY`` text.
      * A single leading ``=`` is stripped from string ``sessionCampaignName``
        values; non-string values are left untouched.
      * ``Index`` is ``sessionSourceMedium`` followed by ``sessionCampaignName``.
      * ``source/medium_index_date`` is those two parts followed by ``date``.
      * ``date`` is appended to the existing index as an additional level.

    Rows in which a required part is not a string (for example a missing
    campaign name) get ``None`` in the affected derived column instead of
    raising, so both derived columns treat incomplete rows the same way.

    Args:
        df: DataFrame with ``date``, ``sessionSourceMedium`` and
            ``sessionCampaignName`` columns.

    Returns:
        The same DataFrame, with the new columns and the extended index.
    """
    def join_text(*parts):
        # Only concatenate when every part is real text; NaN/None/numbers
        # would otherwise raise (str.join) or produce nonsense (nan + nan).
        if all(isinstance(part, str) for part in parts):
            return ''.join(parts)
        return None

    # Convert the 'date' column to a datetime format
    df['date'] = pd.to_datetime(df['date'], format='%Y%m%d').dt.strftime('%m/%d/%Y')
    df['sessionCampaignName'] = df['sessionCampaignName'].apply(
        lambda x: x[1:] if isinstance(x, str) and x.startswith('=') else x
    )

    source = df['sessionSourceMedium']
    campaign = df['sessionCampaignName']
    dates = df['date']

    # Building the values as plain lists (rather than a row-wise DataFrame.apply)
    # keeps this working on a frame with zero rows.
    df['Index'] = pd.Series(
        [join_text(s, c) for s, c in zip(source, campaign)],
        index=df.index,
        dtype=object,
    )
    df['source/medium_index_date'] = pd.Series(
        [join_text(s, c, d) for s, c, d in zip(source, campaign, dates)],
        index=df.index,
        dtype=object,
    )

    # Set 'date' as an additional index level
    df.set_index('date', append=True, inplace=True)

    return df

### Streamlining Data Processing and File Output

In [None]:
class DataProcessor:
    """Turn dated raw-event CSV files into per-channel funnel CSV files.

    Every file in ``output_directory`` whose name contains a ``YYYY-MM-DD``
    date is aggregated and written to ``processed_directory`` as
    ``Processed_Sessions_<date>.csv``.

    Attributes:
        output_directory: Folder the raw CSV files are read from.
        processed_directory: Folder the aggregated CSV files are written to.
        date_pattern: Regular expression used to find the date in a file name.
    """

    def __init__(self, output_directory, processed_directory):
        """Store the folders and make sure the processed folder exists."""
        self.output_directory = output_directory
        self.processed_directory = processed_directory
        self.date_pattern = r"\d{4}-\d{2}-\d{2}"

        # Ensure the processed data directory exists. The existence check is
        # only used to choose the message; makedirs itself is race-free.
        already_present = os.path.exists(self.processed_directory)
        os.makedirs(self.processed_directory, exist_ok=True)
        if already_present:
            print('Processed folder already exists', self.processed_directory)
            print('===============================================')
        else:
            print(f'Created directory: {self.processed_directory}')

    def process_data(self):
        """Process every dated raw file found in ``output_directory``.

        Entries that are not regular files, or whose name holds no date, are
        reported and skipped. When the target file already exists the user is
        asked whether to overwrite it or keep it.
        """
        if not os.path.isdir(self.output_directory):
            print("Output data directory does not exist.")
            return

        date_regex = re.compile(self.date_pattern)

        # Sorted so that runs are reproducible; os.listdir order is arbitrary.
        for output_file in sorted(os.listdir(self.output_directory)):
            output_file_path = os.path.join(self.output_directory, output_file)

            # Sub-directories cannot be read as CSV, even if their name has a date.
            if not os.path.isfile(output_file_path):
                print(f"Skipping non-file entry: {output_file}")
                continue

            match = date_regex.search(output_file)
            if match is None:
                print(f"No date pattern found in filename: {output_file}")
                continue

            processed_file_name = f'Processed_Sessions_{match.group()}.csv'
            processed_file_path = os.path.join(self.processed_directory, processed_file_name)

            if not os.path.exists(processed_file_path):
                self.process_file(output_file_path, processed_file_path)
            elif self._ask_overwrite(processed_file_name):
                print(f'Overwriting the existing file {processed_file_name}')
                self.process_file(output_file_path, processed_file_path)
            else:
                print('Continuing with the existing file.')
                self.continue_with_existing_file(processed_file_path)

    @staticmethod
    def _ask_overwrite(processed_file_name):
        """Ask until the user answers O or C; return True for overwrite."""
        prompt = (
            f'The file {processed_file_name} already exists. '
            'Do you want to (O)verwrite or (C)ontinue? [O/C]: '
        )
        while True:
            choice = input(prompt).strip().lower()
            if choice == 'o':
                return True
            if choice == 'c':
                return False
            print('Invalid choice. Please enter "O" to overwrite or "C" to continue.')

    def process_file(self, input_file_path, output_file_path):
        """Aggregate one raw CSV into one row per ``Index`` and save it.

        The raw file is reshaped with the module-level ``process_dataframe``;
        ``sessions`` are then summed per ``Index`` for each funnel event.

        NOTE: sums are taken over the whole file, while ``date``, source/medium
        and campaign come from the first raw row of each ``Index``.

        Args:
            input_file_path: Path of the raw CSV to read.
            output_file_path: Path of the processed CSV to write.
        """
        raw_df = process_dataframe(pd.read_csv(input_file_path))
        # process_dataframe moved 'date' into the index; bring it back as a column.
        raw_df.reset_index(inplace=True)

        event_types = ['session_start', 'add_to_cart', 'remove_from_cart', 'begin_checkout', 'purchase']
        # Sessions per Index, one Series per funnel event
        agg_data = {
            event: raw_df[raw_df['eventName'] == event].groupby('Index')['sessions'].sum()
            for event in event_types
        }

        # Align every event's totals on the Index values seen in the raw data
        merged_df = pd.DataFrame(index=raw_df['Index'])
        for event in event_types:
            merged_df[event] = agg_data[event]

        # An Index without a given event has no total for it: count it as 0
        merged_df.fillna(0, inplace=True)
        merged_df.reset_index(inplace=True)
        merged_df.drop_duplicates(subset='Index', inplace=True)

        # One descriptive row per Index (the first one), de-duplicated *before*
        # the merge so the join is one-to-one instead of one-to-many.
        metadata = raw_df[['date', 'sessionSourceMedium', 'sessionCampaignName', 'Index']]
        metadata = metadata.drop_duplicates(subset='Index')
        final_merge = merged_df.merge(metadata, on='Index', how='left')

        final_merge.rename(
            columns={
                'session_start': 'Sessions',
                'add_to_cart': 'ATC',
                'remove_from_cart': 'Remove_From_Cart',
                'begin_checkout': 'Checkout',
                'purchase': 'Orders',
                'sessionCampaignName': 'Campaign',
                'sessionSourceMedium': 'Source / Medium',
            },
            inplace=True,
        )

        desired_columns = ['date', 'Source / Medium', 'Campaign', 'Index', 'Sessions', 'ATC', 'Remove_From_Cart', 'Checkout', 'Orders']
        final_merge = final_merge[desired_columns]
        # Save the processed data to the output file
        final_merge.to_csv(output_file_path, index=False)
        print(f'Processed file Created At {output_file_path}')

    def continue_with_existing_file(self, file_path):
        """Keep the already processed file as it is (only reports the choice)."""
        print('continuing with existing file')



## Running the processor: where the raw data is read from and where the results are saved

In [None]:
# Usage: choose the folder holding the raw CSV files and the folder that
# receives the processed files (relative to the current working directory).
output_directory = "./raw_data"
processed_directory = "./processed_funnel_data"

# Build the processor, then process every dated file found in the raw folder.
data_processor = DataProcessor(output_directory, processed_directory)
data_processor.process_data()