In [ ]:
import pandas as pd
import os

# Define the path to the data files (assuming they are in the MIT Data Science/CSV Files subfolder)
data_path = 'C:/Users/ADMIN/ET6-CDSP-group-17-repo/1_datasets/MIT Data Science/CSV Files/'

# Names of the CSV files to load from data_path
file_names = [
    'customer_nodes_training.csv',
    'event_table_training.csv',
    'product_nodes_training.csv',
]

# Loaded dataframes, keyed by file name
loaded_data = {}

# Names of files that could not be loaded; checked after the loop so that
# every problem is reported before the cell stops.
failed_files = []

# Load each CSV file
for name in file_names:
    file_path = os.path.join(data_path, name)
    try:
        loaded_data[name] = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"Error: {file_path} not found. Please ensure the path is correct.")
        failed_files.append(name)
    except Exception as e:
        print(f"Error loading {name}: {e}")
        failed_files.append(name)
    else:
        print(f"Successfully loaded {name}")

# Fail with an explicit message instead of an unrelated KeyError on the
# dictionary lookups below when one of the files did not load.
if failed_files:
    raise RuntimeError(
        f"Could not load required data file(s): {failed_files}. "
        f"Check that data_path is correct: {data_path}"
    )

# Assign dataframes to variables for easier access
customer_nodes = loaded_data['customer_nodes_training.csv']
event_table = loaded_data['event_table_training.csv']
product_nodes = loaded_data['product_nodes_training.csv']

print("\n--- Initial DataFrames Loaded ---")
print("Customer Nodes Shape:", customer_nodes.shape)
print("Event Table Shape:", event_table.shape)
print("Product Nodes Shape:", product_nodes.shape)


In [ ]:
# Merge event_table with customer_nodes.
# how='left' keeps every row of event_table; events with no matching
# customer get NaN in the customer columns.
merged_df = event_table.merge(customer_nodes, on='hash(customerId)', how='left')

# Merge the result with product_nodes (again a left join on the event rows)
merged_df = merged_df.merge(product_nodes, on='hash(variantID)', how='left')

# Count nulls per column once and reuse the result for the filter below
missing_counts = merged_df.isnull().sum()

print("\n--- Merged DataFrame Info ---")
print("Merged DataFrame Shape:", merged_df.shape)
print("Columns after merge:", merged_df.columns.tolist())
print("Missing values after merge:\n", missing_counts[missing_counts > 0])
print("First 5 rows of merged DataFrame:\n", merged_df.head())


In [ ]:
# Rename the hash(...) key columns to plain identifiers for clarity
# (optional but good practice).
# NOTE(review): DataFrame.rename ignores mapping keys that are not present
# by default, so a misspelled source column is skipped silently - confirm
# that all four columns exist after the merge.
merged_df = merged_df.rename(columns={
    'hash(customerId)': 'customer_id',
    'hash(variantID)': 'variant_id',
    'hash(productID)': 'product_id',
    'hash(supplierRef)': 'supplier_ref_id',
})

print("\n--- DataFrame after Renaming Columns ---")
print("Columns after renaming:", merged_df.columns.tolist())
print("First 5 rows with new column names:\n", merged_df.head())


In [ ]:
# Handle categorical variables (if not already one-hot encoded).
# Based on exploration, many categorical features are already one-hot encoded
# (e.g., Country_A, productType_B).
# However, 'shippingCountry', 'productType', 'brandDesc' might still be present
# as original categorical columns. Check for them and one-hot encode if needed.

# Candidate categorical columns that may not be one-hot encoded yet
categorical_cols = ['shippingCountry', 'productType', 'brandDesc']

# Keep only the candidates that actually exist in the DataFrame, so that
# get_dummies is never asked to encode a missing column
categorical_cols_to_encode = [col for col in categorical_cols if col in merged_df.columns]

if categorical_cols_to_encode:
    print(f"\n--- One-Hot Encoding Categorical Columns: {categorical_cols_to_encode} ---")
    # drop_first=True drops the first level of each encoded column
    merged_df = pd.get_dummies(merged_df, columns=categorical_cols_to_encode, drop_first=True)
    print("Columns after one-hot encoding:", merged_df.columns.tolist())
else:
    print("\nNo additional categorical columns to one-hot encode.")

# Count nulls per column once and reuse the result for the filter below
final_missing = merged_df.isnull().sum()

print("\n--- Final DataFrame Info after Preparation ---")
print("Final DataFrame Shape:", merged_df.shape)
print("Final DataFrame Columns:", merged_df.columns.tolist())
print("Final Missing Values:\n", final_missing[final_missing > 0])
# DataFrame.info() writes its summary to stdout itself and returns None, so
# it is called as a statement after the header rather than passed to print()
# (which would emit the summary before the header and then print "None").
print("Final Data Types:")
merged_df.info()

# Save the prepared DataFrame for later use
output_dir = 'C:/Users/ADMIN/ET6-CDSP-group-17-repo/2_data_preparation/ASOS_GraphReturns/'
os.makedirs(output_dir, exist_ok=True)  # create the folder if it does not exist yet
output_file_path = os.path.join(output_dir, 'prepared_asos_data.csv')
merged_df.to_csv(output_file_path, index=False)  # index=False: do not write the row index
print(f"\nPrepared data saved to: {output_file_path}")
