In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [None]:
# Load the raw dataset from 'data.csv' (path is relative to the working directory).
# The read is unconditional: re-running this cell rebinds `data` to a fresh frame,
# dropping any columns that later cells had added to the previous one.
data = pd.read_csv('data.csv')

In [None]:
# Handle missing values: fill missing ProductCategory with 'Unknown'.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selected Series: the in-place form is chained
# assignment, which pandas warns about and which leaves `data` unchanged once
# Copy-on-Write is enabled.
data['ProductCategory'] = data['ProductCategory'].fillna('Unknown')

In [None]:
# Per-customer aggregates, broadcast back onto every transaction row:
#   TotalTransactionAmount - sum of Amount over the customer's rows
#   TransactionCount       - number of non-null TransactionId values for the customer
transactions_by_customer = data.groupby('CustomerId')
data['TotalTransactionAmount'] = transactions_by_customer['Amount'].transform('sum')
data['TransactionCount'] = transactions_by_customer['TransactionId'].transform('count')

In [None]:
# Parse TransactionStartTime into datetimes once, store the parsed column back,
# then derive hour-of-day, day-of-month and month features from it.
start_time = pd.to_datetime(data['TransactionStartTime'])
data['TransactionStartTime'] = start_time
data['TransactionHour'] = start_time.dt.hour
data['TransactionDay'] = start_time.dt.day
data['TransactionMonth'] = start_time.dt.month

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encoder for the categorical columns; it is fitted on ProductCategory
# and ChannelId where fit_transform is called below.
# sparse_output=False makes the encoder return a dense NumPy array instead of a
# SciPy sparse matrix, so the result can be wrapped in a DataFrame directly.
encoder = OneHotEncoder(sparse_output=False)  # dense array output

In [None]:
# Fit the encoder on the categorical columns and wrap the dense result in a
# DataFrame whose column names come from the fitted encoder.
categorical_cols = ['ProductCategory', 'ChannelId']
encoded_data = pd.DataFrame(
    encoder.fit_transform(data[categorical_cols]),
    columns=encoder.get_feature_names_out(categorical_cols),
    # Reuse data's index so rows stay aligned with `data` even if its index is
    # not the default 0..n-1 (e.g. after filtering rows).
    index=data.index,
)
# NOTE(review): in the cells shown here, encoded_data is never joined back into
# `data`, so these one-hot columns are not part of what is later written to
# 'cleaned_data.csv' - confirm whether that is intended.

In [None]:
from sklearn.preprocessing import StandardScaler
# Scaler used to standardize the numerical features (Amount and
# TotalTransactionAmount); with default settings StandardScaler centers each
# column to zero mean and scales it to unit variance.
scaler = StandardScaler()

In [None]:
# Fit the scaler on the monetary columns and write the standardized values back
# over the originals in `data` (the unscaled values are not kept).
scaled_cols = ['Amount', 'TotalTransactionAmount']
data[scaled_cols] = scaler.fit_transform(data[scaled_cols])

In [None]:
# Preview the first rows of the scaled columns. Ending the cell with the bare
# expression lets the notebook render the DataFrame with its rich (HTML) table
# display instead of the plain-text output print() produces.
data[['Amount', 'TotalTransactionAmount']].head()

In [None]:
# Save the cleaned and feature-engineered `data` frame to 'cleaned_data.csv' in
# the working directory. index=False omits the row index from the file.
# Only `data` is written; the separate encoded_data frame is not included.
data.to_csv('cleaned_data.csv', index=False)

In [None]:
# Confirmation message; printed unconditionally, it does not itself verify that
# the file was written.
print("Cleaned data saved to 'cleaned_data.csv'")

In [None]:
# Offer the cleaned CSV as a browser download when running in Google Colab.
# The google.colab package only exists inside Colab, so outside Colab the
# download is skipped with a message instead of failing the cell (and any
# Restart & Run All) with ImportError.
try:
    from google.colab import files
except ImportError:
    print("google.colab is not available; skipping browser download of 'cleaned_data.csv'")
else:
    files.download('cleaned_data.csv')