In [3]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import OneHotEncoder
import pickle

In [13]:
investments_pd = pd.read_csv('../syntheticDataGenerators/investment/invest_data_gemini-2.5-flash.csv', sep=';')
baskets_pd = pd.read_csv('../data/company_basket.csv', sep=';')
users_pd = pd.read_csv('../syntheticDataGenerators/user/swedish_users.csv')

# User data into numerical values

In [14]:
# First, drop user_name column but keep user_id
users_pd_processed = users_pd.drop(['user_name'], axis=1)

# Create age groups without encoding
def create_age_group(age):
    if age <= 20:
        return '<=20'
    elif age <= 30:
        return '21-30'
    elif age <= 40:
        return '31-40'
    elif age <= 50:
        return '41-50'
    elif age <= 60:
        return '51-60'
    elif age <= 70:
        return '61-70'
    elif age <= 80:
        return '71-80'
    else:
        return '>80'

# Apply age grouping
users_pd_processed['age_group'] = users_pd_processed['age'].apply(create_age_group)
users_pd_processed = users_pd_processed.drop('age', axis=1)  # Remove original age column

# Handle locations that are not in the predefined list
all_locations = [
    'Stockholm', 'Västra Götaland', 'Skåne', 'Uppsala', 'Östergötland', 
    'Västmanland', 'Södermanland', 'Jönköping', 'Halland', 'Västerbotten', 
    'Kalmar', 'Gävleborg', 'Kronoberg', 'Värmland', 'Örebro', 
    'Blekinge', 'Dalarna', 'Norrbotten', 'Västernorrland', 'Gotland', 
    'Jämtland', 'Other'
]

users_pd_processed['location'] = users_pd_processed['location'].apply(
    lambda x: x if x in all_locations else 'Other'
)

# View the processed dataframe
print(users_pd_processed)

# Step 1: Create a dataset where each row is a single purchase
investment_data = investments_pd.copy()

# Step 2: Merge with user features to create the final dataset
# Each row will be a purchase with the user's features
purchase_dataset = investment_data.merge(users_pd_processed, on='user_id', how='inner')

# Step 3: Split each user's purchases 80/20 for train/test
# Group the purchase dataset by user_id
user_groups = purchase_dataset.groupby('user_id')

# Initialize empty lists for train and test indices
train_indices = []
test_indices = []

# For each user, split their purchases 80/20
for user_id, user_data in user_groups:
    # Get indices for this user's purchases
    user_indices = user_data.index.tolist()
    
    # Randomly shuffle the indices
    np.random.seed(42)  # For reproducibility
    np.random.shuffle(user_indices)

    # Calculate split point (80% for training)
    split_idx = int(len(user_indices) * 0.8)

    # Add indices to train and test lists
    train_indices.extend(user_indices[:split_idx])
    test_indices.extend(user_indices[split_idx:])

# Create train and test datasets using the indices
train_data = purchase_dataset.loc[train_indices]
test_data = purchase_dataset.loc[test_indices]

# Create X and y for training data - using string values directly
X_train = train_data[['user_id', 'location', 'gender', 'education', 'invest_goal', 'age_group']]
y_train = train_data[['user_id', 'basket_name']]  # Use basket_name instead of basket_encoded

# Create X and y for test data - using string values directly
X_test = test_data[['user_id', 'location', 'gender', 'education', 'invest_goal', 'age_group']]
y_test = test_data[['user_id', 'basket_name']]  # Use basket_name instead of basket_encoded

     user_id         location  gender          education invest_goal age_group
0       1001      Västmanland  Female    Master's Degree   Long-term     31-40
1       1002            Skåne    Male        High School   Long-term     21-30
2       1003      Västmanland  Female    Master's Degree   Long-term     61-70
3       1004        Stockholm    Male        High School  Short-term     21-30
4       1005           Kalmar    Male  Bachelor's Degree  Short-term     21-30
..       ...              ...     ...                ...         ...       ...
995     1996            Skåne    Male        High School  Short-term     51-60
996     1997          Uppsala    Male        High School   Long-term     71-80
997     1998        Gävleborg    Male        High School  Short-term     31-40
998     1999        Jönköping    Male        High School   Long-term     61-70
999     2000  Västra Götaland  Female        High School   Long-term     41-50

[1000 rows x 6 columns]


In [15]:
X_train.to_csv('X_train.csv', index=False)
# Save the test features with user_id
X_test.to_csv('X_test.csv', index=False)
y_train.to_csv('y_train.csv', index=False)
y_test.to_csv('y_test.csv', index=False)
# Save the training labels
#pd.DataFrame(y_train, columns=['basket_encoded']).to_csv('y_train.csv', index=False)
# Save the test labels
#pd.DataFrame(y_test, columns=['basket_encoded']).to_csv('y_test.csv', index=False)

In [16]:
import pandas as pd
import numpy as np

def convert_basket_features(csv_path, basket_encoder):
    # Read the CSV file
    df = pd.read_csv(csv_path)
    
    # Get the basket names (first column)
    baskets = df.iloc[:, 0].tolist()
    
    # Initialize the result dataframe with basket names
    result_df = pd.DataFrame({'basket_name': baskets})
    
    result_df['basket_encoded'] = basket_encoder.transform(result_df['basket_name'])
    
    # Create mappings for different feature categories
    feature_categories = {
        'industry_name': [],
        'economic_sector_name': [],
        'Size Category': [],
        'volatility_category': []
    }
    
    # Identify columns for each category
    for col in df.columns[1:]:  # Skip the first column which is basket_name
        for category in feature_categories:
            if category in col:
                feature_categories[category].append(col)
    
    # Process each basket and assign numerical features
    for category, columns in feature_categories.items():
        if not columns:
            continue
            
        # Create a new column to store the feature value
        result_df[f'{category}_feature'] = 0
        
        # For each basket, find the column with maximum value
        for idx, row in df.iterrows():
            if len(columns) > 0:
                # Get values for this category
                values = row[columns].values
                
                # Find the maximum value index
                max_idx = np.argmax(values)
                
                if values[max_idx] > 0:  # Only if there's a positive value
                    # Get the feature name from the column
                    feature_name = columns[max_idx].replace(f'{category}_', '')
                    
                    # Create a mapping of feature names to numerical values if needed
                    # Here we simply use the index as the numerical feature
                    result_df.at[idx, f'{category}_feature'] = max_idx + 1  # +1 to avoid 0
                    
                    # Optionally store the feature name for reference
                    result_df.at[idx, f'{category}_name'] = feature_name
    
    return result_df

# Path to CSV file
csv_path = '../basket_features.csv'

# Convert features
result = convert_basket_features(csv_path, basket_encoder)

print(result.head())

# Save result of numerical basket features to new CSV file
result.to_csv('basket_numerical_features.csv', index=False)
print("Saved numerical features to 'basket_numerical_features.csv'")

NameError: name 'basket_encoder' is not defined