at this stage we need to transform the table to train the gan model

In [42]:
import pandas as pd
import ast
import numpy as np
from sklearn.preprocessing import MinMaxScaler

In [2]:
df = pd.read_csv('/transactions.csv')

In [3]:
df.head()

Unnamed: 0,subject,verb,object,time,amount,mask
0,User 40578,bought,book,2023-01-11,1524,"[1, 1, 1, 1, 1]"
1,User 94768,paid,laptop,2022-11-23,-1,"[1, 1, 1, 1, 0]"
2,User 65142,paid,service,2023-03-25,-1,"[1, 1, 1, 1, 0]"
3,User 10499,returned,book,2023-12-19,-1,"[1, 1, 1, 1, 0]"
4,User 89538,ordered,phone,2025-07-13,3949,"[1, 1, 1, 1, 1]"


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   subject  20000 non-null  object
 1   verb     20000 non-null  object
 2   object   20000 non-null  object
 3   time     20000 non-null  object
 4   amount   20000 non-null  int64 
 5   mask     20000 non-null  object
dtypes: int64(1), object(5)
memory usage: 937.6+ KB


1. Transformation of categorical attributes

In [5]:
df['user_id'] = df['subject'].str.replace('User ', '').astype(int)

In [6]:
df.drop('subject', axis=1, inplace=True)

An LSTM neural network layer could be used to encode the mask, but this would require more computational resources. So I'll focus on a simple separation.

In [7]:
# one-hot
df = pd.get_dummies(df, columns=['verb', 'object'], prefix_sep='_')

In [8]:
mask_columns = ['mask_0', 'mask_1', 'mask_2', 'mask_3', 'mask_4']

In [11]:
df['mask'] = df['mask'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

In [12]:
df[mask_columns] = pd.DataFrame(df['mask'].tolist(), index=df.index)

In [14]:
df.drop('mask', axis=1, inplace=True)

In [17]:
df.rename(columns={'mask_4':'mask_amount', 'mask_3':'mask_time'}, inplace=True)

In [19]:
df.drop(columns=['mask_0', 'mask_1', 'mask_2'], inplace=True)

In [20]:
df.head()

Unnamed: 0,time,amount,user_id,verb_bought,verb_cancelled,verb_ordered,verb_paid,verb_returned,verb_sold,object_accessory,object_book,object_laptop,object_phone,object_product,object_service,mask_time,mask_amount
0,2023-01-11,1524,40578,True,False,False,False,False,False,False,True,False,False,False,False,1,1
1,2022-11-23,-1,94768,False,False,False,True,False,False,False,False,True,False,False,False,1,0
2,2023-03-25,-1,65142,False,False,False,True,False,False,False,False,False,False,False,True,1,0
3,2023-12-19,-1,10499,False,False,False,False,True,False,False,True,False,False,False,False,1,0
4,2025-07-13,3949,89538,False,False,True,False,False,False,False,False,False,True,False,False,1,1


2. Date processing and amount

In [22]:
df['time'].replace('[PAD]', np.nan, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['time'].replace('[PAD]', np.nan, inplace=True)


In [30]:
df['datetime'] = pd.to_datetime(df['time'], errors='coerce')

In [34]:
df['unix_time'] = (df['datetime'].astype('int64') // 10**9).astype('Int64')

In [37]:
df.drop(columns=['time', 'datetime'], axis=1, inplace=True)

In [39]:
df['amount'].replace(-1, np.nan, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['amount'].replace(-1, np.nan, inplace=True)


In [43]:
valid_time = df['unix_time'].dropna()

In [44]:
scaler = MinMaxScaler()

In [45]:
df.loc[valid_time.index, 'unix_time_norm'] = scaler.fit_transform(valid_time.values.reshape(-1, 1))

In [47]:
df['unix_time_norm'] = df['unix_time_norm'].fillna(0)

In [49]:
df.drop('unix_time', axis=1, inplace=True)

In [51]:
df['amount_norm'] = df['amount'].copy()

In [52]:
valid_amounts = df['amount_norm'].dropna()

In [54]:
if not valid_amounts.empty:
  df.loc[valid_amounts.index, 'amount_norm'] = scaler.fit_transform(valid_amounts.values.reshape(-1, 1))

In [55]:
df.drop('amount', axis=1, inplace=True)

In [57]:
df['amount_norm'] = df['amount_norm'].fillna(0)

In [58]:
df.head()

Unnamed: 0,user_id,verb_bought,verb_cancelled,verb_ordered,verb_paid,verb_returned,verb_sold,object_accessory,object_book,object_laptop,object_phone,object_product,object_service,mask_time,mask_amount,unix_time_norm,amount_norm
0,40578,True,False,False,False,False,False,False,True,False,False,False,False,1,1,0.99147,0.652465
1,94768,False,False,False,True,False,False,False,False,True,False,False,False,1,0,0.991085,0.0
2,65142,False,False,False,True,False,False,False,False,False,False,False,True,1,0,0.992044,0.0
3,10499,False,False,False,False,True,False,False,True,False,False,False,False,1,0,0.994159,0.0
4,89538,False,False,True,False,False,False,False,False,False,True,False,False,1,1,0.998656,0.894989


I warn readers that there is no imbalance between classes, since I generated this data myself, I know

In [60]:
df.to_csv('training_data.csv')