In [None]:
import pandas as pd
import numpy as np
import os
import tarfile

In [None]:
# Directory that receives every CSV and archive written by this notebook
data_out_path = '../../data/Elliptic/'
# Directory the raw Elliptic CSV files are read from
data_in_path = '../../data/Elliptic/elliptic_bitcoin_dataset/'

## Step 1: Create a file named `elliptic_txs_orig2contiguos.csv` and modify `elliptic_txs_features.csv`.

First, read the original dataset:

In [None]:
# Load the raw feature table; it is read without a header row, so columns are numbered
features_csv = os.path.join(data_in_path, 'elliptic_txs_features.csv')
txs_features = pd.read_csv(features_csv, header=None)
txs_features

Next, create a new dataframe that stores the original ID (in the first column) and the contiguous ID (which is simply the line number):

In [None]:
# Pair each original transaction id (first feature column) with its row position
original_ids = txs_features.iloc[:, 0]
id_mapping_df = pd.DataFrame({
    'originalId': original_ids,
    'contiguousId': range(len(original_ids)),
})

# Persist the lookup table (header row kept, index column omitted)
mapping_csv = os.path.join(data_out_path, 'elliptic_txs_orig2contiguos.csv')
id_mapping_df.to_csv(mapping_csv, index=False)

Finally, modify the original dataframe by replacing the first column with the line number and converting the first two columns to float:

In [None]:
# Labels of the first two columns. The file was read with header=None, so the
# labels are looked up rather than hard-coded.
id_col, timestep_col = txs_features.columns[:2]

# Replace the whole columns by label instead of writing through `.iloc[:, i] = ...`:
# positional assignment first tries to write into the existing column in place,
# which can keep an integer dtype and silently skip the float conversion.
txs_features[id_col] = id_mapping_df['contiguousId'].astype(float)  # contiguous id as float
txs_features[timestep_col] = txs_features[timestep_col].astype(float)  # second column as float

# Save the modified dataframe to a new csv file (no header row, no index column)
txs_features.to_csv(os.path.join(data_out_path, 'modified_elliptic_txs_features.csv'), index=False, header=None)

This should create two new CSV files:

* `elliptic_txs_orig2contiguos.csv`: This file contains the mapping from the original ID to the contiguous ID.

* `modified_elliptic_txs_features.csv`: This file is a modified version of your original dataset, where the first number in each line is replaced by the line number (converted to a float), and the second number is also converted to a float.

In [None]:
# Reload the saved id mapping to inspect what was written
mapping_csv_path = os.path.join(data_out_path, 'elliptic_txs_orig2contiguos.csv')
txs_orig2contiguos = pd.read_csv(mapping_csv_path)
txs_orig2contiguos

In [None]:
# Reload the modified feature table; it was saved without a header row
modified_features_csv = os.path.join(data_out_path, 'modified_elliptic_txs_features.csv')
modified_txs_features = pd.read_csv(modified_features_csv, header=None)
modified_txs_features

## Step 2: Modify `elliptic_txs_classes.csv`

In [None]:
# Load the raw class labels (this file has a header row)
classes_csv_in = os.path.join(data_in_path, 'elliptic_txs_classes.csv')
df_classes = pd.read_csv(classes_csv_in)
df_classes

Replace the `txId` values with the corresponding contiguous ids from our `id_mapping_df` dataframe:

In [None]:
# Attach the contiguous id to every class row by matching txId against originalId;
# the left join keeps all class rows
df_classes = pd.merge(
    df_classes,
    id_mapping_df,
    how='left',
    left_on='txId',
    right_on='originalId',
)
df_classes

In [None]:
# Both original-id columns are redundant now that contiguousId is attached
df_classes = df_classes.drop(columns=['txId', 'originalId'])
df_classes

In [None]:
# The contiguous id takes over the txId column name
df_classes = df_classes.rename(columns={'contiguousId': 'txId'})
df_classes

In [None]:
# Keep only the id and label columns, with the id first
ordered_columns = ['txId', 'class']
df_classes = df_classes[ordered_columns]
df_classes.head()

Then, replace the class labels with numeric values (-1.0 for 'unknown', 1.0 for '1', and 0 for '2'):

In [None]:
# Numeric encoding of the raw labels: 'unknown' -> -1.0, '1' -> 1.0, '2' -> 0
class_mapping = {'unknown': -1.0, '1': 1.0, '2': 0}

# Compare labels as strings so the lookup matches the string keys above
raw_labels = df_classes['class'].astype(str)

# `.map` silently turns unmapped labels into NaN (e.g. when this cell is run a
# second time on already-converted values); fail loudly instead.
unexpected_labels = sorted(set(raw_labels) - set(class_mapping))
if unexpected_labels:
    raise ValueError(f"Unexpected class labels: {unexpected_labels}")

# Build a new frame rather than assigning into `df_classes['class']`: df_classes
# may be a column selection of another frame, where item assignment triggers
# pandas' chained-assignment warning.
df_classes = df_classes.assign(**{'class': raw_labels.map(class_mapping)})
df_classes.head()

Finally, save the modified classes data to a new CSV file:

In [None]:
# Write the id/label table (header row kept, index column omitted)
classes_csv_out = os.path.join(data_out_path, 'modified_elliptic_txs_classes.csv')
df_classes.to_csv(classes_csv_out, index=False)

This should create a new CSV file `modified_elliptic_txs_classes.csv`. In this file, the `txId` values are replaced by the contiguous ids, and the class values are converted to numeric values.

In [None]:
# Reload the saved class table to inspect what was written
modified_classes_csv = os.path.join(data_out_path, 'modified_elliptic_txs_classes.csv')
modified_txs_classes = pd.read_csv(modified_classes_csv)
modified_txs_classes

## Step 3: Create a file named `elliptic_txs_nodetime.csv`

Continuing from the previous steps, we now use the `modified_txs_features` dataframe (the modified features re-read from `modified_elliptic_txs_features.csv` in Step 1) to extract the txId and timestep data.

Let's create the `elliptic_txs_nodetime.csv` file:

In [None]:
# Display the feature table that was re-read from modified_elliptic_txs_features.csv
modified_txs_features

In [None]:
# Node/time table: the contiguous node id plus the timestep reduced by one
node_ids = modified_txs_features.iloc[:, 0].astype(int)
shifted_timesteps = modified_txs_features.iloc[:, 1].sub(1).astype(int)
df_nodetime = pd.DataFrame({'txId': node_ids, 'timestep': shifted_timesteps})

# Write the table alongside the other generated files (no index column)
nodetime_csv = os.path.join(data_out_path, 'elliptic_txs_nodetime.csv')
df_nodetime.to_csv(nodetime_csv, index=False)

This will create a new CSV file `elliptic_txs_nodetime.csv`. The txId values in this file are the contiguous ids, and the timestep values are the timesteps from the second column of the features file, shifted down by 1.

In [None]:
# Reload the saved node/time table to inspect what was written
nodetime_csv_path = os.path.join(data_out_path, 'elliptic_txs_nodetime.csv')
elliptic_txs_nodetime = pd.read_csv(nodetime_csv_path)
elliptic_txs_nodetime

In [None]:
# List the distinct timestep values present in the saved node/time table
elliptic_txs_nodetime['timestep'].unique()

## Step 4: Modify elliptic_txs_edgelist.csv and rename it to elliptic_txs_edgelist_timed.csv

First, we read the edgelist data:

In [None]:
# Load the raw edge list (this file has a header row) and preview it
edgelist_csv_in = os.path.join(data_in_path, 'elliptic_txs_edgelist.csv')
df_edgelist = pd.read_csv(edgelist_csv_in)
df_edgelist.head()

Next, we replace the `txId1` and `txId2` values with the corresponding new ids from our `id_mapping_df` dataframe:

In [None]:
# Translate both endpoint columns from original ids to contiguous ids.
# For each endpoint: left-join the mapping on the original id, discard the
# original-id columns, and give the contiguous id the endpoint's column name.
for endpoint in ('txId1', 'txId2'):
    df_edgelist = (
        df_edgelist
        .merge(id_mapping_df, left_on=endpoint, right_on='originalId', how='left')
        .drop(columns=[endpoint, 'originalId'])
        .rename(columns={'contiguousId': endpoint})
    )

In [None]:
# Display the edge list after both endpoint columns were remapped to contiguous ids
df_edgelist

Then, we need to add a timestep column to the dataframe. We can extract this from the `df_nodetime` dataframe:

In [None]:
# Give each edge the timestep of its txId1 node: left-join the node/time table
# on txId1, discard the join key, and store the timestep as float
df_edgelist = (
    pd.merge(df_edgelist, df_nodetime, how='left', left_on='txId1', right_on='txId')
    .drop(columns=['txId'])
    .astype({'timestep': float})
)

In [None]:
# Write the timed edge list (header row kept, index column omitted)
edgelist_csv_out = os.path.join(data_out_path, 'elliptic_txs_edgelist_timed.csv')
df_edgelist.to_csv(edgelist_csv_out, index=False)

This should create a new CSV file `elliptic_txs_edgelist_timed.csv`. In this file, the `txId1` and `txId2` values are replaced by the new node ids, and a timestep column is added which holds the timestep of the edge's `txId1` node.

In [None]:
# Reload the saved timed edge list to inspect what was written
timed_edgelist_csv = os.path.join(data_out_path, 'elliptic_txs_edgelist_timed.csv')
elliptic_txs_edgelist_timed = pd.read_csv(timed_edgelist_csv)
elliptic_txs_edgelist_timed

## Final: move all files into `elliptic_compress` folder and turn it into a tar.gz file

Check each generated file for NaN entries (they are only reported, not removed)

In [None]:
files = [
    'elliptic_txs_edgelist_timed.csv',
    'elliptic_txs_nodetime.csv',
    'modified_elliptic_txs_classes.csv',
    'modified_elliptic_txs_features.csv',
]

# Report, for every generated file, whether it holds any missing value.
# Nothing is modified here: NaN entries are only reported, not dropped.
for file_name in files:
    # The features file is the only one read without a header row
    read_options = {'header': None} if file_name == 'modified_elliptic_txs_features.csv' else {}
    frame = pd.read_csv(os.path.join(data_out_path, file_name), **read_options)

    has_missing = frame.isna().to_numpy().any()
    if not has_missing:
        print(f"The file {file_name} does not contain any NaN values.")
    else:
        print(f"The file {file_name} contains NaN values.")

Compress into tar.gz

In [None]:
# Name of the archive; it is created inside data_out_path
output_filename = "elliptic_bitcoin_dataset_cont_updated.tar.gz"

files = [
    'elliptic_txs_edgelist_timed.csv',
    'elliptic_txs_nodetime.csv',
    'modified_elliptic_txs_classes.csv',
    'modified_elliptic_txs_features.csv',
]

archive_path = os.path.join(data_out_path, output_filename)
with tarfile.open(archive_path, "w:gz") as tar:
    # Add the generated files one by one, logging each name
    for item in files:
        print('item:', item)
        source_path = os.path.join(data_out_path, item)
        # Store the file under its bare name so the archive has no directory prefix
        tar.add(source_path, arcname=os.path.basename(item))