Part A — Data preparation

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Show every column when a DataFrame is rendered (None removes the column cap).
pd.options.display.max_columns = None


In [3]:
# Load both raw inputs in one step: the trade history and the fear/greed index.
# Paths are relative to the notebook's working directory.
trades, sentiment = (
    pd.read_csv('../data/raw/historical_data.csv'),
    pd.read_csv('../data/raw/fear_greed_index.csv'),
)


In [4]:
# Structural overview of both raw tables: (rows, columns) first, then the
# per-column dtype / non-null summary.
print("Trader Data Shape:", trades.shape)
print("Sentiment Data Shape:", sentiment.shape)

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() only appends a stray "None" to the output.
print("\nTrader Data Info:")
trades.info()

print("\nSentiment Data Info:")
sentiment.info()


Trader Data Shape: (211224, 16)
Sentiment Data Shape: (2644, 4)

Trader Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 211224 entries, 0 to 211223
Data columns (total 16 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Account           211224 non-null  object 
 1   Coin              211224 non-null  object 
 2   Execution Price   211224 non-null  float64
 3   Size Tokens       211224 non-null  float64
 4   Size USD          211224 non-null  float64
 5   Side              211224 non-null  object 
 6   Timestamp IST     211224 non-null  object 
 7   Start Position    211224 non-null  float64
 8   Direction         211224 non-null  object 
 9   Closed PnL        211224 non-null  float64
 10  Transaction Hash  211224 non-null  object 
 11  Order ID          211224 non-null  int64  
 12  Crossed           211224 non-null  bool   
 13  Fee               211224 non-null  float64
 14  Trade ID          211224 non-null

None


Sentiment Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2644 entries, 0 to 2643
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   timestamp       2644 non-null   int64 
 1   value           2644 non-null   int64 
 2   classification  2644 non-null   object
 3   date            2644 non-null   object
dtypes: int64(2), object(2)
memory usage: 82.8+ KB


None

In [5]:
# Trades table: null count per column, and number of rows that exactly
# repeat an earlier row.
trades_missing = trades.isnull().sum()
trades_duplicates = trades.duplicated().sum()

# Same two checks for the sentiment table.
sentiment_missing = sentiment.isnull().sum()
sentiment_duplicates = sentiment.duplicated().sum()

# Report: missing-value tables first, then the duplicate counts.
print("Trader missing values:\n", trades_missing)
print("\nSentiment missing values:\n", sentiment_missing)

print("\nTrader duplicates:", trades_duplicates)
print("Sentiment duplicates:", sentiment_duplicates)


Trader missing values:
 Account             0
Coin                0
Execution Price     0
Size Tokens         0
Size USD            0
Side                0
Timestamp IST       0
Start Position      0
Direction           0
Closed PnL          0
Transaction Hash    0
Order ID            0
Crossed             0
Fee                 0
Trade ID            0
Timestamp           0
dtype: int64

Sentiment missing values:
 timestamp         0
value             0
classification    0
date              0
dtype: int64

Trader duplicates: 0
Sentiment duplicates: 0


In [None]:
# Show the column labels of the trades table (bare last expression, so the
# Index is rendered as the cell output).
trades.columns


Index(['Account', 'Coin', 'Execution Price', 'Size Tokens', 'Size USD', 'Side',
       'Timestamp IST', 'Start Position', 'Direction', 'Closed PnL',
       'Transaction Hash', 'Order ID', 'Crossed', 'Fee', 'Trade ID',
       'Timestamp'],
      dtype='object')

In [9]:
# Show the column labels of the sentiment table (bare last expression, so the
# Index is rendered as the cell output).
sentiment.columns

Index(['timestamp', 'value', 'classification', 'date'], dtype='object')

Convert Trade Timestamp to Datetime

In [10]:
# Convert UNIX timestamp (milliseconds) to datetime
# errors='coerce' turns values that cannot be converted into NaT instead of raising.
# NOTE(review): the preview of this column shows exactly 1730000000000.0 on each
# of the first five rows, so 'Timestamp' looks rounded in the raw file. If that is
# the case, many trades collapse onto the same instant and calendar date. The
# 'Timestamp IST' column may be the more precise source - confirm its format and
# timezone handling before switching.
trades['trade_time'] = pd.to_datetime(trades['Timestamp'], unit='ms', errors='coerce')

# Extract date
# .dt.date keeps only the calendar-date part of trade_time; the result is stored
# in a new 'date' column on the same frame (in-place mutation of `trades`).
trades['date'] = trades['trade_time'].dt.date


In [11]:
# Preview the raw timestamp next to the two derived columns (first five rows).
trades.loc[:, ['Timestamp', 'trade_time', 'date']].head(5)


Unnamed: 0,Timestamp,trade_time,date
0,1730000000000.0,2024-10-27 03:33:20,2024-10-27
1,1730000000000.0,2024-10-27 03:33:20,2024-10-27
2,1730000000000.0,2024-10-27 03:33:20,2024-10-27
3,1730000000000.0,2024-10-27 03:33:20,2024-10-27
4,1730000000000.0,2024-10-27 03:33:20,2024-10-27


Convert Sentiment Date Column

In [12]:
# Parse the 'date' column and keep only its calendar-date part, overwriting the
# column in place. This is the same .dt.date representation used for
# trades['date'] - presumably so the two tables can be matched on date; confirm
# against the later join.
sentiment['date'] = pd.to_datetime(sentiment['date']).dt.date


In [13]:
sentiment.head()


Unnamed: 0,timestamp,value,classification,date
0,1517463000,30,Fear,2018-02-01
1,1517549400,15,Extreme Fear,2018-02-02
2,1517635800,40,Fear,2018-02-03
3,1517722200,24,Extreme Fear,2018-02-04
4,1517808600,11,Extreme Fear,2018-02-05


In [14]:
# Print the earliest and latest calendar date present in each table.
# NOTE(review): in the recorded output the trade range runs to 2025-06-15 while
# sentiment stops at 2025-05-02, so trades dated after the last sentiment date
# would have no sentiment match on a date join - verify how those rows are handled.
print("Trade data date range:")
print(trades['date'].min(), "to", trades['date'].max())

print("\nSentiment data date range:")
print(sentiment['date'].min(), "to", sentiment['date'].max())


Trade data date range:
2023-03-28 to 2025-06-15

Sentiment data date range:
2018-02-01 to 2025-05-02


In [15]:
# Spot-check five random trades. random_state pins the draw so the displayed
# rows are the same on every Restart & Run All instead of changing per run.
trades[['Account', 'Coin', 'Side', 'Closed PnL', 'date']].sample(5, random_state=42)


Unnamed: 0,Account,Coin,Side,Closed PnL,date
21852,0x4f93fead39b70a1824f981a54d4e55b278e9f760,ETH,BUY,0.0,2024-03-09
38239,0x75f7eeb85dc639d5e99c78f95393aa9a5f1170d4,TRUMP,BUY,38.5917,2025-06-15
127900,0x47add9a56df66b524d5e2c1993a43cde53b6ed85,SUI,BUY,15.56442,2025-02-19
16983,0x083384f897ee0f19899168e3b1bec365f52a9012,ETH,SELL,0.0,2025-02-19
26222,0x4f93fead39b70a1824f981a54d4e55b278e9f760,BTC,BUY,0.0,2025-02-19
