Step 1: Import Required Libraries

In [3]:
import pandas as pd
import numpy as np
from pathlib import Path

Step 2: Define Paths and Create Data Directory

In [4]:
# Every file this notebook reads or writes lives under one relative folder.
data_dir = Path("data")

# Create the folder up front; an already-existing folder is not an error.
data_dir.mkdir(parents=True, exist_ok=True)

# Output locations: the full merged dataset and the sampled dataset.
merged_path = data_dir.joinpath("merged_data.csv")
sample_path = data_dir.joinpath("historical_data_sample.csv")

Step 3: Load Data Files

In [5]:
# Read both input CSVs from data_dir. If a file is missing, print a hint about
# where the files are expected and re-raise so the failure is not hidden.
try:
    df_hist = pd.read_csv(data_dir.joinpath("historical_data.csv"))
    df_fg = pd.read_csv(data_dir.joinpath("fear_greed_index.csv"))
except FileNotFoundError as missing:
    print(
        f"Error: {missing}",
        "Please ensure the CSV files are placed in the 'data/' directory.",
        sep="\n",
    )
    raise

Step 4: Inspect Data

In [6]:
# Structural overview of both raw frames: column names first, then the
# dtype / non-null report.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
print("--- Historical Data ---")
print(df_hist.columns.tolist())
df_hist.info()
print("--- Fear & Greed ---")
print(df_fg.columns.tolist())
df_fg.info()

--- Historical Data ---
['Account', 'Coin', 'Execution Price', 'Size Tokens', 'Size USD', 'Side', 'Timestamp IST', 'Start Position', 'Direction', 'Closed PnL', 'Transaction Hash', 'Order ID', 'Crossed', 'Fee', 'Trade ID', 'Timestamp']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 211224 entries, 0 to 211223
Data columns (total 16 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Account           211224 non-null  object 
 1   Coin              211224 non-null  object 
 2   Execution Price   211224 non-null  float64
 3   Size Tokens       211224 non-null  float64
 4   Size USD          211224 non-null  float64
 5   Side              211224 non-null  object 
 6   Timestamp IST     211224 non-null  object 
 7   Start Position    211224 non-null  float64
 8   Direction         211224 non-null  object 
 9   Closed PnL        211224 non-null  float64
 10  Transaction Hash  211224 non-null  object 
 11  Order ID          211224 

Step 5: Convert Timestamp to Datetime Format

In [7]:
# Parse the 'Timestamp IST' strings (day-month-year hour:minute) into a new
# 'time' column. errors='coerce' turns values that do not match the format
# into NaT instead of raising.
df_hist['time'] = df_hist['Timestamp IST'].pipe(
    pd.to_datetime,
    errors='coerce',
    format='%d-%m-%Y %H:%M',
)

Step 6: Convert Columns to Numeric

In [8]:
# Columns that must be numeric for later calculations.
numeric_cols = ['Execution Price', 'Size USD', 'Closed PnL']

# Convert every listed column in one column-wise pass; values that cannot be
# parsed as numbers become NaN (errors='coerce') instead of raising.
df_hist[numeric_cols] = df_hist[numeric_cols].apply(pd.to_numeric, errors='coerce')

Step 7: Create Date Column for Merge

In [9]:
# Calendar-date key on the trade frame, used to join the two datasets by day.
# NOTE(review): 'time' is parsed from 'Timestamp IST', so this is the IST
# calendar date; confirm the Fear & Greed dates use the same day boundary.
df_hist['date'] = df_hist['time'].dt.date

# Build the matching key on the sentiment frame and give its label column a
# descriptive name. A non-mutating chain is used instead of inplace=True so
# re-running this cell never depends on what a previous run left behind.
df_fg = (
    df_fg
    .assign(date=lambda fg: pd.to_datetime(fg['date']).dt.date)
    .rename(columns={'classification': 'Sentiment'})
)

In [13]:
# Localize the naive 'time' column to IST (Asia/Kolkata), then convert it to UTC.

In [14]:
# 'time' holds timezone-naive values parsed from 'Timestamp IST': first attach
# the Asia/Kolkata zone, then express the same instants in UTC.
df_hist['time_ist'] = df_hist['time'].dt.tz_localize(tz='Asia/Kolkata')
df_hist['timestamp_utc'] = df_hist['time_ist'].dt.tz_convert(tz='UTC')

# Print the three representations side by side for the first rows.
print(df_hist.loc[:, ['time', 'time_ist', 'timestamp_utc']].head())

                 time                  time_ist             timestamp_utc
0 2024-12-02 22:50:00 2024-12-02 22:50:00+05:30 2024-12-02 17:20:00+00:00
1 2024-12-02 22:50:00 2024-12-02 22:50:00+05:30 2024-12-02 17:20:00+00:00
2 2024-12-02 22:50:00 2024-12-02 22:50:00+05:30 2024-12-02 17:20:00+00:00
3 2024-12-02 22:50:00 2024-12-02 22:50:00+05:30 2024-12-02 17:20:00+00:00
4 2024-12-02 22:50:00 2024-12-02 22:50:00+05:30 2024-12-02 17:20:00+00:00


Step 9: Save Merged Data to CSV

In [20]:
# NOTE(review): no visible cell creates df_merged, so this cell raised a
# NameError after Restart Kernel -> Run All. The merge is rebuilt here from the
# 'date' keys and 'Sentiment' column prepared in Step 7; confirm the column
# selection matches the intended (missing) Step 8.
# A left join keeps every trade row. validate= raises if one date maps to more
# than one sentiment row, which would otherwise silently duplicate trades.
df_merged = df_hist.merge(
    df_fg[['date', 'Sentiment']],
    on='date',
    how='left',
    validate='many_to_one',
)

# Persist the merged frame without the index column and report where it went.
df_merged.to_csv(merged_path, index=False)
print(f"Saved full merged dataset to: {merged_path}")

Saved full merged dataset to: data/merged_data.csv


Step 10: Create Sample Dataset

In [17]:
# Sample size and seed are named once here instead of being repeated as magic
# numbers in the logic and in the warning text below.
SAMPLE_SIZE = 10_000
SAMPLE_SEED = 42

# Draw a reproducible random sample of the historical trades; when the frame
# has fewer rows than requested, fall back to a copy of the whole frame.
if len(df_hist) >= SAMPLE_SIZE:
    sample_df = df_hist.sample(n=SAMPLE_SIZE, random_state=SAMPLE_SEED)
else:
    sample_df = df_hist.copy()
    print(f"Warning: Less than {SAMPLE_SIZE:,} rows in dataset, using full data for sample.")

# Write the sample without the index column and report where it went.
sample_df.to_csv(sample_path, index=False)
print(f"Saved sample to: {sample_path}")

Saved sample to: data/historical_data_sample.csv
