# Set Up

In [1]:
import os
import pandas as pd
from datetime import datetime
import json

# Transform Data

In [2]:
# Setup Directories
# The machine-specific locations are kept as defaults, but each one can be
# overridden with an environment variable (COIN_RAW_DATA_DIR /
# COIN_PROCESSED_DATA_DIR) so the notebook can run elsewhere without editing
# this cell.
RAW_DATA_DIR = os.environ.get(
    "COIN_RAW_DATA_DIR",
    r"D:\HTP\Project-personal\CoinAnalytics\Main\data\raw",
)
PROCESSED_DATA_DIR = os.environ.get(
    "COIN_PROCESSED_DATA_DIR",
    r"D:\HTP\Project-personal\CoinAnalytics\Main\data\processed",
)

# Create the processed directory if it does not exist yet
os.makedirs(PROCESSED_DATA_DIR, exist_ok=True)

In [None]:
# List the .csv files in the raw directory.
# os.listdir returns entries in arbitrary order, so the list is sorted to keep
# the processing order reproducible between runs; the extension is matched
# case-insensitively so files named "*.CSV" are picked up as well.
csv_files = sorted(f for f in os.listdir(RAW_DATA_DIR) if f.lower().endswith('.csv'))
print("CSV Files:", csv_files)

In [None]:
# Read every raw CSV into memory, keeping each source file name next to its frame
all_dataframes = []
for file_name in csv_files:
    file_path = os.path.join(RAW_DATA_DIR, file_name)
    df = pd.read_csv(file_path)
    all_dataframes += [(file_name, df)]
    # Echo the file name followed by its first row as a quick sanity check
    print(f"Loaded {file_name}:", df.head(1), sep="\n")

# Remove Unnecessary Columns

In [5]:
# Filter Columns
# Only these columns are carried into the rest of the pipeline.
KEEP_COLS = ['timestamp', 'open', 'high', 'low', 'close', 'volume']
filtered_dataframes = []

for file_name, df in all_dataframes:
    missing_cols = [col for col in KEEP_COLS if col not in df.columns]
    if missing_cols:
        # Same exception type the column selection would raise, but the
        # message names the offending file.
        raise KeyError(f"{file_name} is missing required columns: {missing_cols}")
    # .copy() yields an independent frame, so later column assignments do not
    # raise SettingWithCopyWarning and never depend on the loaded raw frame.
    filtered_df = df[KEEP_COLS].copy()
    filtered_dataframes.append((file_name, filtered_df))

# Handle Missing Values

In [6]:
def handle_missing(df):
    """Fill missing values in numeric columns with the column median.

    Only columns whose dtype is ``float64`` or ``int64`` are considered; every
    other column is returned as it came in.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the missing numeric values filled.
    """
    filled = df.copy()
    for col in filled.select_dtypes(include=['float64', 'int64']):
        if filled[col].isnull().any():
            # Assign the result back instead of calling
            # df[col].fillna(..., inplace=True): the in-place call on a selected
            # column is chained assignment and leaves the frame unchanged when
            # pandas Copy-on-Write is active.
            filled[col] = filled[col].fillna(filled[col].median())
    return filled

# Run the missing-value handler over every (file name, frame) pair
cleaned_dataframes = [
    (name, handle_missing(frame))
    for name, frame in filtered_dataframes
]


In [7]:
# Data Preprocessing
def preprocess_data(df):
    """Convert ``timestamp`` to datetime and the price/volume columns to numbers.

    Values that cannot be parsed become ``NaT`` (timestamp) or ``NaN``
    (numeric columns) because both conversions use ``errors='coerce'``.
    NOTE(review): these coerced gaps appear only after this step, so any
    missing-value handling that ran earlier will not have filled them.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``timestamp``, ``open``, ``high``, ``low``, ``close``
        and ``volume`` columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the converted columns.
    """
    numeric_cols = ['open', 'high', 'low', 'close', 'volume']

    converted = df.copy()
    # Whole-column assignment replaces the column, so the new dtype takes
    # effect. Writing through .loc[:, col] instead sets the values into the
    # existing column, which leaves an object column as object dtype.
    converted['timestamp'] = pd.to_datetime(converted['timestamp'], errors='coerce')

    for col in numeric_cols:
        converted[col] = pd.to_numeric(converted[col], errors='coerce')

    return converted

# Apply preprocess_data to every dataframe
processed_dataframes = [
    (name, preprocess_data(frame))
    for name, frame in cleaned_dataframes
]


# Add Indicators

In [None]:
def add_indicators(df):
    """Return a copy of ``df`` with four derived price columns appended.

    Added columns, in order:

    * ``price_difference`` - ``close - open``
    * ``price_percentage_change`` - ``(close - open) / open * 100``; NaN where
      ``open`` is zero
    * ``mid_price`` - ``(high + low) / 2``
    * ``vwap`` - running sum of ``close * volume`` divided by the running sum
      of ``volume``, accumulated from the first row of the frame

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``open``, ``high``, ``low``, ``close`` and ``volume``
        columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the original columns plus the indicators.
    """
    price_difference = df['close'] - df['open']
    # A zero open would turn the percentage into +/-inf; masking it makes the
    # division yield NaN for those rows instead.
    nonzero_open = df['open'].where(df['open'] != 0)
    # assign() builds a new frame, so the caller's frame is left untouched.
    return df.assign(
        price_difference=price_difference,
        price_percentage_change=(price_difference / nonzero_open) * 100,
        mid_price=(df['high'] + df['low']) / 2,
        vwap=(df['close'] * df['volume']).cumsum() / df['volume'].cumsum(),
    )

# Attach the indicator columns to every preprocessed frame
final_dataframes = [
    (name, add_indicators(frame))
    for name, frame in processed_dataframes
]


# Save

In [9]:
# Save Processed Data
def save_to_csv(dataframes, output_dir):
    """Write each frame to ``<output_dir>/<token>_processed.csv``.

    ``<token>`` is the source file name without its extension, cut at the
    first underscore.
    NOTE(review): inputs that share the same token prefix map to the same
    output path, so a later one overwrites an earlier one.

    Parameters
    ----------
    dataframes : iterable of (str, pandas.DataFrame)
        Pairs of source file name and the frame to write.
    output_dir : str
        Destination directory; created if it does not exist.
    """
    os.makedirs(output_dir, exist_ok=True)
    for file_name, df in dataframes:
        # Drop the extension before splitting so a name without an underscore
        # (e.g. "BTC.csv") yields "BTC" rather than "BTC.csv".
        token_name = os.path.splitext(file_name)[0].split('_')[0]
        output_path = os.path.join(output_dir, f"{token_name}_processed.csv")
        df.to_csv(output_path, index=False)
        print(f"Saved {token_name} data to {output_path}")

# Write every finished frame into the processed-data directory
save_to_csv(dataframes=final_dataframes, output_dir=PROCESSED_DATA_DIR)


Saved ALICEUSDT data to D:\HTP\Project-personal\CoinAnalytics\Main\data\processed\ALICEUSDT_processed.csv
Saved AXSUSDT data to D:\HTP\Project-personal\CoinAnalytics\Main\data\processed\AXSUSDT_processed.csv
Saved ENJUSDT data to D:\HTP\Project-personal\CoinAnalytics\Main\data\processed\ENJUSDT_processed.csv
Saved GALAUSDT data to D:\HTP\Project-personal\CoinAnalytics\Main\data\processed\GALAUSDT_processed.csv
Saved MANAUSDT data to D:\HTP\Project-personal\CoinAnalytics\Main\data\processed\MANAUSDT_processed.csv
Saved SANDUSDT data to D:\HTP\Project-personal\CoinAnalytics\Main\data\processed\SANDUSDT_processed.csv
Saved SUSHIUSDT data to D:\HTP\Project-personal\CoinAnalytics\Main\data\processed\SUSHIUSDT_processed.csv
Saved YGGUSDT data to D:\HTP\Project-personal\CoinAnalytics\Main\data\processed\YGGUSDT_processed.csv


# Transform JSON Data

In [10]:
# Define paths. The directories come from the setup cell instead of repeating
# the absolute locations here, so the CSV and JSON steps always agree.
SOURCE_JSON = os.path.join(RAW_DATA_DIR, "token_info.json")
OUTPUT_JSON = os.path.join(PROCESSED_DATA_DIR, "token_info.json")

# Load JSON data
with open(SOURCE_JSON, "r", encoding="utf-8") as file:
    json_data = json.load(file)

# Transform JSON data
for token in json_data:
    # Handle empty fields: a value starting with the "Không có" placeholder
    # (Vietnamese for "not available") is replaced with "blank".
    for field in ["Website", "Technical Doc", "Twitter", "Reddit", "Logo"]:
        value = token.get(field)
        # Only strings can carry the placeholder; other value types are left
        # alone instead of failing on .startswith.
        if isinstance(value, str) and value.startswith("Không có"):
            token[field] = "blank"

    # Remove the trailing "More information can be found at ..." part (and the
    # URL after it) from Description. A missing or non-string Description is
    # skipped rather than raising.
    description = token.get("Description")
    if isinstance(description, str) and "More information can be found at" in description:
        token["Description"] = description.split("More information can be found at")[0].strip()

# Sort the list alphabetically by "Token"
json_data = sorted(json_data, key=lambda x: x["Token"])

# Save transformed JSON data
with open(OUTPUT_JSON, "w", encoding="utf-8") as file:
    json.dump(json_data, file, ensure_ascii=False, indent=4)

print("Transformed JSON data saved.")


Transformed JSON data saved.
