In [None]:
import pandas as pd
import ta
import os
import numpy as np
import logging


logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

input_folder = r"C:\Users\Kasper Hassing\Desktop\Speciale_KryptoSentiment\data\market_data"
output_folder = input_folder  # Saved to the same folder for now

# Column layout applied to CSV files that come without a header row.
OHLCV_COLUMNS = ['timestamp', 'open', 'high', 'low', 'close', 'volume']


def load_ohlcv(file_path):
    """Read an OHLCV CSV file that may or may not have a header row.

    ``pd.read_csv`` does not fail on a headerless file; it silently promotes
    the first data row to column names. The header is therefore detected by
    checking that every expected OHLCV column name is present. If any is
    missing, the file is re-read with ``header=None`` and the names in
    ``OHLCV_COLUMNS`` are applied.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A DataFrame containing at least the columns in ``OHLCV_COLUMNS``.
        The ``timestamp`` column is returned as read (not yet converted).

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If pandas cannot parse the file (this includes
            ``pd.errors.ParserError`` and ``pd.errors.EmptyDataError``).
    """
    df = pd.read_csv(file_path)
    if set(OHLCV_COLUMNS).issubset(df.columns):
        logging.info(f"Indlæste {len(df)} rækker data med headers")
        return df

    # Expected names are missing: treat the first row as data, not a header.
    df = pd.read_csv(file_path, header=None, names=OHLCV_COLUMNS)
    logging.info(f"Indlæste {len(df)} rækker data uden headers")
    return df


def add_technical_features(df):
    """Add technical indicators and return columns to ``df`` in place.

    Rows are assumed to be in chronological order, because every feature is
    either a rolling/recursive statistic or a row shift.
    NOTE(review): the ``*_1d/3d/7d`` names presume one row per day - confirm
    against the data source.

    Args:
        df: DataFrame with ``open``, ``high``, ``low``, ``close`` and
            ``volume`` columns.

    Returns:
        The same DataFrame object, with the feature columns added.
    """
    close = df['close']

    # Momentum
    df['rsi'] = ta.momentum.RSIIndicator(close=close, window=14).rsi()

    # Trend
    df['ema_short'] = ta.trend.EMAIndicator(close=close, window=12).ema_indicator()
    df['ema_long'] = ta.trend.EMAIndicator(close=close, window=26).ema_indicator()

    # Volatility
    df['volatility_atr'] = ta.volatility.AverageTrueRange(
        high=df['high'], low=df['low'], close=close, window=14
    ).average_true_range()

    # Bollinger band width, normalised by the middle band.
    bollinger = ta.volatility.BollingerBands(close=close, window=20, window_dev=2)
    df['bb_width'] = (
        bollinger.bollinger_hband() - bollinger.bollinger_lband()
    ) / bollinger.bollinger_mavg()

    # Volume
    df['obv'] = ta.volume.OnBalanceVolumeIndicator(
        close=close, volume=df['volume']
    ).on_balance_volume()
    # Volume relative to its own 20-row rolling mean.
    df['volume_norm'] = df['volume'] / df['volume'].rolling(window=20).mean()

    # MACD
    macd = ta.trend.MACD(close=close, window_fast=12, window_slow=26, window_sign=9)
    df['macd'] = macd.macd()
    df['macd_signal'] = macd.macd_signal()
    df['macd_hist'] = macd.macd_diff()

    # Backward-looking price changes over 1, 3 and 7 rows.
    df['return_1d'] = close.pct_change(1)
    df['return_3d'] = close.pct_change(3)
    df['return_7d'] = close.pct_change(7)

    # Forward-looking returns: the k-row change shifted back by k rows, so
    # row t holds close[t + k] / close[t] - 1. These look ahead in time.
    # NOTE(review): presumably prediction targets - do not use as inputs.
    df['future_return_1d'] = close.pct_change(1).shift(-1)
    df['future_return_3d'] = close.pct_change(3).shift(-3)
    df['future_return_7d'] = close.pct_change(7).shift(-7)

    return df


coins = ['BTCUSDT']
for coin in coins:
    file_path = os.path.join(input_folder, f'{coin}_ohlcv.csv')
    logging.info(f"Indlæser data fra {file_path}")

    # Check that the file exists
    if not os.path.exists(file_path):
        logging.error(f"Filen {file_path} eksisterer ikke!")
        continue

    # Load the data, with or without a header row
    try:
        df = load_ohlcv(file_path)
    except (OSError, ValueError) as e:
        logging.error(f"Kunne ikke indlæse fil: {e}")
        continue

    # Convert timestamp to datetime
    try:
        df['timestamp'] = pd.to_datetime(df['timestamp'])
    except (ValueError, TypeError):
        # Retry with an explicit YYYY-MM-DD format if automatic parsing fails
        try:
            df['timestamp'] = pd.to_datetime(df['timestamp'], format='%Y-%m-%d')
        except (ValueError, TypeError) as e:
            logging.error(f"Kunne ikke konvertere timestamp: {e}")
            continue

    if df.empty:
        # Nothing to compute indicators on; skip instead of writing empty files.
        logging.warning(f"Filen {file_path} indeholder ingen rækker")
        continue

    # Rolling/EMA/shift features are only meaningful on chronological rows.
    if not df['timestamp'].is_monotonic_increasing:
        logging.warning("Data var ikke sorteret kronologisk - sorterer efter timestamp")
        df = df.sort_values('timestamp', kind='stable').reset_index(drop=True)

    # Add a date column
    df['date'] = df['timestamp'].dt.date

    logging.info(f"Data dækker perioden: {df['date'].min()} til {df['date'].max()}")
    logging.info(f"Antal rækker: {len(df)}")

    # Compute all technical indicators
    logging.info("Beregner tekniske indikatorer...")
    df = add_technical_features(df)

    # Keep every row, but report where NaN values occur
    nan_rows = df.isna().any(axis=1).sum()
    if nan_rows > 0:
        logging.warning(f"{nan_rows} rækker indeholder NaN-værdier")

        # For analysis, count the NaN values in each column
        nan_columns = df.isna().sum()
        logging.info(f"NaN-værdier per kolonne:\n{nan_columns[nan_columns > 0]}")

    # Save all rows of the original file together with the new features
    output_path = os.path.join(output_folder, f'{coin}_features.csv')
    df.to_csv(output_path, index=False)
    logging.info(f"✅ Gemte fil med indikatorer: {output_path}")
    logging.info(f"Filen indeholder {len(df)} rækker")

    # Also save a version with every NaN-containing row removed
    clean_df = df.dropna().reset_index(drop=True)
    clean_output_path = os.path.join(output_folder, f'{coin}_features_clean.csv')
    clean_df.to_csv(clean_output_path, index=False)
    logging.info(f"✅ Gemte rens fil uden NaN-værdier: {clean_output_path}")
    logging.info(f"Den rene fil indeholder {len(clean_df)} rækker")

2025-05-09 01:48:44,452 - INFO - Indlæser data fra C:\Users\Kasper Hassing\Desktop\Speciale_KryptoSentiment\data\market_data\BTCUSDT_ohlcv.csv
2025-05-09 01:48:44,454 - INFO - Indlæste 486 rækker data med headers
2025-05-09 01:48:44,456 - INFO - Data dækker perioden: 2020-12-01 til 2022-03-31
2025-05-09 01:48:44,457 - INFO - Antal rækker: 486
2025-05-09 01:48:44,457 - INFO - Beregner tekniske indikatorer...
2025-05-09 01:48:44,472 - INFO - NaN-værdier per kolonne:
rsi                 13
ema_short           11
ema_long            25
bb_width            19
volume_norm         19
macd                25
macd_signal         33
macd_hist           33
return_1d            1
return_3d            3
return_7d            7
future_return_1d     1
future_return_3d     3
future_return_7d     7
dtype: int64
2025-05-09 01:48:44,484 - INFO - ✅ Gemte fil med indikatorer: C:\Users\Kasper Hassing\Desktop\Speciale_KryptoSentiment\data\market_data\BTCUSDT_features.csv
2025-05-09 01:48:44,484 - INFO 