In [19]:
import pandas as pd
import numpy as np
from datetime import datetime

In [20]:
# Columns kept from the raw export; any other column in the CSV is dropped.
columns_to_read = [
    "symbol",
    "trade_ts",
    "price",
    "trade_conditions",
    "type",
    "volume",
]
# strptime pattern for `trade_ts`; only the disabled parsing step below refers to it.
timestamp_format = "%Y-%m-%d %H:%M:%S.%f%z"

# Read the whole file, then keep the selected columns in the order listed above.
df = pd.read_csv("data/finnhub1h.csv")[columns_to_read]

# `trade_ts` is left exactly as read from the CSV: it is neither parsed nor used
# as the index, because the two steps below are switched off.
# df["trade_ts"] = df["trade_ts"].apply(lambda a: datetime.strptime(a, timestamp_format))
# df = df.set_index("trade_ts")

In [21]:
# Function to introduce anomalies
def add_anomalies(input_df, columns, anomaly_ratio=0.2, random_state=None):
    """
    Overwrite a random subset of rows in the given columns with random values.

    For every column in ``columns`` an independent set of rows is picked
    (without replacement) and their values are replaced by draws from
    ``uniform(low, high)`` where, with ``span = max - min`` of that column,
    ``low = min(min - span, 0)`` and ``high = max + span``.

    NOTE(review): this interval also contains the observed ``[min, max]``
    range, so an injected value is not guaranteed to lie outside the
    original data range.

    Parameters:
    input_df (pd.DataFrame): Input DataFrame; it is not modified.
    columns (list): Numeric columns to introduce anomalies into.
    anomaly_ratio (float): Proportion of rows to have anomalies
        (0 < anomaly_ratio <= 1). At least one row and at most every row is
        altered per column.
    random_state (None | int | np.random.RandomState | np.random.Generator):
        Source of randomness. ``None`` (default) draws from NumPy's global
        ``np.random`` state, exactly as before; an int seeds a private
        ``RandomState`` so the result is reproducible.

    Returns:
    pd.DataFrame: Copy of ``input_df`` with anomalies written into ``columns``.
    """
    # Resolve the random source. The np.random module itself exposes the same
    # choice/uniform functions bound to the global state, which keeps the
    # default behaviour identical to calling np.random.* directly.
    if random_state is None:
        rng = np.random
    elif isinstance(random_state, (np.random.RandomState, np.random.Generator)):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    df_anomalous = input_df.copy()
    num_rows = len(input_df)
    if num_rows == 0:
        # Nothing to sample from; choice() would raise on an empty population.
        return df_anomalous

    # At least one anomaly, but never more than there are rows (sampling is
    # without replacement, so a larger request would raise).
    num_anomalies = min(num_rows, max(1, int(anomaly_ratio * num_rows)))

    for col in columns:
        min_val, max_val = input_df[col].min(), input_df[col].max()
        value_span = max_val - min_val

        # Pick row *positions* rather than index labels so that a frame with
        # repeated index labels still gets exactly num_anomalies rows replaced.
        anomaly_positions = rng.choice(num_rows, size=num_anomalies, replace=False)

        # Draw replacement values from the widened interval described above.
        anomalies = rng.uniform(min(min_val - value_span, 0),
                                max_val + value_span,
                                size=num_anomalies)
        df_anomalous.iloc[anomaly_positions, df_anomalous.columns.get_loc(col)] = anomalies

    return df_anomalous

In [22]:
# Adding anomalies
columns_to_anomalize = ['price', 'volume']

# Seed NumPy's global RNG right before the stochastic step so the injected
# anomalies, the summary printed below and the CSV written at the end are the
# same on every Restart & Run All.
ANOMALY_SEED = 42
np.random.seed(ANOMALY_SEED)
df_with_anomalies = add_anomalies(df, columns_to_anomalize, anomaly_ratio=0.04)

# Display original and anomalous DataFrames
print("Original DataFrame:")
print(df.describe())
print("\nDataFrame with Anomalies:")
print(df_with_anomalies.describe())

# to_csv is called with its default index=True, so the row index is written
# as the first, unnamed column of the file.
df_with_anomalies.to_csv('data/finnhub_anomalies.csv')

Original DataFrame:
               price  trade_conditions        volume
count   41004.000000               0.0  41004.000000
mean   101566.731830               NaN      0.016646
std       229.775165               NaN      0.074600
min    101056.000000               NaN      0.000010
25%    101367.787500               NaN      0.000240
50%    101620.010000               NaN      0.000820
75%    101775.992500               NaN      0.005900
max    101898.000000               NaN      6.602400

DataFrame with Anomalies:
               price  trade_conditions        volume
count   41004.000000               0.0  41004.000000
mean    99593.300784               NaN      0.150706
std     11323.244802               NaN      1.327159
min        31.299671               NaN     -6.573366
25%    101328.640000               NaN      0.000230
50%    101592.510000               NaN      0.000880
75%    101768.350000               NaN      0.007060
max    102732.810421               NaN     13.181165