In [11]:
import pandas as pd

def calculate_forecast_labels(df, threshold=0.05, price_col='5. adjusted close'):
    """Add 1- to 4-week-ahead direction labels to a price DataFrame.

    The frame is sorted by ``timestamp`` and, for each horizon of 1, 2, 3
    and 4 weeks, the relative change between the price 5, 10, 15 or 20 rows
    ahead and the current price is classified into a ``Forecast N Week``
    column.
    NOTE(review): 5 rows per week presumes one row per trading day - confirm.

    Labels:
        0 -- change <= -threshold (fell by the threshold or more)
        1 -- change strictly between -threshold and +threshold
        2 -- change >= +threshold (rose by the threshold or more)
        3 -- change is NaN (e.g. not enough future rows)

    Args:
        df: DataFrame with a ``timestamp`` column parseable by
            ``pd.to_datetime`` and a numeric price column.
        threshold: Relative move (0.05 == 5%) that counts as up/down.
        price_col: Name of the price column to compare.

    Returns:
        A new DataFrame sorted by ``timestamp`` with a fresh 0..n-1 index and
        the four ``Forecast N Week`` columns. The input frame is not modified.

    Raises:
        KeyError: If ``timestamp`` or ``price_col`` is missing.
    """
    # Build a new frame instead of assigning into `df`, so the caller's
    # DataFrame keeps its original 'timestamp' values and row order.
    labelled = (
        df.assign(timestamp=pd.to_datetime(df['timestamp']))
        .sort_values('timestamp')
        .reset_index(drop=True)
    )

    price = labelled[price_col]
    for weeks in range(1, 5):
        # Relative change between the price `5 * weeks` rows ahead and now.
        pct_change = (price.shift(-5 * weeks) - price) / price

        # Start from "unchanged" and overwrite the other classes. NaN compares
        # False against both thresholds, so it is handled last and explicitly.
        labels = pd.Series(1, index=labelled.index, dtype='int64')
        labels[pct_change <= -threshold] = 0
        labels[pct_change >= threshold] = 2
        labels[pct_change.isna()] = 3

        labelled[f'Forecast {weeks} Week'] = labels

    return labelled



In [12]:
# Try the labelling on a single CSV before running it over whole directories.
# NOTE(review): this is an absolute, machine-specific path; the cell will not
# run elsewhere unless the path is changed.
csv_path = '/Users/danielcaraballo/Desktop/TimeSeriesProject/data/raw_data/Technology/DFIN_Price.csv'
df = pd.read_csv(csv_path)
# Returns a frame sorted by 'timestamp' with the four 'Forecast N Week' columns.
df_with_forecasts = calculate_forecast_labels(df)


In [13]:
# Show the labelled frame as the cell output (pandas elides the middle rows).
df_with_forecasts

Unnamed: 0.1,Unnamed: 0,timestamp,1. open,2. high,3. low,4. close,5. adjusted close,6. volume,7. dividend amount,8. split coefficient,sector,Forecast 1 Week,Forecast 2 Week,Forecast 3 Week,Forecast 4 Week
0,1995,2016-10-03,28.17,28.5000,20.26,22.97,22.97,2353074.0,0.0,1.0,0,1,1,2,0
1,1994,2016-10-04,22.85,24.4500,22.34,22.39,22.39,1659857.0,0.0,1.0,0,1,2,2,0
2,1993,2016-10-05,22.12,22.8419,21.25,21.85,21.85,1474019.0,0.0,1.0,0,1,2,2,1
3,1992,2016-10-06,21.57,22.4400,21.26,21.87,21.87,999588.0,0.0,1.0,0,1,2,1,1
4,1991,2016-10-07,21.60,21.8500,20.73,20.95,20.95,1528846.0,0.0,1.0,0,2,2,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1991,4,2024-09-03,66.42,67.0000,64.99,66.15,66.15,234106.0,0.0,1.0,0,3,3,3,3
1992,3,2024-09-04,65.87,66.7800,65.64,66.23,66.23,94329.0,0.0,1.0,0,3,3,3,3
1993,2,2024-09-05,66.72,66.7200,64.74,65.46,65.46,85693.0,0.0,1.0,0,3,3,3,3
1994,1,2024-09-06,65.47,65.6800,64.43,64.66,64.66,108158.0,0.0,1.0,0,3,3,3,3


In [14]:
# Write the single-file result to the current working directory;
# index=False leaves the row index out of the CSV.
df_with_forecasts.to_csv('./forecasted_data.csv', index=False)

In [18]:
import os
import pandas as pd

def calculate_forecast_labels(df, threshold=0.05, price_col='5. adjusted close'):
    """Add 1- to 4-week-ahead direction labels to a price DataFrame.

    The frame is sorted by ``timestamp`` and, for each horizon of 1, 2, 3
    and 4 weeks, the relative change between the price 5, 10, 15 or 20 rows
    ahead and the current price is classified into a ``Forecast N Week``
    column.
    NOTE(review): 5 rows per week presumes one row per trading day - confirm.
    NOTE(review): a function with this name is also defined in an earlier
    cell of this notebook; whichever cell ran last is the one in effect.

    Labels:
        0 -- change <= -threshold (fell by the threshold or more)
        1 -- change strictly between -threshold and +threshold
        2 -- change >= +threshold (rose by the threshold or more)
        3 -- change is NaN (e.g. not enough future rows)

    Args:
        df: DataFrame with a ``timestamp`` column parseable by
            ``pd.to_datetime`` and a numeric price column.
        threshold: Relative move (0.05 == 5%) that counts as up/down.
        price_col: Name of the price column to compare.

    Returns:
        A new DataFrame sorted by ``timestamp`` with a fresh 0..n-1 index and
        the four ``Forecast N Week`` columns. The input frame is not modified.

    Raises:
        KeyError: If ``timestamp`` or ``price_col`` is missing.
    """
    # Build a new frame instead of assigning into `df`, so the caller's
    # DataFrame keeps its original 'timestamp' values and row order.
    labelled = (
        df.assign(timestamp=pd.to_datetime(df['timestamp']))
        .sort_values('timestamp')
        .reset_index(drop=True)
    )

    price = labelled[price_col]
    for weeks in range(1, 5):
        # Relative change between the price `5 * weeks` rows ahead and now.
        pct_change = (price.shift(-5 * weeks) - price) / price

        # Start from "unchanged" and overwrite the other classes. NaN compares
        # False against both thresholds, so it is handled last and explicitly.
        labels = pd.Series(1, index=labelled.index, dtype='int64')
        labels[pct_change <= -threshold] = 0
        labels[pct_change >= threshold] = 2
        labels[pct_change.isna()] = 3

        labelled[f'Forecast {weeks} Week'] = labels

    return labelled

def process_sector_csvs(root_dir, sectors, output_dir=None):
    """Add forecast labels to every CSV file in each sector directory.

    For each name in ``sectors``, every regular file ending in ``.csv``
    inside ``root_dir/<sector>`` is read, passed through
    ``calculate_forecast_labels`` and written out without the row index.
    One line is printed per file saved and per sector directory not found.

    Args:
        root_dir: Directory containing one sub-directory per sector.
        sectors: Iterable of sector sub-directory names.
        output_dir: Where to write the labelled CSVs, mirrored as
            ``output_dir/<sector>/<file>`` (created if needed). If None
            (the default), each file is written back over the CSV it was
            read from.

    Returns:
        None.
    """
    for sector in sectors:
        sector_dir = os.path.join(root_dir, sector)

        # isdir rather than exists: a regular file named like the sector
        # would pass an existence check and then make os.listdir raise.
        if not os.path.isdir(sector_dir):
            print(f"Sector directory not found: {sector_dir}")
            continue

        if output_dir is None:
            target_dir = sector_dir
        else:
            target_dir = os.path.join(output_dir, sector)
            os.makedirs(target_dir, exist_ok=True)

        # os.listdir order is arbitrary; sort so runs are reproducible.
        for csv_file in sorted(os.listdir(sector_dir)):
            csv_path = os.path.join(sector_dir, csv_file)

            # Skip non-CSV entries and directories that merely end in '.csv'.
            if not (csv_file.endswith('.csv') and os.path.isfile(csv_path)):
                continue

            df_with_forecasts = calculate_forecast_labels(pd.read_csv(csv_path))

            save_path = os.path.join(target_dir, csv_file)
            df_with_forecasts.to_csv(save_path, index=False)
            print(f"Processed and saved: {save_path}")

# Names of the per-sector sub-directories to process, one per line.
sectors = [
    'Technology',
    'Financial_Services',
    'Healthcare',
    'Consumer_Cyclical',
    'Industrials',
    'Communication_Services',
    'Consumer_Defensive',
    'Energy',
    'Real_Estate',
    'Basic_Materials',
    'Utilities',
]



In [None]:
# Root directory containing the sector subdirectories
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
root_dir = '/Users/danielcaraballo/Desktop/TimeSeriesProject/data/raw_data'

# Process and save the CSVs
# WARNING: this call writes each labelled CSV back to the path it was read
# from, so the files under raw_data are replaced by their labelled versions.
process_sector_csvs(root_dir, sectors)
