In [8]:
import pandas as pd
import numpy as np
import os

def create_master_dataset():

    print("Starting Master Dataset Creation for Reliance Industries ") 

    prices_file = 'cleaned_reliance_reliance__prices.csv' # Base

    news_sentiment_file = 'reliance_news_with_sentiment.csv' 
    reddit_sentiment_file = 'reliance_reddit_with_sentiment.csv' 

    financial_files = {
        'profit_loss': 'cleaned_reliance_relianceidustriesprofitnloss.csv',
        'balance_sheet': 'cleaned_reliance_relianceindustries_balance.csv',
        'cash_flow': 'cleaned_reliance_relianceindustrues_cashflow.csv',
        'ratios': 'cleaned_reliance_relianceindustries_ratio.csv',
        'quarterly': 'cleaned_reliance_relianceindustriesquater.csv'
    }

    try:
        print(f"Loading base price data: {prices_file}")
        if not os.path.exists(prices_file):
            raise FileNotFoundError(f"Base price data file not found: {prices_file}")

        master_df = pd.read_csv(prices_file)
        master_df['date'] = pd.to_datetime(master_df['date'])
        master_df.sort_values('date', inplace=True)
        print(f"Base data loaded. Shape: {master_df.shape}")

        print("\nProcessing sentiment data...")

        if os.path.exists(news_sentiment_file):
            news_df = pd.read_csv(news_sentiment_file)
            news_df['date'] = pd.to_datetime(news_df['date'])
            news_agg = news_df.groupby('date')['sentiment_score'].mean().reset_index()
            news_agg = news_agg.rename(columns={'sentiment_score': 'news_sentiment_avg'})
            master_df = pd.merge(master_df, news_agg, on='date', how='left')
            print(f"News sentiment data merged from {news_sentiment_file}.")
        else:
            print(f"Warning: News sentiment file '{news_sentiment_file}' not found. Skipping news sentiment merge.")
            master_df['news_sentiment_avg'] = 0.0 

        if os.path.exists(reddit_sentiment_file):
            reddit_df = pd.read_csv(reddit_sentiment_file)
            reddit_df['date'] = pd.to_datetime(reddit_df['date'])
            reddit_agg = reddit_df.groupby('date')['sentiment_score'].mean().reset_index()
            reddit_agg = reddit_agg.rename(columns={'sentiment_score': 'reddit_sentiment_avg'})
            master_df = pd.merge(master_df, reddit_agg, on='date', how='left')
            print(f"Reddit sentiment data merged from {reddit_sentiment_file}.")
        else:
            print(f"Warning: Reddit sentiment file '{reddit_sentiment_file}' not found. Skipping Reddit sentiment merge.")
            master_df['reddit_sentiment_avg'] = 0.0 

        master_df['news_sentiment_avg'].fillna(0, inplace=True)
        master_df['reddit_sentiment_avg'].fillna(0, inplace=True)
        print("Sentiment data merge process completed.")
        print(f"Shape after sentiment merge: {master_df.shape}")

        print("\nProcessing and merging financial data...")
        for name, file_path in financial_files.items():
            print(f"- Merging {name} data from {file_path}")
            if os.path.exists(file_path):
                fin_df = pd.read_csv(file_path)
                fin_df['date'] = pd.to_datetime(fin_df['date'])
                fin_df.sort_values('date', inplace=True)

                cols_to_drop = [col for col in fin_df.columns if col in master_df.columns and col != 'date']
                fin_df = fin_df.drop(columns=cols_to_drop)


                master_df = pd.merge_asof(master_df, fin_df, on='date', direction='backward')
                print(f" Shape after merging {name}: {master_df.shape}")
            else:
                print(f" Warning: Financial file '{file_path}' not found. Skipping {name} merge.")

        print("\nFinalizing the master dataset...")
        master_df.dropna(inplace=True)

        output_file = 'reliance_master_dataset.csv'
        master_df.to_csv(output_file, index=False)

        print(f"\nMaster dataset created successfully!")
        print(f"Final shape: {master_df.shape}")
        print(f"Saved to: {output_file}")

    except FileNotFoundError as e:
        print(f"Error: A required file was not found - {e.filename}.")
        print("Please ensure all previous cleaning and sentiment analysis steps were completed successfully.")
    except Exception as e:
        print(f"An error occurred during master dataset creation: {e}")

if __name__ == '__main__':
    create_master_dataset()


--- Starting Master Dataset Creation for Reliance Industries ---
Loading base price data: cleaned_reliance_reliance__prices.csv
Error: A required file was not found - None.
Please ensure all previous cleaning and sentiment analysis steps were completed successfully.
