In [1]:
import pandas as pd
import os
import re

# A parenthesised note such as "(Rs. Cr.)" together with the blanks around it.
# Only the innermost pair is matched, so nested notes are peeled one level at a time.
_PAREN_NOTE_RE = re.compile(r'\s*\([^()]*\)\s*')
# Any run of characters that is not an ASCII letter or digit.
_NON_ALNUM_RE = re.compile(r'[^a-zA-Z0-9]+')


def _to_snake_case(label):
    """Return *label* as a lower-case, underscore-separated column name."""
    text = str(label)  # labels may be ints or other non-string objects
    while True:
        # Replace each note with a blank (not '') so the words on either side
        # stay separated; repeat until nested notes are fully removed.
        stripped = _PAREN_NOTE_RE.sub(' ', text)
        if stripped == text:
            break
        text = stripped
    return _NON_ALNUM_RE.sub('_', text).lower().strip('_')


def clean_financial_data(df, date_col_name='Date'):
    """Normalise column names, parse the date column and forward-fill gaps.

    Steps, in order:
      1. Every column label is converted to snake_case; parenthesised notes
         (e.g. units) are removed individually.
      2. The column identified by ``date_col_name`` is parsed with
         ``pd.to_datetime(errors='coerce')`` and renamed to ``'date'``.
      3. If a ``'date'`` column exists, rows are sorted by it and the index
         is reset.
      4. Missing values are forward-filled in every column except ``'date'``.

    Args:
        df: Input frame. It is not modified; a new frame is returned.
        date_col_name: Label of the date column, either exactly as it appears
            in ``df`` or in any spelling that cleans to the same snake_case
            name.

    Returns:
        The cleaned DataFrame. A short summary of the resulting columns is
        printed as a side effect.
    """
    new_columns = {col: _to_snake_case(col) for col in df.columns}
    df = df.rename(columns=new_columns)

    # If the caller's spelling is not an exact column label, clean it the same
    # way the real labels were cleaned so that it can still be matched.
    date_col_name_cleaned = new_columns.get(date_col_name, _to_snake_case(date_col_name))

    if date_col_name_cleaned in df.columns:
        df[date_col_name_cleaned] = pd.to_datetime(df[date_col_name_cleaned], errors='coerce')
        df = df.rename(columns={date_col_name_cleaned: 'date'})

    if 'date' in df.columns:
        df = df.sort_values(by='date').reset_index(drop=True)
        # Forward-filling the date column would stamp rows whose date could
        # not be parsed with the previous row's date, so put it back afterwards.
        original_dates = df['date']
        df = df.ffill()
        df['date'] = original_dates
    else:
        df = df.ffill()

    print(f"Cleaned columns: {df.columns.tolist()}")
    print(f"Date column type: {df['date'].dtype if 'date' in df.columns else 'Not found'}")
    print("-" * 30)

    return df

def clean_text_data(df, date_col_name):
    """Normalise a news/Reddit frame and build a combined ``text`` column.

    Column labels are stripped and lower-cased. The column named by
    ``date_col_name`` (compared after the same stripping and lower-casing) is
    renamed to ``'date'`` and parsed with ``pd.to_datetime(errors='coerce')``.
    Rows with a missing date or missing text field are dropped, and the two
    text fields are joined with ``". "`` into a new ``'text'`` column:

      * ``title`` + ``summary`` when both exist (news data), otherwise
      * ``news`` + ``top_comments`` when both exist (Reddit data).

    If neither pair is present no ``'text'`` column is added. If there is no
    ``'date'`` column, rows are filtered on the text fields only.

    Args:
        df: Input frame. It is not modified; a new frame is returned.
        date_col_name: Label of the date column (case-insensitive).

    Returns:
        The cleaned DataFrame. A short summary of the resulting columns is
        printed as a side effect.
    """
    # Work on a private copy: relabelling and the in-place dropna below must
    # not leak into the caller's frame.
    df = df.copy()
    df.columns = [str(col).strip().lower() for col in df.columns]

    date_col_name_lower = str(date_col_name).strip().lower()

    if date_col_name_lower in df.columns:
        df = df.rename(columns={date_col_name_lower: 'date'})
        df['date'] = pd.to_datetime(df['date'], errors='coerce')

    if 'title' in df.columns and 'summary' in df.columns:  # News data
        text_cols = ['title', 'summary']
    elif 'news' in df.columns and 'top_comments' in df.columns:  # Reddit data
        text_cols = ['news', 'top_comments']
    else:
        text_cols = []

    # Only require 'date' when it exists; naming a missing column in `subset`
    # would raise KeyError.
    required_cols = (['date'] if 'date' in df.columns else []) + text_cols
    if required_cols:
        df.dropna(subset=required_cols, inplace=True)

    if text_cols:
        head_col, body_col = text_cols
        # astype(str) keeps the concatenation valid for non-string values.
        df['text'] = df[head_col].astype(str) + ". " + df[body_col].astype(str)

    print(f"Cleaned columns: {df.columns.tolist()}")
    print(f"Date column type: {df['date'].dtype if 'date' in df.columns else 'Not found'}")
    print("-" * 30)

    return df


def main():
    """Clean each configured Reliance CSV and save a cleaned copy.

    Every entry in the job table names an input file, the cleaning routine to
    apply ('financial' or 'text') and the label of its date column. A failure
    on one file is reported on stdout and does not stop the remaining files.
    """
    output_dir = './'

    # Input file -> how to clean it and which column holds the dates.
    files_to_clean = {
        'relianceidustriesProfitnLoss.csv': {'type': 'financial', 'date_col': 'Date'},
        'relianceindustries_balance.csv': {'type': 'financial', 'date_col': 'Date'},
        'relianceindustrues_cashflow.csv': {'type': 'financial', 'date_col': 'Date'},
        'relianceindustries_Ratio.csv': {'type': 'financial', 'date_col': 'Date'},
        'relianceindustriesQuater.csv': {'type': 'financial', 'date_col': 'Date'},
        'RELIANCE  Prices.csv': {'type': 'financial', 'date_col': 'date'},
        'reliance_news.csv': {'type': 'text', 'date_col': 'publishedAt'},
        'RelianceReddit.csv': {'type': 'text', 'date_col': 'date'},
    }

    print("Starting Data Cleaning Process for Reliance")

    for filename in files_to_clean:
        params = files_to_clean[filename]
        try:
            print(f"Processing file: {filename}")
            raw_frame = pd.read_csv(filename)

            kind = params['type']
            date_col = params['date_col']
            if kind == 'financial':
                cleaned_df = clean_financial_data(raw_frame, date_col_name=date_col)
            elif kind == 'text':
                cleaned_df = clean_text_data(raw_frame, date_col_name=date_col)
            else:
                print(f"Warning: No cleaning type defined for {filename}. Skipping.")
                continue

            # Output name: lower-cased input name with blanks turned into underscores.
            normalised_name = filename.lower().replace(' ', '_')
            output_path = os.path.join(output_dir, f"cleaned_reliance_{normalised_name}")
            cleaned_df.to_csv(output_path, index=False)
            print(f"Successfully cleaned and saved to {output_path}\n")
        except FileNotFoundError:
            print(f"Error: The file {filename} was not found. Please check the path.")
        except Exception as e:
            print(f"An error occurred while processing {filename}: {e}")

    print("--- Data Cleaning Process Completed ---")


# Run the cleaning pipeline only when executed as a script, not on import.
if __name__ == '__main__':
    main()


--- Starting Data Cleaning Process for Reliance ---
Processing file: relianceidustriesProfitnLoss.csv
Cleaned columns: ['date', 'revenue_from_operations_net', 'other_income', 'total_revenue', 'total_expenses', 'profit_loss_before_exceptional_extraordinary_items_and_tax', 'profit_loss_before_tax', 'profit_loss_for_the_period', 'basic_eps', 'diluted_eps']
Date column type: datetime64[ns]
------------------------------
Successfully cleaned and saved to ./cleaned_reliance_relianceidustriesprofitnloss.csv

Processing file: relianceindustries_balance.csv
Cleaned columns: ['date', 'equity_share_capital', 'total_share_capital', 'reserves_and_surplus', 'total_reserves_and_surplus', 'total_shareholders_funds', 'long_term_borrowings', 'deferred_tax_liabilities_net', 'other_long_term_liabilities', 'long_term_provisions', 'total_non_current_liabilities', 'short_term_borrowings', 'trade_payables', 'other_current_liabilities', 'short_term_provisions', 'total_current_liabilities', 'total_capital_and_l