In [7]:
import numpy as np
import pandas as pd
import os
import re

In [8]:
categories = {
    "Balance-sheet": [
        "Reserves and Surplus", "Total Assets", "Total Capital And Liabilities", "Total Current Assets",
        "Total Current Liabilities", "Total Non-Current Assets", "Total Reserves and Surplus",
        "Total Share Capital", "Total Shareholders Funds", "Fixed Assets", "Other Current Liabilities",
        "Tangible Assets"
    ],
    "Cash-flow": [
        "Cash And Cash Equivalents Begin of Year", "Cash And Cash Equivalents End Of Year",
        "Net Cash Used In Investing Activities", "Net CashFlow From Operating Activities",
        "Net Profit/Loss Before Extraordinary Items And Tax", "Net Inc/Dec In Cash And Cash Equivalents",
        "Net Cash Used From Financing Activities", "Foreign Exchange Gains / Losses"
    ],
    "Profit-loss": [
        "Basic EPS (Rs.)", "Diluted EPS (Rs.)", "Employee Benefit Expenses", "Other Expenses",
        "Profit/Loss After Tax And Before ExtraOrdinary Items", "Profit/Loss Before Exceptional, ExtraOrdinary Items And Tax",
        "Profit/Loss Before Tax", "Profit/Loss For The Period", "Profit/Loss From Continuing Operations",
        "Revenue From Operations [Gross]", "Revenue From Operations [Net]", "Total Expenses", "Total Operating Revenues",
        "Total Revenue", "Other Income", "Depreciation And Amortisation Expenses", "Total Tax Expenses",
        "Deferred Tax", "Finance Costs", "Current Tax", "Operating And Direct Expenses"
    ],
    "Quarterly-resul": [
        "Equity Share Capital", "Net Profit/(Loss) For the Period", "Other Expenses",
        "P/L After Tax from Ordinary Activities", "P/L Before Exceptional Items & Tax", "P/L Before Int., Excpt. Items & Tax",
        "P/L Before Other Inc. , Int., Excpt. Items & Tax", "P/L Before Tax", "Employees Cost",
        "Net Sales/Income from operations", "Total Income From Operations", "Basic EPS", "Basic EPS.",
        "Diluted EPS", "Diluted EPS.", "depreciat", "Other Income", "Interest", "Tax"
    ],
    "Ratios": [
        "Net Profit Margin (%)", "Net Profit/Share (Rs.)", "PBDIT Margin (%)", "PBDIT/Share (Rs.)", "PBIT Margin (%)",
        "PBIT/Share (Rs.)", "PBT Margin (%)", "PBT/Share (Rs.)", "Return on Assets (%)",
        "Return on Networth / Equity (%)", "Revenue from Operations/Share (Rs.)", "EV/EBITDA (X)", "Enterprise Value (Cr.)",
        "Dividend Payout Ratio (CP) (%)", "Dividend Payout Ratio (NP) (%)", "Earnings Retention Ratio (%)",
        "Total Debt/Equity (X)"
    ],
    "Year": [
        "Equity Share Capital", "Net Profit/(Loss) For the Period", "Other Expenses", "P/L After Tax from Ordinary Activities",
        "P/L Before Exceptional Items & Tax", "P/L Before Int., Excpt. Items & Tax",
        "P/L Before Other Inc. , Int., Excpt. Items & Tax", "P/L Before Tax", "Employees Cost",
        "Net Sales/Income from operations", "Total Income From Operations", "Basic EPS", "Basic EPS.",
        "Diluted EPS", "Diluted EPS.", "depreciat", "Other Income", "Interest", "Tax"
    ]
}

In [9]:
# Input and output directories
input_folder = r"C:\Users\sharm\OneDrive\Desktop\Kishan\Contractzy\WebScrapping\Tutorial\Financial_Data\MoneyControl\Companies\IT Services & Consulting\3i Infotech Ltd\Excel"
output_folder = r"C:\Users\sharm\OneDrive\Desktop\Kishan\Data\Data Preprocessing\3i Infotech"
os.makedirs(output_folder, exist_ok=True)  # Ensure output folder exists

In [10]:

def extract_year(col_name):
    """Extracts year from column names, returns 'inf' if no year found."""
    match = re.search(r'\d{4}', str(col_name))  # Looking for a 4-digit year
    return int(match.group()) if match else float('inf')

In [11]:
for category, valid_parameters in categories.items():
    input_file = os.path.join(input_folder, f"{category}_combined.xlsx")
    output_file = os.path.join(output_folder, f"cleanedData_{category}.xlsx")

    if os.path.exists(input_file):
        df = pd.read_excel(input_file)

        # Normalize the parameter column
        df.iloc[:, 0] = df.iloc[:, 0].astype(str).str.strip()  

        # Identify parameters that do not match
        not_common_parameters = [item for item in df.iloc[:, 0] if item not in valid_parameters]

        # Drop rows that contain non-matching parameters
        df = df[~df.iloc[:, 0].isin(not_common_parameters)]

        # Clean numeric data
        df.replace("12 mths", np.nan, inplace=True)
        df.dropna(inplace=True)
        df.iloc[:, 1:] = df.iloc[:, 1:].replace(",", "", regex=True)
        df.iloc[:, 1:] = df.iloc[:, 1:].apply(pd.to_numeric, errors='coerce')

        # Ensure numeric sorting only if years exist
        column_start = df.columns[0]
        if any(re.search(r'\d{4}', str(col)) for col in df.columns[1:]):
            sorted_columns = sorted(df.columns[1:], key=extract_year)
            df = df[[column_start] + sorted_columns]

        # Transpose and save
        df = df.T.reset_index()
        df.to_excel(output_file, index=False)

        print(f"Processed {category}: {input_file} -> {output_file}")

print("All categories processed successfully!")

Processed Balance-sheet: C:\Users\sharm\OneDrive\Desktop\Kishan\Contractzy\WebScrapping\Tutorial\Financial_Data\MoneyControl\Companies\IT Services & Consulting\3i Infotech Ltd\Excel\Balance-sheet_combined.xlsx -> C:\Users\sharm\OneDrive\Desktop\Kishan\Data\Data Preprocessing\3i Infotech\cleanedData_Balance-sheet.xlsx
Processed Cash-flow: C:\Users\sharm\OneDrive\Desktop\Kishan\Contractzy\WebScrapping\Tutorial\Financial_Data\MoneyControl\Companies\IT Services & Consulting\3i Infotech Ltd\Excel\Cash-flow_combined.xlsx -> C:\Users\sharm\OneDrive\Desktop\Kishan\Data\Data Preprocessing\3i Infotech\cleanedData_Cash-flow.xlsx
Processed Profit-loss: C:\Users\sharm\OneDrive\Desktop\Kishan\Contractzy\WebScrapping\Tutorial\Financial_Data\MoneyControl\Companies\IT Services & Consulting\3i Infotech Ltd\Excel\Profit-loss_combined.xlsx -> C:\Users\sharm\OneDrive\Desktop\Kishan\Data\Data Preprocessing\3i Infotech\cleanedData_Profit-loss.xlsx
Processed Quarterly-resul: C:\Users\sharm\OneDrive\Desktop\Ki

In [12]:
output_folder = r"C:\Users\sharm\OneDrive\Desktop\Kishan\Data\Data Preprocessing\3i Infotech\Mew"
os.makedirs(output_folder, exist_ok=True)
# Dictionary to store DataFrames for combining later
category_dfs = {}
company_name = "3i Infotech"
for category, valid_parameters in categories.items():
    input_file = os.path.join(input_folder, f"{category}_combined.xlsx")
    output_file = os.path.join(output_folder, f"cleanedData_{category}.xlsx")

    if os.path.exists(input_file):
        df = pd.read_excel(input_file)

        # Normalize and clean parameter column
        df.iloc[:, 0] = df.iloc[:, 0].astype(str).str.strip()

        # Identify parameters that do not match
        not_common_parameters = [item for item in df.iloc[:, 0] if item not in valid_parameters]

        # Drop rows that contain non-matching parameters and reset index
        df = df[~df.iloc[:, 0].isin(not_common_parameters)].reset_index(drop=True)

        # Clean numeric data
        df.replace("12 mths", np.nan, inplace=True)
        df.dropna(inplace=True)  # Drop NaN values
        # df.reset_index(drop=True, inplace=True)  # Reset index after dropping NaN

        df.iloc[:, 1:] = df.iloc[:, 1:].replace(",", "", regex=True)
        df.iloc[:, 1:] = df.iloc[:, 1:].apply(pd.to_numeric, errors='coerce')

        # Ensure numeric sorting only if years exist
        column_start = df.columns[0]
        print(column_start)
        if any(re.search(r'\d{4}', str(col)) for col in df.columns[1:]):
            sorted_columns = sorted(df.columns[1:], key=extract_year)
            df = df[[column_start] + sorted_columns]

        # # Transpose and save
        # df = df.T.reset_index(drop=True)  # Reset index after transpose
        # df.to_excel(output_file, index=False)
        
        df = df.T  # Transpose
        df.columns = df.iloc[0]  # Set the first row as column headers
        df = df[1:].reset_index(drop=True)  # Remove the first row and reset index



        print(f"Processed {category}: {input_file} -> {output_file}")

        # Store DataFrame for combining later
        category_dfs[category] = df

# Combine all cleaned data into one Excel file with multiple sheets
final_output_file = os.path.join(output_folder, f"Pruned_{company_name}.xlsx")

with pd.ExcelWriter(final_output_file, engine='xlsxwriter') as writer:
    for category, df in category_dfs.items():
        df.to_excel(writer, sheet_name=category, index=False)

print(f"All cleaned data combined and saved in {final_output_file}")

Balance Sheet of 3i Infotech(in Rs. Cr.)
Processed Balance-sheet: C:\Users\sharm\OneDrive\Desktop\Kishan\Contractzy\WebScrapping\Tutorial\Financial_Data\MoneyControl\Companies\IT Services & Consulting\3i Infotech Ltd\Excel\Balance-sheet_combined.xlsx -> C:\Users\sharm\OneDrive\Desktop\Kishan\Data\Data Preprocessing\3i Infotech\Mew\cleanedData_Balance-sheet.xlsx
Cash Flow of 3i Infotech(in Rs. Cr.)
Processed Cash-flow: C:\Users\sharm\OneDrive\Desktop\Kishan\Contractzy\WebScrapping\Tutorial\Financial_Data\MoneyControl\Companies\IT Services & Consulting\3i Infotech Ltd\Excel\Cash-flow_combined.xlsx -> C:\Users\sharm\OneDrive\Desktop\Kishan\Data\Data Preprocessing\3i Infotech\Mew\cleanedData_Cash-flow.xlsx
Profit & Loss account of 3i Infotech(in Rs. Cr.)
Processed Profit-loss: C:\Users\sharm\OneDrive\Desktop\Kishan\Contractzy\WebScrapping\Tutorial\Financial_Data\MoneyControl\Companies\IT Services & Consulting\3i Infotech Ltd\Excel\Profit-loss_combined.xlsx -> C:\Users\sharm\OneDrive\Deskto