In [84]:
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np

In [85]:
# -----------------------
# Utility functions
# -----------------------
def clean_value(x):
    """Parse a raw financial cell into a float.

    The rupee sign (and its mis-decoded form "â‚¹") and thousands separators
    are removed, the Unicode minus sign (U+2212) is mapped to ASCII "-", and
    surrounding whitespace is stripped. A value ending in "%" is returned as
    a fraction (e.g. "12.5%" -> 0.125).

    Parameters
    ----------
    x : Any
        Raw cell value; it is converted with ``str`` before parsing.

    Returns
    -------
    float or None
        The parsed number, or ``None`` when ``x`` is missing, is one of the
        placeholders "-", "NaN", "nan", "None", or cannot be parsed.
    """
    if pd.isna(x) or str(x).strip() in ("-", "NaN", "nan", "None"):
        return None

    text = (
        str(x)
        .replace("₹", "")
        .replace("â‚¹", "")   # rupee sign as it appears after a bad decode
        .replace(",", "")
        .replace("−", "-")
        .strip()
    )

    is_percent = text.endswith("%")
    if is_percent:
        text = text.replace("%", "")

    # float() on a string signals bad input with ValueError only; catching
    # just that keeps KeyboardInterrupt / SystemExit from being swallowed.
    try:
        number = float(text)
    except ValueError:
        return None

    return number / 100.0 if is_percent else number

def reshape_df(df, company, source):
    """Convert a wide-format financial table into long format.

    The first column of ``df`` is renamed to "Metric"; every other column is
    melted into a ("Quarter", "Value") pair. Each value is parsed with
    ``clean_value``.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide table: first column holds the metric name, remaining columns
        hold one period each.
    company
        Value written to every row of the "Company" column.
    source
        Value written to every row of the "Source" column.

    Returns
    -------
    pandas.DataFrame
        Columns "Metric", "Quarter", "Value", "Company", "Source". "Value" is
        float; cells that are missing, unparseable, or non-finite become 0.0.
    """
    df = df.rename(columns={df.columns[0]: "Metric"})
    melted = df.melt(id_vars=["Metric"], var_name="Quarter", value_name="Value")

    # clean_value already yields float/None, so convert numerically. Going
    # through str + a digit-only regex would mangle floats whose repr uses
    # scientific notation (e.g. 5e-05 -> "5-05", 1e+16 -> "116").
    parsed = pd.to_numeric(
        melted["Value"].apply(clean_value), errors="coerce"
    ).astype(float)

    # Missing / non-finite values are filled with 0.0 (the existing contract).
    melted["Value"] = parsed.replace(
        [float("inf"), float("-inf")], float("nan")
    ).fillna(0.0)

    melted["Company"] = company
    melted["Source"] = source
    return melted

# -----------------------
# Load CSVs
# -----------------------
# NOTE(review): these are absolute, machine-specific paths; the notebook only
# runs where this directory layout exists.
income_df = pd.read_csv('C:\\Users\\mjaye\\PycharmProjects\\Fintellect\\Data\\CIPLA.NS_QUARTER_INCOME_STATEMENT_FROM_PERPLEXITY.csv')
key_stats_df = pd.read_csv('C:\\Users\\mjaye\\PycharmProjects\\Fintellect\\Data\\CIPLA.NS_QUARTER_KEY_STATS_FROM_PERPLEXITY.csv')

# Sanity check: list the metric names found in the income statement.
# (The frame comes from income_df, so it is labelled as such.)
df_test = reshape_df(income_df, "CIPLA", "Income Statement")
print(df_test["Metric"].unique())

# -----------------------
# Restrict to common quarters
# -----------------------
# Quarter labels present in both files. sorted() orders them as strings
# (e.g. ['Dec 24', 'Mar 25', 'Sept 24']), not chronologically.
common_quarters = sorted(set(income_df.columns[1:]) & set(key_stats_df.columns[1:]))

# The first column holds the metric names; select it by position so the cell
# does not depend on pandas' auto-generated "Unnamed: 0" header.
income_common = income_df[[income_df.columns[0]] + common_quarters]
key_stats_common = key_stats_df[[key_stats_df.columns[0]] + common_quarters]

# -----------------------
# Reshape to long format
# -----------------------
income_long_common = reshape_df(income_common, "CIPLA", "Income Statement")
key_stats_long_common = reshape_df(key_stats_common, "CIPLA", "Key Stats")

# Combine both
combined_income_keystats = pd.concat([income_long_common, key_stats_long_common], ignore_index=True)

# -----------------------
# Pivot to wide feature table
# -----------------------
# One row per (Company, Quarter), one column per Metric. aggfunc="first"
# keeps the first value when a metric name occurs more than once for the
# same company and quarter.
features_df = combined_income_keystats.pivot_table(
    index=["Company", "Quarter"],
    columns="Metric",
    values="Value",
    aggfunc="first"
).reset_index()

['Revenue' '% Growth' 'Cost of Goods Sold' 'Gross Profit' '% Margin'
 'R&D Expenses' 'G&A Expenses' 'SG&A Expenses' 'Sales & Mktg Exp.'
 'Other Operating Expenses' 'Operating Expenses' 'Operating Income'
 'Other Income/Exp. Net' 'Pre-Tax Income' 'Tax Expense' 'Net Income' 'EPS'
 'EPS Diluted' 'Weighted Avg Shares Out' 'Weighted Avg Shares Out Dil'
 'Supplemental Information' 'Interest Income' 'Interest Expense'
 'Depreciation & Amortization' 'EBITDA']


In [86]:
# Display the pivoted CIPLA feature table (one row per quarter).
features_df

Metric,Company,Quarter,% Growth,% Margin,+ Debt,- Cash,Capital Expenditures,Cost of Goods Sold,Depreciation & Amortization,EBITDA,...,Other Operating Expenses,Pre-Tax Income,R&D Expenses,Revenue,SG&A Expenses,Sales & Mktg Exp.,Supplemental Information,Tax Expense,Weighted Avg Shares Out,Weighted Avg Shares Out Dil
0,CIPLA,Dec 24,-0.013,0.675,0.0,0.0,0.0,22641.0,2798.0,22105.0,...,13661.0,19161.0,3600.0,69616.0,12624.0,0.0,0.0,3324.0,807.0,808.0
1,CIPLA,Mar 25,-0.052,0.455,4382.0,7998.0,0.0,-7315.0,3087.0,18271.0,...,-30802.0,14934.0,4260.0,65977.0,12331.0,0.0,0.0,2793.0,808.0,808.0
2,CIPLA,Sept 24,0.0,0.466,4614.0,7978.0,0.0,37626.0,2717.0,20762.0,...,0.0,17891.0,3850.0,70510.0,12895.0,0.0,0.0,4830.0,808.0,808.0


In [87]:
# Load Lupin data (Income Statement + Key Stats)
# NOTE(review): absolute, machine-specific paths.
lupin_income_df = pd.read_csv("C:\\Users\\mjaye\\PycharmProjects\\Fintellect\\Data\\LUPIN.NS_QUARTER_INCOME_STATEMENT_FROM_PERPLEXITY.csv")
lupin_key_stats_df = pd.read_csv("C:\\Users\\mjaye\\PycharmProjects\\Fintellect\\Data\\LUPIN.NS_QUARTER_KEY_STATS_FROM_PERPLEXITY.csv")

# Quarter labels present in both Lupin files (sorted as strings).
common_quarters_lupin = sorted(
    set(lupin_income_df.columns[1:]) & set(lupin_key_stats_df.columns[1:])
)

# Restrict both dfs to common quarters. The metric-name column is selected by
# position so the cell does not depend on the "Unnamed: 0" header.
lupin_income_common = lupin_income_df[[lupin_income_df.columns[0]] + common_quarters_lupin]
lupin_key_stats_common = lupin_key_stats_df[[lupin_key_stats_df.columns[0]] + common_quarters_lupin]

# Reshape to long format
lupin_income_long_common = reshape_df(lupin_income_common, "LUPIN", "Income Statement")
lupin_key_stats_long_common = reshape_df(lupin_key_stats_common, "LUPIN", "Key Stats")

# Combine Lupin
combined_lupin = pd.concat([lupin_income_long_common, lupin_key_stats_long_common], ignore_index=True)

# Pivot to wide format: one row per (Company, Quarter), one column per Metric.
features_lupin = combined_lupin.pivot_table(
    index=["Company", "Quarter"],
    columns="Metric",
    values="Value",
    aggfunc="first"
).reset_index()

# Append to features_df (CIPLA + LUPIN)
features_all = pd.concat([features_df, features_lupin], ignore_index=True)

In [88]:
# Display the combined CIPLA + LUPIN feature table.
features_all

Metric,Company,Quarter,% Growth,% Margin,+ Debt,- Cash,Capital Expenditures,Cost of Goods Sold,Depreciation & Amortization,EBITDA,...,Other Operating Expenses,Pre-Tax Income,R&D Expenses,Revenue,SG&A Expenses,Sales & Mktg Exp.,Supplemental Information,Tax Expense,Weighted Avg Shares Out,Weighted Avg Shares Out Dil
0,CIPLA,Dec 24,-0.013,0.675,0.0,0.0,0.0,22641.0,2798.0,22105.0,...,13661.0,19161.0,3600.0,69616.0,12624.0,0.0,0.0,3324.0,807.0,808.0
1,CIPLA,Mar 25,-0.052,0.455,4382.0,7998.0,0.0,-7315.0,3087.0,18271.0,...,-30802.0,14934.0,4260.0,65977.0,12331.0,0.0,0.0,2793.0,808.0,808.0
2,CIPLA,Sept 24,0.0,0.466,4614.0,7978.0,0.0,37626.0,2717.0,20762.0,...,0.0,17891.0,3850.0,70510.0,12895.0,0.0,0.0,4830.0,808.0,808.0
3,LUPIN,Dec 24,0.022,0.694,0.0,0.0,0.0,17216.0,2715.0,14096.0,...,-4344.0,10713.0,4344.0,56186.0,11067.0,0.0,0.0,2124.0,456.0,458.0
4,LUPIN,Mar 25,0.009,0.653,54478.0,27552.0,0.0,19659.0,2723.0,12572.0,...,0.0,8958.0,-734.0,56671.0,27622.0,0.0,0.0,1135.0,456.0,458.0
5,LUPIN,Sept 24,0.0,0.693,34425.0,11095.0,0.0,16899.0,2569.0,13827.0,...,10887.0,10549.0,4481.0,54970.0,12189.0,0.0,0.0,1954.0,456.0,457.0


In [89]:
# -----------------------
# SUN PHARMA
# Load Sun Pharma data
# NOTE(review): absolute, machine-specific paths.
sun_income_df = pd.read_csv('C:\\Users\\mjaye\\PycharmProjects\\Fintellect\\Data\\SUNPHARMA.BO_QUARTER_INCOME_STATEMENT_FROM_PERPLEXITY.csv')
sun_key_stats_df = pd.read_csv('C:\\Users\\mjaye\\PycharmProjects\\Fintellect\\Data\\SUNPHARMA.BO_QUARTER_KEY_STATS_FROM_PERPLEXITY.csv')

# Quarter labels present in both Sun Pharma files (sorted as strings).
common_quarters_sun = sorted(
    set(sun_income_df.columns[1:]) & set(sun_key_stats_df.columns[1:])
)

# Restrict to common quarters. The metric-name column is selected by position
# so the cell does not depend on the "Unnamed: 0" header.
sun_income_common = sun_income_df[[sun_income_df.columns[0]] + common_quarters_sun]
sun_key_stats_common = sun_key_stats_df[[sun_key_stats_df.columns[0]] + common_quarters_sun]

# Reshape to long format
sun_income_long = reshape_df(sun_income_common, "SUNPHARMA", "Income Statement")
sun_key_stats_long = reshape_df(sun_key_stats_common, "SUNPHARMA", "Key Stats")

# Combine
combined_sun = pd.concat([sun_income_long, sun_key_stats_long], ignore_index=True)

# Pivot into wide format: one row per (Company, Quarter), one column per Metric.
features_sun = combined_sun.pivot_table(
    index=["Company", "Quarter"],
    columns="Metric",
    values="Value",
    aggfunc="first"
).reset_index()

# Rebuild the combined table from the three per-company frames rather than
# appending to features_all, so re-running this cell cannot duplicate the
# SUNPHARMA rows.
features_all = pd.concat([features_df, features_lupin, features_sun], ignore_index=True)

In [90]:
# Display the combined feature table after adding SUNPHARMA.
features_all

Metric,Company,Quarter,% Growth,% Margin,+ Debt,- Cash,Capital Expenditures,Cost of Goods Sold,Depreciation & Amortization,EBITDA,...,Other Operating Expenses,Pre-Tax Income,R&D Expenses,Revenue,SG&A Expenses,Sales & Mktg Exp.,Supplemental Information,Tax Expense,Weighted Avg Shares Out,Weighted Avg Shares Out Dil
0,CIPLA,Dec 24,-0.013,0.675,0.0,0.0,0.0,22641.0,2798.0,22105.0,...,13661.0,19161.0,3600.0,69616.0,12624.0,0.0,0.0,3324.0,807.0,808.0
1,CIPLA,Mar 25,-0.052,0.455,4382.0,7998.0,0.0,-7315.0,3087.0,18271.0,...,-30802.0,14934.0,4260.0,65977.0,12331.0,0.0,0.0,2793.0,808.0,808.0
2,CIPLA,Sept 24,0.0,0.466,4614.0,7978.0,0.0,37626.0,2717.0,20762.0,...,0.0,17891.0,3850.0,70510.0,12895.0,0.0,0.0,4830.0,808.0,808.0
3,LUPIN,Dec 24,0.022,0.694,0.0,0.0,0.0,17216.0,2715.0,14096.0,...,-4344.0,10713.0,4344.0,56186.0,11067.0,0.0,0.0,2124.0,456.0,458.0
4,LUPIN,Mar 25,0.009,0.653,54478.0,27552.0,0.0,19659.0,2723.0,12572.0,...,0.0,8958.0,-734.0,56671.0,27622.0,0.0,0.0,1135.0,456.0,458.0
5,LUPIN,Sept 24,0.0,0.693,34425.0,11095.0,0.0,16899.0,2569.0,13827.0,...,10887.0,10549.0,4481.0,54970.0,12189.0,0.0,0.0,1954.0,456.0,457.0
6,SUNPHARMA,Dec 24,0.011,0.796,0.0,0.0,0.0,27405.0,6306.0,41585.0,...,29443.0,34764.0,8248.0,134369.0,33655.0,0.0,0.0,5589.0,2399.0,2399.0
7,SUNPHARMA,Mar 25,-0.036,0.796,23622.0,113316.0,0.0,26372.0,6438.0,34049.0,...,0.0,32476.0,7904.0,129588.0,24885.0,0.0,0.0,10937.0,2389.0,2389.0
8,SUNPHARMA,Sept 24,0.0,0.564,25720.0,80125.0,0.0,57978.0,6259.0,42930.0,...,0.0,35979.0,7628.0,132914.0,35458.0,0.0,0.0,5672.0,2394.0,2394.0


In [91]:
# Work on a copy so the later edits to `df` leave features_all unchanged.
df = features_all.copy()

In [92]:
# The quarter labels use "Sept" (e.g. "Sept 24"); shorten it to the
# three-letter "Sep" so the labels fit a '%b %y' date format.
df["Quarter"] = df["Quarter"].str.replace("Sept", "Sep", regex=False)

In [93]:
    # Parse the 'Mon YY' quarter labels into datetimes (explicit '%b %y'
    # format) so rows sort chronologically instead of alphabetically, then
    # order by company and quarter, both ascending, with a fresh 0..n-1 index.
df = (
    df.assign(Quarter=pd.to_datetime(df["Quarter"], format="%b %y"))
    .sort_values(["Company", "Quarter"])
    .reset_index(drop=True)
)

In [94]:
# Confirm that Quarter was converted to a datetime dtype.
print(df.dtypes["Quarter"])

datetime64[ns]


In [95]:
# Display the frame sorted by company and quarter.
df

Metric,Company,Quarter,% Growth,% Margin,+ Debt,- Cash,Capital Expenditures,Cost of Goods Sold,Depreciation & Amortization,EBITDA,...,Other Operating Expenses,Pre-Tax Income,R&D Expenses,Revenue,SG&A Expenses,Sales & Mktg Exp.,Supplemental Information,Tax Expense,Weighted Avg Shares Out,Weighted Avg Shares Out Dil
0,CIPLA,2024-09-01,0.0,0.466,4614.0,7978.0,0.0,37626.0,2717.0,20762.0,...,0.0,17891.0,3850.0,70510.0,12895.0,0.0,0.0,4830.0,808.0,808.0
1,CIPLA,2024-12-01,-0.013,0.675,0.0,0.0,0.0,22641.0,2798.0,22105.0,...,13661.0,19161.0,3600.0,69616.0,12624.0,0.0,0.0,3324.0,807.0,808.0
2,CIPLA,2025-03-01,-0.052,0.455,4382.0,7998.0,0.0,-7315.0,3087.0,18271.0,...,-30802.0,14934.0,4260.0,65977.0,12331.0,0.0,0.0,2793.0,808.0,808.0
3,LUPIN,2024-09-01,0.0,0.693,34425.0,11095.0,0.0,16899.0,2569.0,13827.0,...,10887.0,10549.0,4481.0,54970.0,12189.0,0.0,0.0,1954.0,456.0,457.0
4,LUPIN,2024-12-01,0.022,0.694,0.0,0.0,0.0,17216.0,2715.0,14096.0,...,-4344.0,10713.0,4344.0,56186.0,11067.0,0.0,0.0,2124.0,456.0,458.0
5,LUPIN,2025-03-01,0.009,0.653,54478.0,27552.0,0.0,19659.0,2723.0,12572.0,...,0.0,8958.0,-734.0,56671.0,27622.0,0.0,0.0,1135.0,456.0,458.0
6,SUNPHARMA,2024-09-01,0.0,0.564,25720.0,80125.0,0.0,57978.0,6259.0,42930.0,...,0.0,35979.0,7628.0,132914.0,35458.0,0.0,0.0,5672.0,2394.0,2394.0
7,SUNPHARMA,2024-12-01,0.011,0.796,0.0,0.0,0.0,27405.0,6306.0,41585.0,...,29443.0,34764.0,8248.0,134369.0,33655.0,0.0,0.0,5589.0,2399.0,2399.0
8,SUNPHARMA,2025-03-01,-0.036,0.796,23622.0,113316.0,0.0,26372.0,6438.0,34049.0,...,0.0,32476.0,7904.0,129588.0,24885.0,0.0,0.0,10937.0,2389.0,2389.0


In [96]:
# Write the table to test_one.csv (path relative to the working directory),
# without the row index; missing values are written as 'N/A'.
df.to_csv("test_one.csv", index=False, na_rep='N/A')

In [97]:
# # Define the features (X) and the target (y)
# # Features are all the financial metrics from the current quarter.
# # We exclude identifiers and the target itself.
# features = [col for col in df_clean.columns if col not in [
#         'Company', 'Quarter', 'Revenue', 'Target Next Q Growth (%)'
#     ]]
#     
# X = df_clean[features]
# y = df_clean['Target Next Q Growth (%)']
#     
# if len(X) < 2:
#     print("Not enough data to train a model. Need at least two complete samples.")
# 
#     # Split data for training and validation (optional but good practice)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# 
#     # --- Model Training ---
#     # LightGBM is a good choice: it's fast, efficient, and performs well on tabular data.
# print("--- Training Model ---")
# model = lgb.LGBMRegressor(random_state=42, num_leaves=3, n_estimators=3)
# model.fit(X_train, y_train)
#     
#     # Evaluate the model
# predictions = model.predict(X_test)
# rmse = np.sqrt(mean_squared_error(y_test, predictions))
# print(f"\nModel Evaluation RMSE: {rmse:.2f}%")
#     
#     # --- Making a Prediction for the Future ---
# print("\n--- Predicting Next Quarter's Growth ---")
#     # To predict for the next quarter (e.g., Jun '25), we need the financial data
#     # from the most recent quarter (Mar '25).