In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, LSTM
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.metrics import RootMeanSquaredError
from tensorflow.keras.losses import MeanSquaredError
from tensorflow.keras.optimizers import Adam



In [2]:
df = pd.read_csv('uncleaned_final_final.csv')
df

Unnamed: 0.1,Unnamed: 0,Normalized EBITDA,Reconciled Depreciation,EBITDA,EBIT,Interest Expense,Diluted Average Shares,Diluted EPS,Net Income,Tax Provision,Other Non Operating Income Expenses,Operating Income,Operating Expense,Other Operating Expenses,Gross Profit,Cost Of Revenue,Total Revenue,Operating Revenue,Ticker,Sector
0,2025-06-30,2.160000e+09,1.140000e+09,2.160000e+09,1.020000e+09,600000000.0,3.613445e+08,1.19,4.300000e+08,-2.000000e+07,4.000000e+07,8.100000e+08,4.720000e+09,2.490000e+09,5.530000e+09,1.122000e+10,1.675000e+10,1.675000e+10,AARTIIND.NS,Chemicals
1,2025-03-31,2.374600e+09,1.132700e+09,2.268000e+09,1.135300e+09,253000000.0,,,9.587000e+08,-7.630000e+07,-3.318000e+08,1.868400e+09,-3.518700e+09,3.300000e+09,-1.650300e+09,1.973350e+10,1.808320e+10,1.808320e+10,AARTIIND.NS,Chemicals
2,2024-12-31,2.360000e+09,1.110000e+09,2.360000e+09,1.250000e+09,850000000.0,3.622047e+08,1.27,4.600000e+08,-6.000000e+07,5.000000e+07,1.210000e+09,4.890000e+09,2.730000e+09,6.100000e+09,1.230000e+10,1.840000e+10,1.840000e+10,AARTIIND.NS,Chemicals
3,2024-09-30,2.020000e+09,1.080000e+09,2.040000e+09,9.600000e+08,620000000.0,3.611111e+08,1.44,5.200000e+08,-1.800000e+08,5.000000e+07,8.800000e+08,5.230000e+09,3.100000e+09,6.110000e+09,1.017000e+10,1.628000e+10,1.628000e+10,AARTIIND.NS,Chemicals
4,2024-06-30,3.110000e+09,1.020000e+09,3.110000e+09,2.090000e+09,640000000.0,3.633952e+08,3.77,1.370000e+09,8.000000e+07,6.000000e+07,1.980000e+09,4.960000e+09,2.740000e+09,6.940000e+09,1.157000e+10,1.851000e+10,1.851000e+10,AARTIIND.NS,Chemicals
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1503,2025-06-30,2.243400e+10,2.381000e+09,2.243400e+10,2.005300e+10,847000000.0,1.006036e+09,14.58,1.466800e+10,4.340000e+09,1.549000e+09,1.793300e+10,2.884200e+10,1.747600e+10,4.677500e+10,1.789500e+10,6.467000e+10,6.467000e+10,ZYDUSLIFE.NS,Healthcare
1504,2025-03-31,1.990900e+10,2.379000e+09,1.914100e+10,1.676200e+10,42000000.0,,,1.170900e+10,4.232000e+09,-1.800000e+09,1.927000e+10,2.337700e+10,-1.879200e+10,4.264700e+10,2.025500e+10,6.290200e+10,6.290200e+10,ZYDUSLIFE.NS,Healthcare
1505,2024-12-31,1.445100e+10,2.290000e+09,1.445100e+10,1.216100e+10,320000000.0,1.006391e+09,10.17,1.023500e+10,1.795000e+09,5.750000e+08,9.760000e+09,2.564100e+10,1.534200e+10,3.540100e+10,1.583400e+10,5.123500e+10,5.123500e+10,ZYDUSLIFE.NS,Healthcare
1506,2024-09-30,1.529600e+10,2.336000e+09,1.529600e+10,1.296000e+10,251000000.0,1.005740e+09,9.06,9.112000e+09,3.731000e+09,6.820000e+08,1.182400e+10,2.459800e+10,1.470400e+10,3.642200e+10,1.471400e+10,5.113600e+10,5.113600e+10,ZYDUSLIFE.NS,Healthcare


In [3]:
important_cols = [
    "Total Revenue",
    "Operating Revenue",
    "Gross Profit",
    "EBITDA",
    "EBIT",
    "Operating Income",
    "Net Income",
    "Operating Expense",
    "Cost Of Revenue",
    "Diluted EPS"
]

def corrupted_or_missing(x):
    """
    Returns True if column for this ticker is completely unusable:
    - All values NaN
    - All values 0 (or a single repeated constant)
    """
    return x.isna().all() or (x.nunique(dropna=True) == 1 and (x.iloc[0] in [0, -9999]))

# Apply check ticker-wise
mask = df.groupby("Ticker")[important_cols].apply(
    lambda g: g.apply(corrupted_or_missing)
).any(axis=1)

# Tickers to drop
tickers_to_drop = mask[mask].index.tolist()

# Cleaned dataframe
df = df[~df["Ticker"].isin(tickers_to_drop)]

In [4]:
# Filter the DataFrame to only include rows where 'Sector' is not null
df = df[df['Sector'].notna()]

In [5]:
def impute_financials(df, group_key="Ticker"):
    df_out = df.copy()
    
    # interpolate within each ticker
    df_out = df_out.groupby(group_key).apply(
        lambda g: g.interpolate(method="linear", limit_direction="both")
    ).reset_index(drop=True)
    
    # fill remaining NaN with sector/overall median
    df_out = df_out.fillna(df_out.median(numeric_only=True))
    
    return df_out

In [6]:
df = impute_financials(df)

  lambda g: g.interpolate(method="linear", limit_direction="both")
  lambda g: g.interpolate(method="linear", limit_direction="both")
  lambda g: g.interpolate(method="linear", limit_direction="both")
  lambda g: g.interpolate(method="linear", limit_direction="both")
  lambda g: g.interpolate(method="linear", limit_direction="both")
  lambda g: g.interpolate(method="linear", limit_direction="both")
  lambda g: g.interpolate(method="linear", limit_direction="both")
  lambda g: g.interpolate(method="linear", limit_direction="both")
  lambda g: g.interpolate(method="linear", limit_direction="both")
  lambda g: g.interpolate(method="linear", limit_direction="both")
  lambda g: g.interpolate(method="linear", limit_direction="both")
  lambda g: g.interpolate(method="linear", limit_direction="both")
  lambda g: g.interpolate(method="linear", limit_direction="both")
  lambda g: g.interpolate(method="linear", limit_direction="both")
  lambda g: g.interpolate(method="linear", limit_direction="bo

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1289 entries, 0 to 1288
Data columns (total 20 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   Unnamed: 0                           1289 non-null   object 
 1   Normalized EBITDA                    1289 non-null   float64
 2   Reconciled Depreciation              1289 non-null   float64
 3   EBITDA                               1289 non-null   float64
 4   EBIT                                 1289 non-null   float64
 5   Interest Expense                     1289 non-null   float64
 6   Diluted Average Shares               1289 non-null   float64
 7   Diluted EPS                          1289 non-null   float64
 8   Net Income                           1289 non-null   float64
 9   Tax Provision                        1289 non-null   float64
 10  Other Non Operating Income Expenses  1289 non-null   float64
 11  Operating Income              

In [9]:
import pandas as pd

# Convert to datetime
df["Unnamed: 0"] = pd.to_datetime(df["Unnamed: 0"], errors="coerce")

# (Optional) rename it for clarity
df.rename(columns={"Unnamed: 0": "Date"}, inplace=True)

In [10]:
df = df.sort_values(by=["Ticker", "Date"]).reset_index(drop=True)

In [11]:
# Total unique quarters in dataset
print("Unique quarters in dataset:", df["Date"].nunique())

# Count how many unique quarters per ticker
quarter_counts = df.groupby("Ticker")["Date"].nunique()

# Summary stats
print(quarter_counts.describe())

# Tickers with < 5 quarters (incomplete data)
missing_quarters = quarter_counts[quarter_counts < 5]
print("Tickers with missing quarters:\n", missing_quarters)

# Split into mildly vs heavily corrupted
missing_one = quarter_counts[quarter_counts == 4]
heavily_corrupted = quarter_counts[quarter_counts <= 3]

print("\nTickers missing exactly 1 quarter:\n", missing_one)
print("\nHeavily corrupted tickers:\n", heavily_corrupted)


Unique quarters in dataset: 5
count    258.000000
mean       4.996124
std        0.062257
min        4.000000
25%        5.000000
50%        5.000000
75%        5.000000
max        5.000000
Name: Date, dtype: float64
Tickers with missing quarters:
 Ticker
PGHH.NS    4
Name: Date, dtype: int64

Tickers missing exactly 1 quarter:
 Ticker
PGHH.NS    4
Name: Date, dtype: int64

Heavily corrupted tickers:
 Series([], Name: Date, dtype: int64)


In [12]:
df = df[df["Ticker"] != "PGHH.NS"]

In [13]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

# Assume 'df' is your pre-loaded DataFrame

# --- STEP 1: (NEW) FEATURE ENGINEERING ---
# Create more powerful features from your existing data.
# This step is crucial for model performance.
# Add or remove ratios based on the columns you have.
# --- FEATURE ENGINEERING SECTION ---
# We use .replace(0, np.nan) to avoid division-by-zero errors, then fill any resulting NaNs.

print("Starting feature engineering...")

# 1. Profitability Ratios (How well the company generates profit)
df['Gross_Profit_Margin'] = (df['Gross Profit'] / df['Total Revenue'].replace(0, np.nan)).fillna(0)
df['Operating_Margin'] = (df['Operating Income'] / df['Total Revenue'].replace(0, np.nan)).fillna(0)
df['Net_Profit_Margin'] = (df['Net Income'] / df['Total Revenue'].replace(0, np.nan)).fillna(0)
df['EBITDA_Margin'] = (df['Normalized EBITDA'] / df['Total Revenue'].replace(0, np.nan)).fillna(0)

# 2. Operational Efficiency Ratios (How well the company manages costs)
df['Expense_Ratio'] = (df['Operating Expense'] / df['Total Revenue'].replace(0, np.nan)).fillna(0)
# Calculate Pretax Income to find the effective tax rate
pretax_income = df['Net Income'] + df['Tax Provision']
df['Effective_Tax_Rate'] = (df['Tax Provision'] / pretax_income.replace(0, np.nan)).fillna(0)

# 3. Growth Metrics (Quarter-over-Quarter changes)
# This is critical for a time-series model. It calculates the % change from the previous quarter.
df = df.sort_values(by=['Ticker', 'Date']) # Make sure data is sorted correctly
df['Revenue_Growth_QoQ'] = df.groupby('Ticker')['Total Revenue'].pct_change().fillna(0)
df['Net_Income_Growth_QoQ'] = df.groupby('Ticker')['Net Income'].pct_change().fillna(0)
df['Gross_Profit_Growth_QoQ'] = df.groupby('Ticker')['Gross Profit'].pct_change().fillna(0)



# Clean up infinite values that might result from pct_change if a previous value was 0
df.replace([np.inf, -np.inf], 0, inplace=True)

df.to_csv('final_engineered.csv', index=False)

print("Feature engineering complete. New columns added.")
# --- END OF FEATURE ENGINEERING ---

Starting feature engineering...
Feature engineering complete. New columns added.
X_train shape: (205, 5, 25)
y_train shape: (205,)


In [18]:
df = pd.read_csv('final_engineered.csv')

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1285 entries, 0 to 1284
Data columns (total 29 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   Date                                 1285 non-null   object 
 1   Normalized EBITDA                    1285 non-null   float64
 2   Reconciled Depreciation              1285 non-null   float64
 3   EBITDA                               1285 non-null   float64
 4   EBIT                                 1285 non-null   float64
 5   Interest Expense                     1285 non-null   float64
 6   Diluted Average Shares               1285 non-null   float64
 7   Diluted EPS                          1285 non-null   float64
 8   Net Income                           1285 non-null   float64
 9   Tax Provision                        1285 non-null   float64
 10  Other Non Operating Income Expenses  1285 non-null   float64
 11  Operating Income              

In [23]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# --- 1. Load and Prepare Data ---
try:
    df = pd.read_csv('final_engineered.csv')
except FileNotFoundError:
    print("Error: 'final_engineered.csv' not found. Please ensure the file is in the correct directory.")
    exit()

df['Date'] = pd.to_datetime(df['Date'])
df = df.sort_values(['Ticker', 'Date'])

# Define feature and target columns
target_col = 'Net Income'
# Select all numeric columns as features, excluding the target and identifiers
feature_cols = df.select_dtypes(include=np.number).columns.drop(target_col)

print(f"Target Column: {target_col}")
print(f"Number of Feature Columns: {len(feature_cols)}")

# --- 2. Create Sliding Windows ---
X, y = [], []
n_past = 4 # Use the past 4 rows (quarters) to predict the 5th

# Group by each company's ticker to create sequences
for ticker, group in df.groupby('Ticker'):
    features = group[feature_cols].values
    target = group[target_col].values
    
    # Ensure there's enough data for at least one window
    if len(group) > n_past:
        for i in range(n_past, len(group)):
            # The past 4 rows of features
            X.append(features[i-n_past:i, :])
            # The 'Net Income' of the current row (the one we want to predict)
            y.append(target[i])

X = np.array(X)
y = np.array(y)

print(f"\nOriginal X shape (Samples, Timesteps, Features): {X.shape}")
print(f"Original y shape (Samples,): {y.shape}")

# --- 3. Train-Test Split ---
# Split data before scaling
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- 4. Scale the Data ---
# Scaler for features
scaler_X = StandardScaler()
n_train_samples, timesteps, n_features = X_train.shape
n_test_samples = X_test.shape[0]

# Reshape for scaling: (samples * timesteps, features)
X_train_reshaped = X_train.reshape(-1, n_features)
X_test_reshaped = X_test.reshape(-1, n_features)

# Fit on training data and transform both train and test
X_train_scaled_reshaped = scaler_X.fit_transform(X_train_reshaped)
X_test_scaled_reshaped = scaler_X.transform(X_test_reshaped)

# Reshape back to (samples, timesteps, features)
X_train_scaled = X_train_scaled_reshaped.reshape(n_train_samples, timesteps, n_features)
X_test_scaled = X_test_scaled_reshaped.reshape(n_test_samples, timesteps, n_features)

# Scaler for the target variable (y)
scaler_y = MinMaxScaler()
y_train_scaled = scaler_y.fit_transform(y_train.reshape(-1, 1))
y_test_scaled = scaler_y.transform(y_test.reshape(-1, 1))


# --- 5. FLATTEN Data for the Dense Network ---
# This is the key step for the FCN approach
X_train_flat = X_train_scaled.reshape(n_train_samples, timesteps * n_features)
X_test_flat = X_test_scaled.reshape(n_test_samples, timesteps * n_features)

print(f"\nFlattened X_train shape for FCN input: {X_train_flat.shape}")
print(f"Flattened X_test shape for FCN input: {X_test_flat.shape}")


# --- 6. Build and Compile the FCN Model ---
model = Sequential([
    # Input layer's shape is the number of flattened features
    Dense(128, activation='relu', input_shape=(timesteps * n_features,)),
    Dense(64, activation='relu'),
    # Final output layer has 1 neuron for the single regression target
    Dense(1) 
])

# For regression, we use a loss function like Mean Squared Error
model.compile(optimizer='adam', loss='mse')
model.summary()


# --- 7. Train the Model ---
print("\n--- Training Model ---")
history = model.fit(
    X_train_flat, 
    y_train_scaled, 
    epochs=50, 
    batch_size=32,
    validation_split=0.1,
    verbose=1
)


print("\n--- Evaluating Model ---")
loss = model.evaluate(X_test_flat, y_test_scaled, verbose=0)
print(f"Test Loss (MSE on scaled data): {loss:.4f}")

# Make predictions
predictions_scaled = model.predict(X_test_flat)

# Inverse transform to get predictions in the original scale
predictions_actual = scaler_y.inverse_transform(predictions_scaled)
y_test_actual = scaler_y.inverse_transform(y_test_scaled)

# --- NEW: Calculate MAPE ---
# Create a mask to filter out instances where the actual value is zero
non_zero_mask = y_test_actual.flatten() != 0

# Apply the mask to both actual and predicted values
y_test_mape = y_test_actual[non_zero_mask]
predictions_mape = predictions_actual[non_zero_mask]

# Calculate MAPE
if len(y_test_mape) > 0:
    mape = mean_absolute_percentage_error(y_test_mape, predictions_mape) * 100
    print(f"Test MAPE (on non-zero values): {mape:.2f}%")
    print(f"(Calculated on {len(y_test_mape)} out of {len(y_test_actual)} test samples)")
else:
    print("Could not calculate MAPE because all true values in the test set are zero.")

# Show a few examples
print("\n--- Example Predictions (in original scale) ---")
for i in range(5):
    print(f"Predicted: {predictions_actual[i][0]:,.2f} | Actual: {y_test_actual[i][0]:,.2f}")

Target Column: Net Income
Number of Feature Columns: 25

Original X shape (Samples, Timesteps, Features): (257, 4, 25)
Original y shape (Samples,): (257,)

Flattened X_train shape for FCN input: (205, 100)
Flattened X_test shape for FCN input: (52, 100)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)



--- Training Model ---
Epoch 1/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 23ms/step - loss: 0.4654 - val_loss: 0.3934
Epoch 2/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - loss: 0.4972 - val_loss: 0.0714
Epoch 3/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 0.0671 - val_loss: 0.0912
Epoch 4/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 0.0848 - val_loss: 0.0858
Epoch 5/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 0.0421 - val_loss: 0.0387
Epoch 6/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 0.0229 - val_loss: 0.0354
Epoch 7/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - loss: 0.0192 - val_loss: 0.0226
Epoch 8/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 0.0085 - val_loss: 0.0219
Epoch 9/50
[1m6/6[0m [32m━━━━━━━━━━━━

NameError: name 'mean_absolute_percentage_error' is not defined