# Fundamental-Based IPO Preprocessing
This notebook cleans and preprocesses the fundamental IPO data for the Random Forest model.

In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Read the raw IPO workbook into `df` and report its dimensions.
raw_workbook = 'raw_dataset/Initial Public Offering.xlsx'
df = pd.read_excel(raw_workbook)
n_rows, n_cols = df.shape
print(f"Initial shape: {(n_rows, n_cols)}")

Initial shape: (561, 21)


## 1. Data Cleaning
- Drop unnamed columns
- Drop rows with all NaN
- Impute missing values (median for numeric columns, mode for all other columns)

In [2]:
# Drop unnamed columns, then drop rows with all NaN.
# The two steps are chained (no `inplace=True` on the sliced frame) so the cell
# always rebinds `df` to a fresh DataFrame: it can be re-run safely and cannot
# trigger SettingWithCopyWarning. Column labels are cast to `str` so that a
# non-string header does not break the `.str` accessor.
unnamed_mask = df.columns.astype(str).str.contains('^Unnamed')
df = df.loc[:, ~unnamed_mask].dropna(how='all')

# Check for nulls
print("Missing values per column:")
print(df.isnull().sum())

# Fill numeric nulls with median and categorical with mode.
# Only columns that actually contain nulls are visited; filling a complete
# column is a no-op.
# NOTE(review): median imputation ignores the other values in the same row
# (e.g. a missing CMP(NSE) is filled independently of CMP(BSE)) -- confirm
# this is acceptable for the price columns.
for col in df.columns[df.isnull().any()]:
    column = df[col]
    # Any numeric width counts as numeric (not just float64/int64); booleans
    # are excluded so they are imputed with their mode rather than a median.
    is_numeric = (
        pd.api.types.is_numeric_dtype(column)
        and not pd.api.types.is_bool_dtype(column)
    )
    if is_numeric:
        fill_value = column.median()
    else:
        modes = column.mode()
        # `mode()` is empty when the column has no non-null values, so
        # indexing it unconditionally would raise.
        fill_value = modes.iloc[0] if not modes.empty else None

    if fill_value is None or pd.isna(fill_value):
        # Nothing to impute from: leave the column untouched and say so.
        print(f"Skipping '{col}': no non-null values to impute from")
        continue

    df[col] = column.fillna(fill_value)

print(f"Shape after basic cleaning: {df.shape}")

Missing values per column:
Date                   0
IPO_Name               0
Issue_Size(crores)     0
QIB                    2
HNI                    2
RII                    2
Total                  2
Offer Price            0
List Price             0
Listing Gain           0
CMP(BSE)               2
CMP(NSE)              10
Current Gains          3
dtype: int64
Shape after basic cleaning: (561, 13)


## 2. Feature Engineering
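- Convert `Date` into `Listing_Year` and `Listing_Month` features, then drop `Date`
- Label-encode the remaining categorical (object) columns, leaving `IPO_Name` as text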

In [3]:
# Convert Date into year/month features and drop the original column.
# Guarded on the presence of `Date` so that re-running this cell after the
# column has already been expanded and dropped is a no-op instead of a KeyError.
if 'Date' in df.columns:
    listing_date = pd.to_datetime(df['Date'])
    df = df.assign(
        Listing_Year=listing_date.dt.year,
        Listing_Month=listing_date.dt.month,
    ).drop(columns='Date')

# Encode categorical columns.
# A separate encoder is fitted per column and stored in `label_encoders`
# (keyed by column name) so every column's integer codes can be mapped back
# with `inverse_transform`. `le` is still defined afterwards and, as before,
# ends up as the encoder fitted on the last encoded column (or an unfitted
# encoder when there was nothing to encode).
le = LabelEncoder()
label_encoders = {}
for col in df.select_dtypes(include=['object']).columns:
    if col == 'IPO_Name':
        # Free-text identifier: kept as-is rather than encoded.
        continue
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le

df.head()

Unnamed: 0,IPO_Name,Issue_Size(crores),QIB,HNI,RII,Total,Offer Price,List Price,Listing Gain,CMP(BSE),CMP(NSE),Current Gains,Listing_Year,Listing_Month
0,M & B Engineering Ltd,650.0,36.72,38.24,32.55,36.2,385,386.0,0.26,426.85,426.15,10.87,2025,8
1,Sri Lotus Developers & Realty Ltd,792.0,163.9,57.71,20.28,69.14,150,179.1,19.4,201.1,199.72,34.07,2025,8
2,National Securities Depository Ltd (NSDL),4011.6,103.97,34.98,7.73,41.01,800,880.0,10.0,1294.05,61.76,14.675,2025,8
3,Aditya Infotech Ltd,1300.0,133.21,72.0,50.87,100.69,675,1018.0,50.81,1064.6,1062.7,57.72,2025,8
4,Laxmi India Finance Ltd,254.26,1.3,1.84,2.22,1.87,158,136.0,-13.92,149.0,150.0,-5.7,2025,8


## 3. Scaling
Standardise every `float64`/`int64` column with `StandardScaler` so the numeric columns are on a comparable scale for the model.

In [4]:
# Pick the columns to standardise: everything stored as float64 or int64.
# NOTE(review): this selection is purely dtype-based, so it also covers outcome
# columns such as 'Listing Gain' and skips any column held in another numeric
# width -- confirm that is the intended feature set.
features_to_scale = df.select_dtypes(include=['float64', 'int64']).columns

# Learn the scaling parameters from the selected columns, then overwrite
# those columns with their standardised values.
scaler = StandardScaler()
scaler.fit(df[features_to_scale])
df[features_to_scale] = scaler.transform(df[features_to_scale])

print("Preprocessing Complete.")

Preprocessing Complete.


## 4. Save Cleaned Data

In [None]:
# Write the preprocessed frame to disk without the index column.
# The output folder is created first because `to_csv` does not create missing
# parent directories and would fail on a fresh checkout.
output_path = 'cleaned_dataset/cleaned_fundamental_data.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)
print("Data saved to cleaned_fundamental_data.csv")