# Fundamental-Based IPO Preprocessing
This notebook cleans and preprocesses the fundamental IPO data for the Random Forest model.

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Read the raw IPO workbook into a DataFrame and report its dimensions
df = pd.read_excel(io='raw_dataset/Initial Public Offering.xlsx')
print(f"Initial shape: {df.shape}")

## 1. Data Cleaning
- Drop unnamed columns
- Drop rows with all NaN
- Fill missing values (median for numeric columns, mode for all others)

In [None]:
# Drop every column whose label starts with "Unnamed".
# Labels are cast to str first so a non-string header cannot break the match.
unnamed_mask = df.columns.astype(str).str.contains('^Unnamed')
df = df.loc[:, ~unnamed_mask]

# Drop rows in which every remaining value is NaN.
# Reassigning (instead of inplace=True) avoids mutating the slice taken above.
df = df.dropna(how='all')

# Report how many values are missing in each column before imputation
print("Missing values per column:")
print(df.isnull().sum())

# Impute missing values: median for numeric columns, most frequent value
# (mode) for everything else.
for col in df.columns:
    column = df[col]
    # Use the dtype API rather than matching 'float64'/'int64' by name, so
    # other numeric widths (int32, float32, ...) are also median-filled.
    # Booleans count as numeric in pandas but are kept on the mode branch.
    is_numeric = (
        pd.api.types.is_numeric_dtype(column)
        and not pd.api.types.is_bool_dtype(column)
    )
    if is_numeric:
        df[col] = column.fillna(column.median())
    else:
        modes = column.mode()
        if modes.empty:
            # mode() ignores NaN, so an entirely-missing column has no mode;
            # indexing [0] here used to raise. Leave the column untouched.
            print(f"Column {col!r} has no non-null values; left unfilled.")
            continue
        df[col] = column.fillna(modes.iloc[0])

print(f"Shape after basic cleaning: {df.shape}")

## 2. Feature Engineering
- Convert Date into Listing_Year and Listing_Month features
- Encode categorical variables

In [None]:
# Parse the listing date once, derive the calendar features from it,
# then discard the raw column.
listing_dates = pd.to_datetime(df['Date'])
df['Date'] = listing_dates
df['Listing_Year'] = listing_dates.dt.year
df['Listing_Month'] = listing_dates.dt.month
df.drop(columns=['Date'], inplace=True)

# Integer-encode every object-typed column except the IPO name.
# The single encoder is refit for each column, so only the encoding of the
# last processed column remains stored on `le` afterwards.
le = LabelEncoder()
columns_to_encode = [
    name
    for name in df.select_dtypes(include=['object']).columns
    if name != 'IPO_Name'
]
for name in columns_to_encode:
    as_text = df[name].astype(str)
    df[name] = le.fit_transform(as_text)

df.head()

## 3. Scaling
Standardize the numeric (float64/int64) columns with `StandardScaler` before modelling.

In [None]:
# Pick the columns to standardise purely by dtype: only float64 and int64
# columns are selected; columns of any other dtype are left as they are.
features_to_scale = df.select_dtypes(include=['float64', 'int64']).columns
numeric_block = df[features_to_scale]

# Fit the scaler on the selected columns, then write the scaled values back
scaler = StandardScaler().fit(numeric_block)
df[features_to_scale] = scaler.transform(numeric_block)

print("Preprocessing Complete.")

## 4. Save Cleaned Data

In [None]:
# Write the preprocessed frame to CSV without the row index.
# DataFrame.to_csv does not create missing folders, so make sure the output
# directory exists first instead of failing on a fresh checkout.
from pathlib import Path

output_path = Path('cleaned_dataset') / 'cleaned_fundamental_data.csv'
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)
print("Data saved to cleaned_fundamental_data.csv")