# GMP-Based IPO Preprocessing
This notebook cleans and preprocesses the GMP IPO data for the Support Vector Machine models.

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Read the raw GMP performance export into the working frame and report its size
raw_csv_path = 'raw_dataset/Mainline IPO GMP Performance (1).csv'
df = pd.read_csv(raw_csv_path)
print(f"Initial shape: {df.shape}")

Initial shape: (152, 12)


## 1. Data Cleaning
- Clean currency and percentage strings
- Handle missing values (numeric columns are filled with the column median)

In [2]:
def clean_value(val):
    """Convert a formatted currency/percentage string to a float.

    Non-string inputs are returned untouched. For strings, every '₹', ','
    and '%' character is removed, surrounding whitespace is trimmed, and the
    remainder is parsed with ``float``. Strings that still cannot be parsed
    yield ``np.nan``.
    """
    # Guard clause: only strings need any cleaning
    if not isinstance(val, str):
        return val

    # Delete all formatting characters in a single pass
    formatting_chars = str.maketrans('', '', '₹,%')
    numeric_text = val.translate(formatting_chars).strip()

    try:
        return float(numeric_text)
    except ValueError:
        # Placeholder / non-numeric text becomes a missing value
        return np.nan

# Columns that hold formatted numeric text and must be converted to floats
cols_to_clean = [
    'IPO_Size', 'Subscription', 'GMP', 'IPO Price', 'Estimated Price',
    'Estimated Percentage', 'Listing Price', 'Listing Percentage',
    'LT Price', 'LT Percentage',
]

# Only touch the columns that actually exist in the loaded frame
present_cols = [name for name in cols_to_clean if name in df.columns]
for name in present_cols:
    df[name] = df[name].apply(clean_value)

# Fill missing values: each numeric column gets its own median
column_medians = df.median(numeric_only=True)
df = df.fillna(column_medians)

print("Missing values after cleaning:")
remaining_missing = df.isna().sum()
print(remaining_missing)

Missing values after cleaning:
IPO                     0
Listing Date            0
IPO_Size                0
Subscription            0
GMP                     0
IPO Price               0
Estimated Price         0
Estimated Percentage    0
Listing Price           0
Listing Percentage      0
LT Price                0
LT Percentage           0
dtype: int64


## 2. Feature Engineering
- Define Risk Categories based on LT Percentage (below 10 → High, below 30 → Medium, otherwise Low)
- Bifurcate Listing Date into Year and Month
- Explicitly Map Risk Categories to Numerical Values

In [3]:
def classify_risk(perc, high_below=10, medium_below=30):
    """Bucket a percentage value into a risk label.

    Parameters
    ----------
    perc : float
        Percentage value to classify.
    high_below : float, default 10
        Values strictly below this threshold are labelled 'High'.
    medium_below : float, default 30
        Values at or above ``high_below`` and strictly below this threshold
        are labelled 'Medium'.

    Returns
    -------
    str or float
        'High', 'Medium' or 'Low'; ``np.nan`` when ``perc`` is missing.
    """
    # NaN fails every '<' comparison, so without this guard a missing value
    # would fall through to the final branch and be labelled 'Low'.
    if pd.isna(perc):
        return np.nan
    if perc < high_below:
        return 'High'
    if perc < medium_below:
        return 'Medium'
    return 'Low'

# Derive the risk label from 'LT Percentage' when that column is available
if 'LT Percentage' in df.columns:
    df['Risk_Category'] = df['LT Percentage'].apply(classify_risk)

# Date Bifurcation: replace the listing date with separate year / month columns
if 'Listing Date' in df.columns:
    # NOTE(review): the recorded cell output shows pandas emitting a warning
    # for this parse; once the date format of the raw file is confirmed,
    # pass it explicitly via `format=` to silence it.
    listing_dates = pd.to_datetime(df['Listing Date'])
    df['Listing_Year'] = listing_dates.dt.year
    df['Listing_Month'] = listing_dates.dt.month
    df = df.drop(columns='Listing Date')

# Explicit Risk Encoding (Low: 0, Medium: 1, High: 2)
risk_mapping = {'Low': 0, 'Medium': 1, 'High': 2}
# Guarded like the steps above: 'Risk_Category' only exists when
# 'LT Percentage' was present, so an unguarded lookup would raise KeyError.
if 'Risk_Category' in df.columns:
    df['Risk_Category_Encoded'] = df['Risk_Category'].map(risk_mapping)

df.head()

  df['Listing Date'] = pd.to_datetime(df['Listing Date'])


Unnamed: 0,IPO,IPO_Size,Subscription,GMP,IPO Price,Estimated Price,Estimated Percentage,Listing Price,Listing Percentage,LT Price,LT Percentage,Risk_Category,Listing_Year,Listing_Month,Risk_Category_Encoded
0,"ASK Automotive IPO (ASKAUTOLTD,544022)",834.0,51.14,28,282,310,9.93,303,7.55,294.0,4.43,High,2023,11,2
1,Protean eGov Technologies IPO (544021),490.33,23.86,48,792,840,6.06,792,0.0,1109.0,40.04,Low,2023,11,0
2,"ESAF Small Finance IPO (ESAFSFB,544020)",463.0,77.0,16,60,76,26.67,71,18.33,68.0,13.58,Medium,2023,11,1
3,"Honasa Consumer Limited IPO (HONASA,544014)",1701.44,7.61,24,324,348,7.41,330,1.85,475.0,46.64,Low,2023,11,0
4,"Cello World Limited IPO (CELLO,544012)",1900.0,41.69,160,648,808,24.69,829,27.93,781.0,20.54,Medium,2023,11,1


## 3. Scaling

In [4]:
scaler = StandardScaler()

# Candidate columns: everything stored as float64 or int64.
# NOTE(review): columns with other numeric dtypes are not selected here and
# therefore stay unscaled.
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns

# The encoded risk label is the classification target. Standardising it would
# replace the discrete classes 0/1/2 with continuous values, so it is left out
# (errors='ignore' keeps this cell working if the column is absent).
features_to_scale = numeric_cols.drop('Risk_Category_Encoded', errors='ignore')

df[features_to_scale] = scaler.fit_transform(df[features_to_scale])

print("Preprocessing Complete.")

Preprocessing Complete.


## 4. Save Cleaned Data

In [None]:
# Persist the processed frame for the downstream notebooks
output_path = Path('cleaned_dataset') / 'cleaned_gmp_data.csv'

# to_csv does not create missing folders, so make sure the target exists first
output_path.parent.mkdir(parents=True, exist_ok=True)

df.to_csv(output_path, index=False)
# Build the message from the path itself so it cannot drift from the real file name
print(f"Data saved to {output_path.name}")

Data saved to cleaned_gmp_data_1.csv
