In [4]:
# 📌 Step 1: Import Libraries
import pandas as pd
import numpy as np
import os

from sklearn.preprocessing import LabelEncoder

# 📌 Step 2: Load Dataset
# The path is relative to the working directory the notebook runs from.
df = pd.read_csv('../Data/raw_data.csv')
print("✅ Dataset Loaded. Shape:", df.shape)

# 📌 Step 3: Drop Unnecessary Columns
# errors='ignore' makes this a no-op when the column is not present.
df.drop(columns=['Appeal_ID'], errors='ignore', inplace=True)

# 📌 Step 4: Handle Missing Values
# Remove every row that has at least one missing value.
df.dropna(axis=0, how='any', inplace=True)
print("✅ After Dropping NA. Shape:", df.shape)

# 📌 Step 5: Encode Date (convert to numerical days)
# errors='coerce' turns unparseable dates into NaT instead of raising.
df['Appeal_Date'] = pd.to_datetime(df['Appeal_Date'], errors='coerce')

# NaT.toordinal() raises ValueError, and the NA drop in Step 4 ran before
# the coercion above, so rows whose date failed to parse must be removed
# here before the ordinal conversion.
invalid_date_count = int(df['Appeal_Date'].isna().sum())
if invalid_date_count:
    print("⚠️ Dropping rows with unparseable Appeal_Date:", invalid_date_count)
    df.dropna(subset=['Appeal_Date'], inplace=True)

# Replace each timestamp with its proleptic Gregorian ordinal (an integer
# day count), giving a plain numeric column.
df['Appeal_Date'] = df['Appeal_Date'].map(pd.Timestamp.toordinal)

# 📌 Step 6: Label Encode Categorical Columns
categorical_cols = ['Applicant_Gender', 'Department_Name', 'Language', 'Appeal_Type']

# One dedicated encoder per column, kept by column name so the fitted
# mappings can be saved and reused later.
label_encoders = {col: LabelEncoder() for col in categorical_cols}

for col, le in label_encoders.items():
    # Fit on the column's values and overwrite them with integer codes.
    df[col] = le.fit_transform(df[col])

# 📌 Step 7: Encode Target Column (Appeal_Category)
# The target gets its own encoder, kept separate from the feature encoders.
target_encoder = LabelEncoder().fit(df['Appeal_Category'])
df['Appeal_Category'] = target_encoder.transform(df['Appeal_Category'])

# (Optional) Save encoders if needed later
import joblib

# Create the model directory if it does not exist yet, then write each
# fitted encoder object to its own pickle file.
os.makedirs('../App/model', exist_ok=True)
for artifact, destination in (
    (label_encoders, '../App/model/label_encoders.pkl'),
    (target_encoder, '../App/model/target_encoder.pkl'),
):
    joblib.dump(artifact, destination)

# 📌 Step 8: Save Preprocessed Data
os.makedirs('../Data', exist_ok=True)
# index=False keeps the DataFrame index out of the CSV.
df.to_csv('../Data/preprocessed_data.csv', index=False)
print("✅ Preprocessed data saved to 'Data/preprocessed_data.csv'")


✅ Dataset Loaded. Shape: (2500, 8)
✅ After Dropping NA. Shape: (2500, 7)
✅ Preprocessed data saved to 'Data/preprocessed_data.csv'
