# 01 - Data Preprocessing
This notebook covers data loading, cleaning, encoding, and normalization.

In [None]:
# Step 1: Import libraries
import pandas as pd
import numpy as np

In [None]:
# Step 2: Load dataset
# Alternative input (previously commented out): '../data/raw/dataset.csv'
dataset_path = '../data/sample_dataset_structure.csv'
df = pd.read_csv(dataset_path)

# Preview the first rows as the cell output
df.head()

In [None]:
# Step 3: Handle missing values, fix column names, and drop unnecessary columns

# Strip extra spaces from column names first, so the typo fix below still
# matches when the raw header carries leading/trailing whitespace.
df.columns = df.columns.str.strip()

# Rename the column with a typo
df = df.rename(columns={"Conatct Person Role": "Contact Person Role"})

# Trim whitespace inside the non-numeric columns only; numeric columns keep
# their dtype. Missing cells are restored to NaN after the string conversion:
# a blanket astype(str) would turn them into the literal text 'nan', hiding
# them from isnull() and creating a fake 'nan' category for the encoders.
for col in df.select_dtypes(exclude=np.number).columns:
    df[col] = df[col].astype(str).str.strip().where(df[col].notna())

# Convert Budget in USD to numeric; entries that cannot be parsed become NaN
df['Budget in USD'] = pd.to_numeric(df['Budget in USD'], errors='coerce')

# Display the first few rows of the DataFrame (optional)
print(df.head())

# Mean imputation: fill missing 'Budget in USD' values with the mean of the
# existing values. The result is assigned back to the column rather than
# calling fillna(inplace=True) on the selection, which is chained assignment
# and does not update df under pandas copy-on-write.
mean_budget = df['Budget in USD'].mean()
df['Budget in USD'] = df['Budget in USD'].fillna(mean_budget)

print("\nAfter Mean Imputation:")
print(df.isnull().sum())

# Drop the 'Client ID' column
df = df.drop(columns=['Client ID'])

print("\nDataFrame after dropping 'Client ID':")
print(df.head())

In [None]:
# Step 4: Encoding categorical variables
# One-hot encode "Company Size", "Required Service", "Meeting Type",
# "Quotation Type" and "Contact Person Role".
from sklearn.preprocessing import OneHotEncoder

categorical_cols = ["Company Size", "Required Service", "Meeting Type", "Quotation Type", "Contact Person Role"]

# handle_unknown='ignore' makes a later transform() emit all-zero indicators
# for categories that were not present when the encoder was fitted.
# sparse_output=False returns a dense array that can be wrapped in a DataFrame.
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# Fit and transform the selected categorical columns
encoded_data = encoder.fit_transform(df[categorical_cols])

# Wrap the encoded array in a DataFrame that reuses df's index. pd.concat
# aligns on index labels, so a default RangeIndex here would misalign rows
# (and introduce NaN rows) whenever df's index is not 0..n-1.
encoded_df = pd.DataFrame(
    encoded_data,
    columns=encoder.get_feature_names_out(categorical_cols),
    index=df.index,
)

# Replace the original categorical columns with their one-hot indicators
df = pd.concat([df.drop(columns=categorical_cols), encoded_df], axis=1)

print("\nDataFrame after One-Hot Encoding:")
print(df.head())
print("\nColumn names after One-Hot Encoding:")
print(df.columns)
print("\nShape after One-Hot Encoding:")
# A bare expression in the middle of a cell is not displayed, so print it.
print(df.shape)
# Ordinal encoding for "Potential Label", "Engagement Level" and "Urgency Level"
from sklearn.preprocessing import OrdinalEncoder


def add_ordinal_column(frame, column, ordered_levels):
    """Add ``<column>_Encoded`` to ``frame`` and return the fitted encoder.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame holding ``column``; the encoded column is added to it in place.
    column : str
        Name of the column to encode.
    ordered_levels : list
        Allowed values, lowest first. A value's position in this list is the
        code it receives (first -> 0, second -> 1, ...).

    Returns
    -------
    OrdinalEncoder
        The encoder fitted on ``frame[[column]]``.

    Raises
    ------
    ValueError
        If the column contains a value (missing values included) that is not
        listed in ``ordered_levels``.
    """
    # Check up front so the error names the DataFrame column; the encoder's
    # own message only reports a positional index, which is always 0 here.
    unexpected = frame.loc[~frame[column].isin(ordered_levels), column].unique()
    if len(unexpected) > 0:
        raise ValueError(
            f"Column {column!r} contains values outside {list(ordered_levels)}: "
            f"{list(unexpected)}"
        )

    fitted = OrdinalEncoder(categories=[ordered_levels])
    frame[f"{column}_Encoded"] = fitted.fit_transform(frame[[column]])
    return fitted


# "Potential Label" with the order No=0, Low=1, Medium=2, High=3
category_order = [['No', 'Low', 'Medium', 'High']]
ordinal_encoder = add_ordinal_column(df, 'Potential Label', category_order[0])

# Display the first few rows to verify
print("\nDataFrame after Ordinal Encoding:")
print(df[['Potential Label', 'Potential Label_Encoded']].head())

# "Engagement Level" with the order Low=0, Medium=1, High=2
engagement_order = [['Low', 'Medium', 'High']]
engagement_encoder = add_ordinal_column(df, 'Engagement Level', engagement_order[0])

# Display the first few rows to verify the new encoding
print("\nDataFrame after Ordinal Encoding 'Engagement Level':")
print(df[['Engagement Level', 'Engagement Level_Encoded']].head())

# "Urgency Level" with the order Low=0, Medium=1, High=2
urgency_order = [['Low', 'Medium', 'High']]
urgency_encoder = add_ordinal_column(df, 'Urgency Level', urgency_order[0])

# Display the first few rows to verify the new encoding
print("\nDataFrame after Ordinal Encoding 'Urgency Level':")
print(df[['Urgency Level', 'Urgency Level_Encoded']].head())


In [None]:
# Step 5: Apply MinMaxScaler or StandardScaler
# Min-Max scaling is the one applied here. Every numeric column (including the
# encoded ones) is rescaled, except the 'Potential Label_Encoded' column.
from sklearn.preprocessing import MinMaxScaler

# Numeric column names, minus the excluded label column
numeric_columns = df.select_dtypes(include=np.number).columns
columns_to_normalize = numeric_columns[numeric_columns != 'Potential Label_Encoded'].tolist()

# Fit the scaler on the selected columns and write the rescaled values back
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df[columns_to_normalize])
df[columns_to_normalize] = scaled_values

print("\nDataFrame after Min-Max Scaling:")
print(df.head())
print("\nColumns that were normalized:")
columns_to_normalize