# Data Cleaning Notebook

This notebook handles the cleaning and preprocessing of raw vendor performance data.

In [None]:
import pandas as pd
import numpy as np

# Load raw data
df = pd.read_csv('../data/raw/vendor_data.csv')

# Display basic info about the dataset.
# DataFrame.info() writes its report straight to stdout and returns None,
# so it is called directly; wrapping it in print() would add a stray "None".
df.info()
print(df.describe())

In [None]:
# Handle missing values
print("Missing values before cleaning:")
print(df.isnull().sum())

# Impute column by column: object (text) columns get their most frequent
# value, every other column gets its median.
for col in df.columns:
    # Nothing to impute: skip so no mode/median is computed needlessly.
    if not df[col].isnull().any():
        continue

    if df[col].dtype == 'object':
        modes = df[col].mode()
        # mode() ignores missing values, so an all-missing column yields an
        # empty result; there is no value to impute with, leave it as is.
        if modes.empty:
            continue
        fill_value = modes[0]
    else:
        fill_value = df[col].median()

    # Assign the result back instead of calling fillna(inplace=True) on
    # df[col]: the chained in-place form acts on an intermediate object and
    # does not update df when pandas Copy-on-Write is active.
    df[col] = df[col].fillna(fill_value)

In [None]:
# Drop rows that are exact duplicates of an earlier row and report how many
# rows were removed by comparing the row count before and after.
initial_count = df.shape[0]
df.drop_duplicates(inplace=True)  # modifies df in place, keeps first occurrence
final_count = df.shape[0]
print(f"Removed {initial_count - final_count} duplicate rows")

In [None]:
# Outlier detection and treatment
def remove_outliers(
    df: pd.DataFrame,
    column: str,
    *,
    factor: float = 1.5,
) -> pd.DataFrame:
    """Return the rows of *df* whose *column* value lies inside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1
    and Q3 are the 25th and 75th percentiles of ``df[column]``. Both fences
    are inclusive. Rows whose value in *column* is missing fail both
    comparisons and are therefore dropped as well.

    Args:
        df: Frame to filter; it is not modified.
        column: Label of the numeric column used to compute the fences.
        factor: Multiplier applied to the IQR. The default of 1.5 reproduces
            the conventional Tukey fences; larger values keep more rows.

    Returns:
        The filtered rows, keeping their original index labels.
    """
    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    margin = factor * (q3 - q1)

    within_fences = (values >= q1 - margin) & (values <= q3 + margin)
    return df[within_fences]

# Collect the numeric column labels once, before any rows are dropped.
numeric_columns = df.select_dtypes(include=np.number).columns

# Filter one column at a time. df is rebound on every pass, so the quartiles
# for each later column are computed on the rows that survived earlier passes.
for col in numeric_columns:
    df = remove_outliers(df, column=col)

In [None]:
# Save cleaned data
from pathlib import Path

output_path = '../data/processed/cleaned_vendor_data.csv'

# to_csv does not create missing folders, so make sure the target directory
# exists before writing (no-op when it is already there).
Path(output_path).parent.mkdir(parents=True, exist_ok=True)

# index=False: the row index is not written to the file.
df.to_csv(output_path, index=False)
print(f"Cleaned data saved to {output_path}")