In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

In [5]:
# Canonical column names the prepared dataset must contain, in output order.
_REQUIRED_COLUMNS = [
    'UID',
    'MaritalStatus',
    'CreditScore',
    'AnnualIncome',
    'NetWorth',
    'EmploymentStatus',
    'RiskScore',
    'AgeGroup',
    'PhoneNumber',
    'Email',
]

# Known alternative column names -> canonical column name.
_COLUMN_ALIASES = {
    'id': 'UID',
    'user_id': 'UID',
    'lead_id': 'UID',
    'marital_status': 'MaritalStatus',
    'marital': 'MaritalStatus',
    'credit_score': 'CreditScore',
    'credit': 'CreditScore',
    'annual_income': 'AnnualIncome',
    'income': 'AnnualIncome',
    'yearly_income': 'AnnualIncome',
    'net_worth': 'NetWorth',
    'networth': 'NetWorth',
    'wealth': 'NetWorth',
    'employment_status': 'EmploymentStatus',
    'employment': 'EmploymentStatus',
    'job_status': 'EmploymentStatus',
    'risk_score': 'RiskScore',
    'risk': 'RiskScore',
    'age_group': 'AgeGroup',
    'age': 'AgeGroup',
    'age_bracket': 'AgeGroup',
    'phone_number': 'PhoneNumber',
    'phone': 'PhoneNumber',
    'contact': 'PhoneNumber',
    'email_address': 'Email',
    'email': 'Email',
    'mail': 'Email',
}

# Lower-cased raw marital status -> standardized label.
_MARITAL_MAPPING = {
    'single': 'Single',
    's': 'Single',
    'married': 'Married',
    'm': 'Married',
    'married with kids': 'Married with Kids',
    'mwk': 'Married with Kids',
    'divorced': 'Divorced',
    'd': 'Divorced',
    'widowed': 'Widowed',
    'w': 'Widowed',
}

# Lower-cased raw employment status -> standardized label.
_EMPLOYMENT_MAPPING = {
    'employed': 'Employed',
    'full-time': 'Employed',
    'part-time': 'Employed',
    'self-employed': 'Self-Employed',
    'self employed': 'Self-Employed',
    'entrepreneur': 'Self-Employed',
    'unemployed': 'Unemployed',
    'jobless': 'Unemployed',
    'retired': 'Retired',
    'student': 'Student',
    'studying': 'Student',
}

_NUMERICAL_COLUMNS = ['CreditScore', 'AnnualIncome', 'NetWorth', 'RiskScore']
_AGE_GROUPS = ('18-25', '26-35', '36-50', '51+')
_DEFAULT_AGE_GROUP = '26-35'


def _read_lead_file(input_file):
    """Load the raw dataset, choosing the pandas reader from the file extension.

    `.xlsx`/`.xls` are read with `pd.read_excel`, `.json` with `pd.read_json`;
    every other extension is read as CSV.
    """
    suffix = Path(input_file).suffix.lower()
    if suffix in ('.xlsx', '.xls'):
        return pd.read_excel(input_file)
    if suffix == '.json':
        return pd.read_json(input_file)
    return pd.read_csv(input_file)


def _standardize_age_group(age_value):
    """Map a raw age or age-bracket value onto one of the four age groups.

    Missing, unparseable, and negative values fall back to '26-35'.
    """
    if pd.isna(age_value):
        return _DEFAULT_AGE_GROUP

    age_str = str(age_value).lower().strip()
    if age_str in _AGE_GROUPS:
        return age_str
    if age_str in ('46-55', '56-65', '65+'):
        return '51+'

    try:
        age_num = int(float(age_str))
    except (ValueError, OverflowError):
        # Non-numeric text, or 'nan'/'inf' style values that cannot become an int.
        return _DEFAULT_AGE_GROUP

    if age_num < 0:
        return _DEFAULT_AGE_GROUP
    # Ages below 18 are clamped into the youngest bucket instead of falling
    # through to '51+'.
    if age_num <= 25:
        return '18-25'
    if age_num <= 35:
        return '26-35'
    if age_num <= 50:
        return '36-50'
    return '51+'


def prepare_lead_dataset(input_file, output_file=None):
    """
    Prepare and standardize lead dataset for ML pipeline

    Steps: load `input_file`, rename known column aliases to their canonical
    names, keep only the required columns, standardize UID / marital status /
    employment status / age group, coerce the numeric columns (median-filling
    values that are missing or not numeric), clip CreditScore to 300-850 and
    RiskScore to 0-100, normalize phone and email text, drop duplicate UIDs
    (keeping the first), print a quality report, and write the result as CSV.

    Args:
        input_file: Path to your dataset file (CSV, Excel, or JSON)
        output_file: Optional output file path (defaults to input_file with '_prepared' suffix)

    Returns:
        The prepared DataFrame, or None when a required column is missing
        after alias renaming (nothing is written in that case).
    """

    print(f"📂 Loading dataset from: {input_file}")

    df = _read_lead_file(input_file)

    print(f"📊 Original dataset: {df.shape[0]} rows, {df.shape[1]} columns")
    print(f"📋 Original columns: {list(df.columns)}")

    # Rename each alias at most once per canonical name, and never onto a
    # canonical column that already exists, so no duplicate column names arise.
    taken = set(df.columns)
    rename_map = {}
    for alias, canonical in _COLUMN_ALIASES.items():
        if alias in df.columns and canonical not in taken:
            rename_map[alias] = canonical
            taken.add(canonical)
    df_prepared = df.rename(columns=rename_map)

    missing_columns = [col for col in _REQUIRED_COLUMNS if col not in df_prepared.columns]
    if missing_columns:
        print(f"❌ Missing required columns: {missing_columns}")
        return None

    # Explicit copy so the assignments below never write into a slice of `df`.
    df_prepared = df_prepared[_REQUIRED_COLUMNS].copy()

    print("🧹 Cleaning and standardizing data...")

    # Prefix only the UIDs that lack 'LEAD_', so a mixed column does not end
    # up with 'LEAD_LEAD_...' values.
    uid = df_prepared['UID'].astype(str)
    df_prepared['UID'] = uid.where(uid.str.startswith('LEAD_'), 'LEAD_' + uid.str.zfill(6))

    # Unrecognized or missing categories fall back to a default label.
    df_prepared['MaritalStatus'] = (
        df_prepared['MaritalStatus'].astype(str).str.lower().str.strip()
        .map(_MARITAL_MAPPING).fillna('Single')
    )
    df_prepared['EmploymentStatus'] = (
        df_prepared['EmploymentStatus'].astype(str).str.lower().str.strip()
        .map(_EMPLOYMENT_MAPPING).fillna('Employed')
    )

    df_prepared['AgeGroup'] = df_prepared['AgeGroup'].apply(_standardize_age_group)

    for col in _NUMERICAL_COLUMNS:
        df_prepared[col] = pd.to_numeric(df_prepared[col], errors='coerce')

    print("🔧 Handling missing values...")
    for col in _NUMERICAL_COLUMNS:
        df_prepared[col] = df_prepared[col].fillna(df_prepared[col].median())

    df_prepared['CreditScore'] = df_prepared['CreditScore'].clip(300, 850)
    df_prepared['RiskScore'] = df_prepared['RiskScore'].clip(0, 100)

    # Keep missing contact details missing rather than the literal text 'nan'.
    phone = df_prepared['PhoneNumber']
    df_prepared['PhoneNumber'] = phone.astype(str).str.strip().where(phone.notna())
    email = df_prepared['Email']
    df_prepared['Email'] = email.astype(str).str.lower().str.strip().where(email.notna())

    df_prepared = df_prepared.drop_duplicates(subset=['UID'], keep='first')

    print("\n📊 Data Quality Report:")
    print(f"   - Final dataset size: {df_prepared.shape[0]} rows")
    print(f"   - Credit Score range: {df_prepared['CreditScore'].min():.0f} - {df_prepared['CreditScore'].max():.0f}")
    print(f"   - Annual Income range: ${df_prepared['AnnualIncome'].min():,.0f} - ${df_prepared['AnnualIncome'].max():,.0f}")
    print(f"   - Net Worth range: ${df_prepared['NetWorth'].min():,.0f} - ${df_prepared['NetWorth'].max():,.0f}")
    print(f"   - Risk Score range: {df_prepared['RiskScore'].min():.1f} - {df_prepared['RiskScore'].max():.1f}")
    print(f"   - Marital Status distribution: {df_prepared['MaritalStatus'].value_counts().to_dict()}")
    print(f"   - Employment Status distribution: {df_prepared['EmploymentStatus'].value_counts().to_dict()}")
    print(f"   - Age Group distribution: {df_prepared['AgeGroup'].value_counts().to_dict()}")

    if output_file is None:
        output_file = str(Path(input_file).with_suffix('')) + '_prepared.csv'

    df_prepared.to_csv(output_file, index=False)
    print(f"✅ Prepared dataset saved to: {output_file}")

    return df_prepared

In [3]:
# Execute the preparation step on the raw lead export and summarize the outcome.
input_file = r'E:\AI Lead\Dataset\AI_lead_Dataset.csv'
prepared_df = prepare_lead_dataset(input_file)

# A None result means required columns were missing and nothing was written.
if prepared_df is None:
    print("❌ Dataset preparation failed. Please check the column mappings.")
else:
    print("\n🎉 Dataset preparation completed successfully!")
    print(f"📝 Dataset shape: {prepared_df.shape}")
    print(f"📊 Age groups: {sorted(prepared_df['AgeGroup'].unique())}")

📂 Loading dataset from: E:\AI Lead\Dataset\AI_lead_Dataset.csv
📊 Original dataset: 20000 rows, 10 columns
📋 Original columns: ['UID', 'MaritalStatus', 'CreditScore', 'AnnualIncome', 'NetWorth', 'EmploymentStatus', 'RiskScore', 'AgeGroup', 'PhoneNumber', 'Email']
🧹 Cleaning and standardizing data...
🔧 Handling missing values...

📊 Data Quality Report:
   - Final dataset size: 20000 rows
   - Credit Score range: 343 - 712
   - Annual Income range: $15,000 - $485,341
   - Net Worth range: $1,000 - $2,603,208
   - Risk Score range: 28.8 - 84.0
   - Marital Status distribution: {'Married': 10041, 'Single': 6078, 'Divorced': 2882, 'Widowed': 999}
   - Employment Status distribution: {'Employed': 17036, 'Self-Employed': 1573, 'Unemployed': 1391}
   - Age Group distribution: {'36-50': 9028, '26-35': 4962, '51+': 3628, '18-25': 2382}
✅ Prepared dataset saved to: E:\AI Lead\Dataset\AI_lead_Dataset_prepared.csv

🎉 Dataset preparation completed successfully!
📝 Dataset shape: (20000, 10)
📊 Age grou

In [6]:
# During training: write a preprocessing artifact to "preprocessor.pkl" with joblib.
# NOTE(review): `Pipeline` below is the scikit-learn class itself, not a built
# or fitted pipeline instance, so the saved file carries no preprocessing steps
# or learned state. Presumably a fitted Pipeline object was meant to be dumped
# here — confirm and pass that instance instead.
import joblib
from sklearn.pipeline import Pipeline
joblib.dump(Pipeline, "preprocessor.pkl")


['preprocessor.pkl']