In [2]:
import pandas as pd

# Read the two raw CSV inputs (user profiles first, then schemes) into DataFrames.
user_profiles, schemes = (
    pd.read_csv(csv_path)
    for csv_path in (
        r"E:\Users\Pc\EYTechathonDemo\myproject\dataset\user_profiles.csv",
        r"E:\Users\Pc\EYTechathonDemo\myproject\dataset\schemes.csv",
    )
)

# Preprocessing User Profiles
def preprocess_user_profiles(df):
    """Validate, type-cast and normalise the user-profile table.

    The frame is modified in place and the same object is returned.

    Processing steps:
      1. Confirm every required column exists (checked up front, so an
         invalid frame is rejected before any column is rewritten).
      2. Cast each column to its expected dtype. Missing boolean flags
         become ``False``.
      3. Lower-case the categorical text columns and replace missing
         entries with a placeholder label.
      4. Min-max scale ``income`` into the range [0, 1].

    Args:
        df: DataFrame containing at least the columns named in
            ``required_columns`` below.

    Returns:
        The cleaned DataFrame (the same object that was passed in).

    Raises:
        ValueError: If a required column is absent. pandas also raises
            when a column cannot be cast to its target dtype (for example
            a missing ``age``, which has no sensible default here).
    """
    required_columns = {
        'user_id': 'object',
        'name': 'object',
        'age': 'int64',
        'gender': 'object',
        'location': 'object',
        'income': 'float64',
        'employment_status': 'object',
        'education_level': 'object',
        'disability_status': 'bool',
        'user_preferences': 'object'
    }
    # Categorical text columns and the label used where a value is missing.
    text_defaults = {
        'gender': 'unknown',
        'location': 'unknown',
        'employment_status': 'unknown',
        'education_level': 'unknown',
        'user_preferences': 'none'
    }

    # Verify all columns first so a failure never leaves df half-converted.
    for col in required_columns:
        if col not in df.columns:
            raise ValueError(f"Column '{col}' is missing from user_profiles.")

    # Convert types
    for col, dtype in required_columns.items():
        if dtype == 'bool':
            # NaN is truthy, so a plain astype('bool') would turn every
            # missing flag into True. Mask missing entries to False instead.
            df[col] = df[col].notna() & df[col].astype('bool')
        else:
            df[col] = df[col].astype(dtype)

    # Normalize categorical columns and handle their missing values
    for col, default in text_defaults.items():
        df[col] = df[col].str.lower().fillna(default)

    # Normalize income
    income = df['income']
    lowest = income.min()
    span = income.max() - lowest
    if span > 0:
        df['income'] = (income - lowest) / span
    else:
        # Every income is identical (or none is valid): dividing by a zero
        # span would yield NaN for the whole column, so map valid rows to 0.0.
        df['income'] = income - lowest

    return df

# Preprocessing Schemes
def preprocess_schemes(df):
    """Validate, type-cast and normalise the schemes table.

    The frame is modified in place and the same object is returned.

    Processing steps:
      1. Confirm every required column exists (checked up front, so an
         invalid frame is rejected before any column is rewritten).
      2. Cast each column to its expected dtype. A missing ``Disability``
         flag becomes ``False``.
      3. Lower-case the categorical text columns and replace missing
         entries with a placeholder label (``'all'`` for ``Age Group``).

    Args:
        df: DataFrame containing at least the columns named in
            ``required_columns`` below.

    Returns:
        The cleaned DataFrame (the same object that was passed in).

    Raises:
        ValueError: If a required column is absent.
    """
    required_columns = {
        'scheme_id': 'object',
        'scheme_name': 'object',
        'gender': 'object',
        'sector': 'object',
        'Disability': 'bool',
        'Age Group': 'object'
    }
    # Categorical text columns and the label used where a value is missing.
    text_defaults = {
        'gender': 'unknown',
        'sector': 'unknown',
        'Age Group': 'all'
    }

    # Verify all columns first so a failure never leaves df half-converted.
    for col in required_columns:
        if col not in df.columns:
            raise ValueError(f"Column '{col}' is missing from schemes.")

    # Convert types
    for col, dtype in required_columns.items():
        if dtype == 'bool':
            # NaN is truthy, so a plain astype('bool') would turn every
            # missing flag into True. Mask missing entries to False instead.
            df[col] = df[col].notna() & df[col].astype('bool')
        else:
            df[col] = df[col].astype(dtype)

    # Normalize categorical columns and handle their missing values
    for col, default in text_defaults.items():
        df[col] = df[col].str.lower().fillna(default)

    return df

# Apply preprocessing functions
# Clean both tables, persist them as CSV, then preview the results.
# Any failure along the way is reported as a single printed message.
try:
    user_profiles_cleaned = preprocess_user_profiles(user_profiles)
    schemes_cleaned = preprocess_schemes(schemes)

    # Write each cleaned frame to its output file (row index omitted).
    for cleaned_frame, output_path in (
        (user_profiles_cleaned, r"E:\Users\Pc\EYTechathonDemo\myproject\cleaned_data\user_profiles_cleaned.csv"),
        (schemes_cleaned, r"E:\Users\Pc\EYTechathonDemo\myproject\cleaned_data\schemes_cleaned.csv"),
    ):
        cleaned_frame.to_csv(output_path, index=False)

    # Show the leading rows of each cleaned table.
    print("User Profiles Cleaned:\n", user_profiles_cleaned.head())
    print("\nSchemes Cleaned:\n", schemes_cleaned.head())
except Exception as err:
    print(f"Error during preprocessing: {err}")

User Profiles Cleaned:
   user_id   name  age  gender     location    income employment_status  \
0      U1  User1   47  female  west bengal  0.960312          employed   
1      U2  User2   20  lgbtq+  maharashtra  0.148932        unemployed   
2      U3  User3   55  lgbtq+   tamil nadu  0.738814        unemployed   
3      U4  User4   47    male  west bengal  0.861808        unemployed   
4      U5  User5   26    male  west bengal  0.143222          employed   

  education_level  disability_status user_preferences  
0         primary               True           health  
1       secondary              False            study  
2       secondary              False            study  
3         primary              False            study  
4        graduate               True            study  

Schemes Cleaned:
   scheme_id              scheme_name  gender      sector  Disability  \
0        S1  Beti Bachao Beti Padhao  female       study       False   
1        S2   Janani Suraksha Yo