In [None]:
def clean_universities(df):
    """Clean the universities table and report what was removed.

    Steps, in order:
      1. Title-case ``university_name``.
      2. Drop rows whose (normalized) ``university_name`` repeats, keeping
         the first occurrence.
      3. Normalize ``location`` so every comma is followed by exactly one
         space and preceded by none (``"City, ST"``).
      4. Keep only rows whose ``tier`` is one of ``Top``/``Mid``/``Low`` and
         whose ``avg_alumni_salary``, ``placement_rate`` and ``value_score``
         fall inside their accepted inclusive ranges.

    Args:
        df: DataFrame with the columns ``university_name``, ``location``,
            ``tier``, ``avg_alumni_salary``, ``placement_rate`` and
            ``value_score``. It is not modified.

    Returns:
        A ``(cleaned_df, stats)`` tuple. ``stats['original']`` holds the
        input row count and per-column missing-value counts;
        ``stats['cleaned']`` holds the final row count plus how many rows
        were removed as duplicates, as invalid, and in total.
    """
    # Document original stats
    original_stats = {
        'rows': len(df),
        'missing_values': df.isnull().sum().to_dict()
    }

    # Work on a private copy so the column assignments below never touch
    # the caller's frame and never act on a filtered view.
    df = df.copy()

    # 1. Handle missing data - no dedicated step; rows with missing values
    #    in a validated column fail the checks in step 6 and are dropped.

    # 3. Fix structural errors
    # Normalize capitalization BEFORE de-duplicating so that names differing
    # only in case are recognized as the same university.
    df['university_name'] = df['university_name'].str.title()

    # 2. Remove duplicates
    rows_before_dedup = len(df)
    df = df.drop_duplicates(subset=['university_name']).copy()
    duplicates_removed = rows_before_dedup - len(df)

    # 4. Handle outliers - none expected in this table
    # 5. Standardize data
    # Standardize location format (City, ST): tolerate missing or extra
    # whitespace on either side of the comma.
    df['location'] = df['location'].str.replace(r'\s*,\s*', ', ', regex=True)

    # 6. Validate data integrity
    valid_tiers = ['Top', 'Mid', 'Low']
    rows_before_validation = len(df)
    is_valid = (
        df['tier'].isin(valid_tiers)
        & df['avg_alumni_salary'].between(20000, 200000)
        & df['placement_rate'].between(0, 100)
        & df['value_score'].between(0, 10)
    )
    df = df[is_valid]
    invalid_rows_removed = rows_before_validation - len(df)

    # Document cleaning results
    cleaned_stats = {
        'rows_after_cleaning': len(df),
        # Only rows dropped by de-duplication, not by validation.
        'duplicates_removed': duplicates_removed,
        'invalid_rows_removed': invalid_rows_removed,
        'total_rows_removed': original_stats['rows'] - len(df)
    }

    return df, {'original': original_stats, 'cleaned': cleaned_stats}

# Run the cleaning pipeline on `universities` (defined outside this cell) and
# unpack its two results: the cleaned DataFrame and the before/after stats dict.
cleaned_universities, uni_stats = clean_universities(universities)