In [2]:
import pandas as pd
import numpy as np
from faker import Faker
import random
from datetime import datetime, timedelta
import string

# Reproducibility setup: a single seed value shared by every random source.
SEED = 42

fake = Faker()
Faker.seed(SEED)      # Faker
random.seed(SEED)     # stdlib `random`
np.random.seed(SEED)  # NumPy's global generator

# Helper functions
def random_phone():
    """Return a random phone-like string shaped ``NNN-NNN-NNNN``.

    The first two groups are drawn from 200-999 and the last from 1000-9999
    (bounds inclusive), in that order, using the stdlib ``random`` module.
    """
    area_code = random.randint(200, 999)
    exchange = random.randint(200, 999)
    line_number = random.randint(1000, 9999)
    return "-".join(str(part) for part in (area_code, exchange, line_number))

def random_date(start_date, end_date):
    """Return a Faker-generated date between ``start_date`` and ``end_date``.

    Thin wrapper around ``fake.date_between``; both bounds are forwarded
    unchanged as keyword arguments.
    """
    window = {'start_date': start_date, 'end_date': end_date}
    return fake.date_between(**window)

def random_gpa():
    """Return a random GPA drawn uniformly from 2.0-4.0, rounded to 2 decimals."""
    raw_gpa = random.uniform(2.0, 4.0)
    return round(raw_gpa, 2)

def random_gre():
    """Return a random integer GRE score in the inclusive range 260-340."""
    lowest, highest = 260, 340
    return random.randint(lowest, highest)

def random_toefl():
    """Return a random integer TOEFL score in the inclusive range 60-120."""
    lowest, highest = 60, 120
    return random.randint(lowest, highest)

# Generate universities
def generate_universities(n=50):
    """Build a DataFrame of ``n`` synthetic universities.

    Each row carries a unique Faker company name suffixed with " University",
    a "City, ST" location, an application deadline drawn from the 2024
    calendar window, a tier label and three random numeric metrics.

    Parameters
    ----------
    n : int, default 50
        Number of university rows to generate.

    Returns
    -------
    pandas.DataFrame
        One row per university, without an ID column.
    """
    deadline_window = (datetime(2024, 1, 1), datetime(2024, 12, 31))

    def build_university():
        # Dict entries are evaluated top to bottom, which fixes the order in
        # which the seeded generators are consumed.
        return {
            'university_name': fake.unique.company() + " University",
            'location': fake.city() + ", " + fake.state_abbr(),
            'application_deadline': random_date(*deadline_window),
            'tier': random.choice(['Top', 'Mid', 'Low']),
            'avg_alumni_salary': random.randint(40000, 120000),
            'placement_rate': round(random.uniform(50, 95), 2),
            'value_score': round(random.uniform(3.0, 9.5), 2)
        }

    return pd.DataFrame([build_university() for _ in range(n)])

# Generate students with some dirty data
def generate_students(n=1000):
    """Build a synthetic student table that deliberately contains dirty rows.

    Roughly 15% of the rows produced by the main loop are flagged dirty and
    receive one of four defects, chosen uniformly:

    * ``'missing'``   - an otherwise valid record whose ``email`` is ``None``;
    * ``'invalid'``   - malformed email/phone, out-of-range GRE/TOEFL/GPA,
      a negative ``work_experience`` and an unrecognised ``financial_status``;
    * ``'outlier'``   - maximum scores, 120 months of work experience and a
      date of birth for ages 40-50;
    * ``'duplicate'`` - a copy of the record generated immediately before it.
      If no record exists yet, a clean record is generated instead.

    The ``'duplicate'`` case used to have no branch of its own: it re-appended
    whatever ``student`` held from the previous iteration, and raised
    ``UnboundLocalError`` when drawn on the very first iteration. Copying the
    previous record keeps the seeded output identical for runs that worked.

    After the main loop, another ``int(n * 0.05)`` rows picked at random from
    the list built so far are appended again as exact duplicates.

    Parameters
    ----------
    n : int, default 1000
        Number of rows produced by the main loop; the returned frame has
        ``n + int(n * 0.05)`` rows.

    Returns
    -------
    pandas.DataFrame
        One row per student record, without an ID column.
    """

    def _valid_record(with_email=True):
        # Dict entries are evaluated top to bottom, so the key order also
        # fixes the order in which the seeded generators are consumed.
        return {
            'first_name': fake.first_name(),
            'last_name': fake.last_name(),
            'email': fake.email() if with_email else None,
            'gre_score': random_gre(),
            'toefl_score': random_toefl(),
            'gpa': random_gpa(),
            'work_experience': random.randint(0, 60),
            'preferred_location': fake.city(),
            'phone': random_phone(),
            'date_of_birth': fake.date_of_birth(minimum_age=18, maximum_age=35),
            'financial_status': random.choice(['Low', 'Medium', 'High']),
            'loan_risk_score': round(random.uniform(1.0, 10.0), 2)
        }

    def _invalid_record():
        return {
            'first_name': fake.first_name(),
            'last_name': fake.last_name(),
            'email': 'invalid_email',  # Invalid format
            'gre_score': random.randint(100, 259),  # Below valid range
            'toefl_score': random.randint(121, 150),  # Above valid range
            'gpa': round(random.uniform(4.1, 5.0), 2),  # Above valid range
            'work_experience': -5,  # Negative value
            'preferred_location': fake.city(),
            'phone': '123',  # Invalid phone
            'date_of_birth': fake.date_of_birth(minimum_age=18, maximum_age=35),
            'financial_status': 'Unknown',  # Invalid value
            'loan_risk_score': round(random.uniform(1.0, 10.0), 2)
        }

    def _outlier_record():
        return {
            'first_name': fake.first_name(),
            'last_name': fake.last_name(),
            'email': fake.email(),
            'gre_score': 340,  # Max score (not invalid but unusual)
            'toefl_score': 120,  # Max score
            'gpa': 4.0,  # Perfect GPA
            'work_experience': 120,  # 10 years experience (unusual for student)
            'preferred_location': fake.city(),
            'phone': random_phone(),
            'date_of_birth': fake.date_of_birth(minimum_age=40, maximum_age=50),  # Older than typical
            'financial_status': random.choice(['Low', 'Medium', 'High']),
            'loan_risk_score': round(random.uniform(1.0, 10.0), 2)
        }

    students = []
    for _ in range(n):
        # Randomly decide if we'll create a dirty record, and of which kind.
        make_dirty = random.random() < 0.15
        dirty_type = (
            random.choice(['missing', 'invalid', 'outlier', 'duplicate'])
            if make_dirty else None
        )

        if dirty_type == 'missing':
            student = _valid_record(with_email=False)
        elif dirty_type == 'invalid':
            student = _invalid_record()
        elif dirty_type == 'outlier':
            student = _outlier_record()
        elif dirty_type == 'duplicate' and students:
            # Explicit copy of the previous row; consumes no random numbers.
            student = dict(students[-1])
        else:
            # Clean record, or a 'duplicate' drawn before any row exists.
            student = _valid_record()

        students.append(student)

    # Add some duplicates (5% of records)
    num_duplicates = int(n * 0.05)
    for _ in range(num_duplicates):
        students.append(random.choice(students))

    return pd.DataFrame(students)

# Generate programs
def generate_programs(universities, n_per_uni=3):
    """Build a DataFrame of synthetic programs for the given universities.

    Every university receives between 1 and ``n_per_uni`` programs (inclusive).
    A program's ``university_id`` is the university's index label plus one,
    so the frame is expected to carry a 0-based integer index.

    Parameters
    ----------
    universities : pandas.DataFrame
        Only its index is used.
    n_per_uni : int, default 3
        Upper bound on programs per university; ``random.randint`` raises
        ``ValueError`` when it is smaller than 1.

    Returns
    -------
    pandas.DataFrame
        One row per program, without an ID column.
    """
    catalog = [
        "Computer Science", "Business Administration", "Electrical Engineering",
        "Biology", "Psychology", "Economics", "Mechanical Engineering",
        "Political Science", "Chemistry", "Mathematics", "English Literature",
        "History", "Physics", "Art", "Music"
    ]

    def build_program(owner_label):
        # Key order fixes the order in which the seeded generators are used.
        return {
            'university_id': owner_label + 1,  # Assuming IDs start at 1
            'program_name': random.choice(catalog),
            'program_level': random.choice(['Undergraduate', 'Graduate']),
            'duration': random.choice([2, 4]),  # Years
            'tuition_fee': random.randint(5000, 50000),
            'avg_program_salary': random.randint(30000, 110000),
            'program_placement_rate': round(random.uniform(50, 95), 2),
            'description': fake.sentence()
        }

    rows = [
        build_program(owner_label)
        for owner_label in universities.index
        # The per-university program count is drawn once per university.
        for _ in range(random.randint(1, n_per_uni))
    ]
    return pd.DataFrame(rows)

# Generate admissions criteria
def generate_admissions(programs):
    """Build one synthetic admissions-criteria row per program.

    ``program_id`` is the program's index label plus one, and
    ``university_id`` is copied from the program row. Minimum requirements
    are floored at 260 (GRE), 60 (TOEFL) and 2.5 (GPA); the averages of
    admitted students are drawn independently of those minimums.

    Parameters
    ----------
    programs : pandas.DataFrame
        Must contain a ``university_id`` column.

    Returns
    -------
    pandas.DataFrame
        One row per program, without an ID column.
    """

    def criteria_for(label, row):
        owner = row['university_id']
        # The draws below happen in the same order as the keys they fill.
        gre_floor = max(260, random_gre() - random.randint(0, 20))
        toefl_floor = max(60, random_toefl() - random.randint(0, 15))
        gpa_floor = max(2.5, round(random_gpa() - 0.3, 2))
        experience = random.randint(0, 12)
        mean_gre = random_gre()
        mean_gpa = random_gpa()
        acceptance = round(random.uniform(5, 50), 2)
        return {
            'program_id': label + 1,
            'university_id': owner,
            'min_gre': gre_floor,
            'min_toefl': toefl_floor,
            'min_gpa': gpa_floor,
            'required_work_experience': experience,
            'admission_year': 2024,
            'avg_admitted_gre': mean_gre,
            'avg_admitted_gpa': mean_gpa,
            'acceptance_rate': acceptance
        }

    rows = [criteria_for(label, row) for label, row in programs.iterrows()]
    return pd.DataFrame(rows)

# Generate historical admission data
def generate_historical_admissions(programs, students, n_per_program=20):
    """Build synthetic historical admission outcomes for every program.

    Each program receives between 5 and ``n_per_program`` records (inclusive).
    Every record copies the scores of one randomly chosen student, and each
    copied field is independently replaced by ``None`` about 10% of the time.
    The admission result and year are drawn at random, independently of the
    student's scores.

    Parameters
    ----------
    programs : pandas.DataFrame
        Must contain a ``university_id`` column; ``program_id`` is taken as
        the row's index label plus one.
    students : pandas.DataFrame
        Must provide ``gre_score``, ``toefl_score``, ``gpa`` and
        ``work_experience`` columns.
    n_per_program : int, default 20
        Upper bound on records per program; ``random.randint`` raises
        ``ValueError`` when it is smaller than 5.

    Returns
    -------
    pandas.DataFrame
        One row per historical record, without an ID column.
    """
    # Convert the student frame once. This is loop-invariant, and converting
    # it for every generated record repeated the full-frame conversion
    # needlessly. It consumes no random numbers, so seeded output is unchanged.
    student_records = students.to_dict('records')

    historical = []
    for prog_id, prog_row in programs.iterrows():
        for _ in range(random.randint(5, n_per_program)):
            student = random.choice(student_records)
            historical.append({
                'program_id': prog_id + 1,
                'university_id': prog_row['university_id'],
                'student_gre': student['gre_score'] if random.random() > 0.1 else None,  # 10% missing
                'student_toefl': student['toefl_score'] if random.random() > 0.1 else None,
                'student_gpa': student['gpa'] if random.random() > 0.1 else None,
                'student_work_experience': student['work_experience'] if random.random() > 0.1 else None,
                'admission_result': random.choice(['Admitted', 'Rejected']),
                'admission_year': random.randint(2018, 2023)
            })
    return pd.DataFrame(historical)

# Build every table. The call order matters: all generators draw from the
# same seeded random sources, and later tables are derived from earlier ones.
universities = generate_universities()
students = generate_students()
programs = generate_programs(universities)
admissions = generate_admissions(programs)
historical_admissions = generate_historical_admissions(programs, students)

# Attach a 1-based sequential ID column to each table.
for _table, _id_column in (
    (universities, 'university_id'),
    (students, 'student_id'),
    (programs, 'program_id'),
    (admissions, 'admission_id'),
    (historical_admissions, 'record_id'),
):
    _table[_id_column] = range(1, len(_table) + 1)

In [3]:
universities

Unnamed: 0,university_name,location,application_deadline,tier,avg_alumni_salary,placement_rate,value_score,university_id
0,"Rodriguez, Figueroa and Sanchez University","Lake Curtis, WA",2024-11-21,Low,54592,51.13,4.79,1
1,Garza Inc University,"North Jefferyhaven, PA",2024-01-10,Top,58289,83.14,7.4,2
2,Baldwin Ltd University,"Robinsonshire, KY",2024-10-22,Low,51395,76.57,3.21,3
3,Blake and Sons University,"Petersonberg, IL",2024-12-15,Top,68657,60.47,6.91,4
4,Garcia-James University,"Melanieview, AS",2024-01-16,Low,66062,82.22,7.56,5
5,Herrera-Dudley University,"Millerport, MP",2024-08-17,Mid,68893,70.21,4.81,6
6,Ray-Bush University,"Lake Jeremyport, CO",2024-11-08,Top,60926,81.42,5.21,7
7,James Group University,"Franciscostad, IL",2024-09-01,Top,68221,93.07,5.19,8
8,"Zuniga, Wong and Lynch University","New Jessica, GA",2024-06-17,Top,89797,54.35,8.51,9
9,Mayo-Bowman University,"Lake Mark, WI",2024-10-10,Low,74671,86.32,7.74,10
