# In [None]:
import pandas as pd
import numpy as np

# Set random seed for reproducibility
np.random.seed(42)

# Generate 5,000 candidates
n_candidates = 5000

# Generate JAMB scores according to the specified distribution.
# Every candidate consumes one uniform draw to pick a score band and one
# integer draw inside that band (np.random.randint excludes the upper bound).
scores = []
for _ in range(n_candidates):
    r = np.random.random()
    if r < 0.005:  # 0.5% scored 300 and above
        scores.append(np.random.randint(300, 368))
    elif r < 0.047:  # 4.2% scored 250 to 299
        scores.append(np.random.randint(250, 300))
    elif r < 0.287:  # 24% scored 200 to 249
        scores.append(np.random.randint(200, 250))
    else:  # The remaining scored less than 200
        scores.append(np.random.randint(100, 200))

# Ensure exactly 3 candidates have the highest score of 367.
# The top band is drawn from 300..367, so a candidate that was not selected
# may already hold 367 by chance; such strays are lowered to 366 so that the
# three selected candidates are the only ones at the maximum.
# np.random.choice raises ValueError if fewer than 3 candidates scored >= 300.
top_band_indices = np.where(np.array(scores) >= 300)[0]
top_3_indices = np.random.choice(top_band_indices, 3, replace=False)
top_3_set = set(top_3_indices.tolist())
for idx in top_band_indices.tolist():
    if idx in top_3_set:
        scores[idx] = 367
    elif scores[idx] == 367:
        scores[idx] = 366

# Create initial DataFrame with JAMB scores
df = pd.DataFrame({'JAMB_Score': scores})

# Adjust numerical features
def adjust_feature(correlation, scores=None):
    """Return a standardized random series correlated with the scores.

    The scores are standardized, mixed with independent standard-normal noise
    as ``correlation * z + sqrt(1 - correlation**2) * noise``, and the mix is
    standardized again (mean 0, sample standard deviation 1).

    Args:
        correlation: Target correlation with the scores; must lie in [-1, 1].
        scores: Optional pandas Series to correlate with. Defaults to the
            module-level ``df['JAMB_Score']``.

    Returns:
        A pandas Series aligned with ``scores``.

    Raises:
        ValueError: If ``correlation`` is outside [-1, 1] (the noise weight
            would otherwise be the square root of a negative number and the
            whole result would silently become NaN).
    """
    if not -1 <= correlation <= 1:
        raise ValueError(f"correlation must be within [-1, 1], got {correlation!r}")
    if scores is None:
        scores = df['JAMB_Score']

    # One noise draw per score; for the default column this equals n_candidates.
    noise = np.random.normal(0, 1, len(scores))
    adjusted = correlation * (scores - scores.mean()) / scores.std() + \
               np.sqrt(1 - correlation**2) * noise
    return (adjusted - adjusted.mean()) / adjusted.std()

# Add numerical features.
# Each column takes a fresh standardized series from adjust_feature, rescales
# it, clips it to the allowed range and, for the integer columns, converts
# with astype(int), which truncates rather than rounds.
study_hours_z = adjust_feature(0.4)
df['Study_Hours_Per_Week'] = (study_hours_z * 10 + 20).clip(0, 40).astype(int)

attendance_z = adjust_feature(0.3)
df['Attendance_Rate'] = (attendance_z * 10 + 85).clip(50, 100).astype(int)

teacher_quality_z = adjust_feature(0.3)
df['Teacher_Quality'] = (teacher_quality_z * 1 + 3).clip(1, 5).astype(int)

# The sign flip makes distance move against the JAMB score.
distance_z = adjust_feature(0.1)
df['Distance_To_School'] = (-distance_z * 5 + 10).clip(0, 20).round(1)

# Function to create correlated categorical variables
def create_correlated_categorical(categories, base_probs, correlation_strength=0.3, scores=None):
    """Draw one category per candidate, tilted by that candidate's score.

    Scores are min-max normalized to [0, 1]. For a candidate with normalized
    score ``s`` the probability of the category at position ``i`` is shifted
    by ``correlation_strength * (s - 0.5) * (i - (k - 1) / 2)`` where ``k`` is
    the number of categories, so later categories gain probability for
    above-midrange scores and earlier ones for below-midrange scores. The
    shifted probabilities are clipped to [0.1, 0.9] and renormalized before
    one label is sampled with ``np.random.choice``.

    Args:
        categories: Ordered category labels.
        base_probs: Baseline probability per category, same length as
            ``categories``. It is not modified.
        correlation_strength: Scale of the score-dependent shift.
        scores: Optional pandas Series (or NumPy array) of scores. Defaults to
            the module-level ``df['JAMB_Score']``.

    Returns:
        A list with one sampled label per score, in score order.
    """
    if scores is None:
        scores = df['JAMB_Score']

    # Float copy: protects the caller's array and avoids silent truncation
    # when integer probabilities are passed in.
    base = np.asarray(base_probs, dtype=float)
    # Signed distance of each category position from the middle position;
    # invariant across candidates, so it is built once.
    offsets = np.arange(len(categories)) - (len(categories) - 1) / 2

    score_span = scores.max() - scores.min()
    if score_span == 0:
        # All scores equal: min-max scaling would divide by zero and yield NaN
        # probabilities. Treat every candidate as mid-range (no shift).
        normalized_scores = np.full(len(scores), 0.5)
    else:
        normalized_scores = (scores - scores.min()) / score_span

    result = []
    for score in normalized_scores:
        adjusted_probs = base + correlation_strength * (score - 0.5) * offsets
        adjusted_probs = np.clip(adjusted_probs, 0.1, 0.9)
        adjusted_probs = adjusted_probs / adjusted_probs.sum()

        result.append(np.random.choice(categories, p=adjusted_probs))

    return result

# Add categorical variables with controlled correlations.
# Each entry is (column name, ordered levels, base probabilities, correlation
# strength). The entry order fixes both the column order in the DataFrame and
# the order in which random draws are consumed.
correlated_categoricals = [
    ('School_Type', ['Public', 'Private'], np.array([0.7, 0.3]), 0.4),
    ('School_Location', ['Rural', 'Urban'], np.array([0.4, 0.6]), 0.3),
    ('Extra_Tutorials', ['No', 'Yes'], np.array([0.4, 0.6]), 0.35),
    ('Access_To_Learning_Materials', ['No', 'Yes'], np.array([0.3, 0.7]), 0.3),
    ('Parent_Involvement', ['Low', 'Medium', 'High'], np.array([0.3, 0.4, 0.3]), 0.35),
    ('IT_Knowledge', ['Low', 'Medium', 'High'], np.array([0.3, 0.4, 0.3]), 0.3),
]

for column_name, levels, level_probs, strength in correlated_categoricals:
    df[column_name] = create_correlated_categorical(
        levels,
        base_probs=level_probs,
        correlation_strength=strength,
    )

# Add remaining random features
df['Student_ID'] = range(1, n_candidates + 1)
# Ages are integers from 15 to 22 (the upper bound of randint is exclusive).
df['Age'] = np.random.randint(low=15, high=23, size=n_candidates)

# Adjust gender distribution to exactly match 50.6% female and 49.4% male:
# build the labels with fixed counts, then shuffle them in place.
female_count = int(n_candidates * 0.506)
male_count = n_candidates - female_count
gender_list = ['Female'] * female_count
gender_list.extend(['Male'] * male_count)
np.random.shuffle(gender_list)
df['Gender'] = gender_list

# Socioeconomic status is tied to the JAMB score like the other categoricals.
ses_levels = ['Low', 'Medium', 'High']
ses_base_probs = np.array([0.3, 0.4, 0.3])
df['Socioeconomic_Status'] = create_correlated_categorical(
    ses_levels,
    base_probs=ses_base_probs,
    correlation_strength=0.4,
)

# Create Parent_Education_Level based on Socioeconomic_Status and JAMB_Score
def assign_education_level(row, score_bounds=None):
    """Sample a parent education level for one candidate row.

    The base probabilities depend on the row's ``Socioeconomic_Status`` and
    are shifted by the row's min-max normalized ``JAMB_Score`` (0 at the
    lowest score, 1 at the highest; despite the variable name below this is
    not a percentile rank). Probabilities are clipped to [0.01, 0.99] and
    renormalized before sampling.

    Args:
        row: A row of the DataFrame providing ``JAMB_Score`` and
            ``Socioeconomic_Status``.
        score_bounds: Optional ``(min_score, max_score)`` pair used for the
            normalization. When omitted it is computed from the module-level
            ``df['JAMB_Score']`` on every call, which is slow when applied
            row by row.

    Returns:
        One of ``'None'``, ``'Primary'``, ``'Secondary'``, ``'Tertiary'``.
    """
    if score_bounds is None:
        score_bounds = (df['JAMB_Score'].min(), df['JAMB_Score'].max())
    lowest_score, highest_score = score_bounds

    score_percentile = (row['JAMB_Score'] - lowest_score) / (highest_score - lowest_score)
    if row['Socioeconomic_Status'] == 'Low':
        probs = [0.4 - 0.2*score_percentile, 0.35, 0.2 + 0.1*score_percentile, 0.05 + 0.1*score_percentile]
    elif row['Socioeconomic_Status'] == 'Medium':
        probs = [0.1 - 0.05*score_percentile, 0.3 - 0.1*score_percentile, 0.4, 0.2 + 0.15*score_percentile]
    else:  # High
        probs = [0.05 - 0.04*score_percentile, 0.15 - 0.1*score_percentile, 0.3, 0.5 + 0.14*score_percentile]

    probs = np.clip(probs, 0.01, 0.99)
    probs = probs / np.sum(probs)
    # NOTE(review): the label 'None' appears to be among pandas.read_csv's
    # default NA markers in recent pandas versions, so reading the saved CSV
    # back with default settings would turn this category into missing
    # values - confirm that consumers pass keep_default_na=False.
    return np.random.choice(['None', 'Primary', 'Secondary', 'Tertiary'], p=probs)

# The score range is the same for every row, so compute it once and forward
# it to each call instead of rescanning the whole column per row.
jamb_score_bounds = (df['JAMB_Score'].min(), df['JAMB_Score'].max())
df['Parent_Education_Level'] = df.apply(
    assign_education_level, axis=1, score_bounds=jamb_score_bounds
)

# Assignments_Completed is slightly correlated with study hours:
# one tenth of the weekly study hours plus standard-normal noise, clipped to
# the range 1..5 and converted to an integer with astype(int).
assignment_noise = np.random.normal(0, 1, n_candidates)
assignments_raw = df['Study_Hours_Per_Week'] / 10 + assignment_noise
df['Assignments_Completed'] = assignments_raw.clip(1, 5).astype(int)

# Verify gender distribution (shares, not counts)
gender_distribution = df['Gender'].value_counts(normalize=True)
print("\nGender Distribution:", gender_distribution, sep="\n")

# Calculate and display correlations
def calculate_correlation(categorical_col, data=None, target='JAMB_Score'):
    """Return the mean of ``target`` for each level of ``categorical_col``.

    Despite the name, no correlation coefficient is computed: the result is
    the per-category mean, sorted from the highest mean to the lowest.

    Args:
        categorical_col: Name of the column to group by.
        data: Optional DataFrame to summarize. Defaults to the module-level
            ``df``.
        target: Name of the numeric column to average.

    Returns:
        A pandas Series indexed by category, in descending order of mean.
    """
    if data is None:
        data = df
    category_means = data.groupby(categorical_col)[target].mean()
    return category_means.sort_values(ascending=False)

print("\nMean JAMB Scores by Category:")
categorical_vars = [
    'School_Type',
    'School_Location',
    'Extra_Tutorials',
    'Parent_Involvement',
    'IT_Knowledge',
    'Access_To_Learning_Materials',
    'Socioeconomic_Status',
    'Parent_Education_Level',
]

# One heading plus one table of per-category means for each variable.
for var in categorical_vars:
    print(f"\n{var}:", calculate_correlation(var), sep="\n")

# Calculate correlation for numerical variables: take the JAMB_Score column of
# the pairwise correlation matrix and list it from strongest positive down.
numerical_vars = [
    'Study_Hours_Per_Week',
    'Attendance_Rate',
    'Teacher_Quality',
    'Distance_To_School',
    'Assignments_Completed',
]
score_correlations = df[[*numerical_vars, 'JAMB_Score']].corr()['JAMB_Score']
correlation_matrix = score_correlations.sort_values(ascending=False)
print("\nCorrelation with JAMB Score (Numerical Variables):", correlation_matrix, sep="\n")

# Save to CSV; index=False keeps the DataFrame's row index out of the file.
output_path = 'jamb_exam_results.csv'
df.to_csv(output_path, index=False)
print("\nDataset saved as 'jamb_exam_results.csv'")