In [None]:
import numpy as np
import pandas as pd
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Seed both random sources used by this notebook (NumPy's global RNG and the
# stdlib `random` module) so every run regenerates the same synthetic dataset.
np.random.seed(42)
random.seed(42)

# NOTE: the leading emoji was previously stored as mis-decoded UTF-8 bytes
# (mojibake) and printed as garbage; it is restored to the intended character.
print("🎯 Generating Improved Synthetic Dataset...")
print("=" * 60)

In [None]:
# Program catalog: 50 programs across diverse fields.
# Each entry is (name, short description, space-separated tag string). The tag
# string is the text later used for content-similarity matching; multi-word
# concepts are written as single hyphenated tags (e.g. "machine-learning").
programs = [
    # STEM Programs
    ("Computer Science", "Programming, algorithms, software development", "programming algorithms software development technology computer coding"),
    ("Data Science", "Analytics, machine learning, big data", "data analytics statistics machine-learning programming python"),
    ("Artificial Intelligence", "AI, neural networks, deep learning", "ai machine-learning algorithms programming neural-networks deep-learning"),
    ("Cybersecurity", "Network security, ethical hacking", "security networks programming cryptography technology ethical-hacking"),
    ("Software Engineering", "Large-scale software development", "programming software engineering algorithms computer technology"),
    ("Information Systems", "Business technology integration", "technology business databases programming systems"),
    ("Mathematics", "Pure and applied mathematics", "math calculus algebra statistics problem-solving theory"),
    ("Statistics", "Statistical analysis and modeling", "statistics math data analytics research probability"),
    ("Physics", "Physical sciences and research", "physics math science research theory engineering"),
    ("Chemistry", "Chemical sciences and research", "chemistry science research lab experiments molecules"),
    ("Biology", "Life sciences and research", "biology genetics ecology science research life-sciences"),
    ("Biochemistry", "Chemical processes in living organisms", "chemistry biology science research lab molecules"),
    ("Molecular Biology", "Genetics and molecular research", "biology genetics research science molecules lab"),
    ("Environmental Science", "Ecology and sustainability", "environment ecology science biology sustainability research"),
    ("Biotechnology", "Applied biological sciences", "biology technology science research genetics lab"),

    # Engineering Programs
    ("Mechanical Engineering", "Machines and systems design", "engineering mechanics math physics design technology"),
    ("Electrical Engineering", "Electronics and circuits", "engineering electronics circuits math physics technology"),
    ("Civil Engineering", "Infrastructure and construction", "engineering construction design math structures"),
    ("Chemical Engineering", "Industrial chemistry and processes", "engineering chemistry math physics technology industrial"),
    ("Aerospace Engineering", "Aircraft and spacecraft design", "engineering physics math aerospace design technology"),
    ("Biomedical Engineering", "Medical devices and systems", "engineering biology medicine technology medical design"),
    ("Environmental Engineering", "Sustainability engineering solutions", "engineering environment ecology sustainability technology"),
    ("Industrial Engineering", "Optimization and efficiency", "engineering business math optimization technology systems"),

    # Business Programs
    ("Business Administration", "General business management", "business management leadership finance marketing strategy"),
    ("Finance", "Financial markets and investment", "finance business math economics investment banking"),
    ("Accounting", "Financial reporting and auditing", "accounting finance business math reporting analysis"),
    ("Marketing", "Brand strategy and consumer insights", "marketing business creativity analytics communication strategy"),
    ("Economics", "Markets and economic systems", "economics math business statistics theory research"),
    ("Business Analytics", "Data-driven business decisions", "analytics data business statistics programming technology"),
    ("Entrepreneurship", "Startups and innovation", "business entrepreneurship innovation strategy management creativity"),
    ("Human Resources", "People management and organizational development", "business management psychology communication people leadership"),
    ("Supply Chain Management", "Logistics and operations", "business operations logistics management strategy systems"),

    # Arts and Humanities
    ("Fine Arts", "Painting, sculpture, visual arts", "art painting drawing sculpture creativity expression"),
    ("Graphic Design", "Visual communication and branding", "design art creativity software technology branding visual"),
    ("Architecture", "Building design and urban planning", "design architecture drawing engineering creativity structures"),
    ("Landscape Architecture", "Outdoor spaces and environmental design", "design ecology architecture drawing environment creativity"),
    ("Interior Design", "Space planning and aesthetics", "design creativity art architecture space aesthetics"),
    ("Fashion Design", "Clothing and textile design", "design creativity art fashion textiles drawing"),
    ("Literature", "Writing and literary analysis", "literature writing reading analysis creativity research"),
    ("Creative Writing", "Fiction, poetry, and storytelling", "writing creativity storytelling literature expression imagination"),
    ("Journalism", "News and media reporting", "writing journalism communication media research investigation"),
    ("Communications", "Media and public relations", "communication writing media marketing public-relations strategy"),
    ("Film Production", "Video and cinematography", "film creativity technology production storytelling media"),

    # Social Sciences
    ("Psychology", "Human behavior and mental processes", "psychology science research behavior therapy counseling"),
    ("Sociology", "Society and social behavior", "sociology research society behavior culture analysis"),
    ("Political Science", "Government and political systems", "politics government research policy law society"),
    ("Anthropology", "Human cultures and evolution", "anthropology culture research society history biology"),
    ("History", "Historical research and analysis", "history research analysis writing culture society"),

    # Health Sciences
    ("Nursing", "Healthcare and patient care", "nursing healthcare medicine biology science patient-care"),
    ("Public Health", "Community health and epidemiology", "health biology science research community epidemiology"),
]

programs_df = pd.DataFrame(programs, columns=["name", "description", "tags_text"])

# Stable, zero-padded identifiers in catalog order: p_000, p_001, ...
programs_df["program_id"] = [f"p_{idx:03d}" for idx in range(len(programs_df))]

# NOTE: the check mark was previously stored as mojibake; restored here.
print(f"✓ Generated {len(programs_df)} programs")
programs_df.head()

In [None]:
# Interest vocabulary users draw from. The keywords mirror tags used in the
# program catalog so that user interests can be matched against program tags.
interest_keywords = [
    "programming", "algorithms", "software", "technology", "computer", "coding",
    "data", "analytics", "statistics", "machine-learning", "python",
    "ai", "neural-networks", "deep-learning",
    "security", "networks", "cryptography", "ethical-hacking",
    "math", "calculus", "algebra", "probability", "theory",
    "physics", "science", "research", "lab", "experiments",
    "chemistry", "molecules", "biology", "genetics", "ecology", "life-sciences",
    "environment", "sustainability",
    "engineering", "mechanics", "electronics", "circuits", "construction",
    "aerospace", "medicine", "medical", "optimization", "systems",
    "business", "management", "leadership", "finance", "investment", "banking",
    "accounting", "reporting", "marketing", "communication", "strategy",
    "economics", "entrepreneurship", "innovation", "operations", "logistics",
    "art", "painting", "drawing", "sculpture", "creativity", "expression",
    "design", "branding", "visual", "architecture", "structures", "space",
    "aesthetics", "fashion", "textiles",
    "literature", "writing", "reading", "storytelling", "imagination",
    "journalism", "media", "public-relations", "film", "production",
    "psychology", "behavior", "therapy", "counseling",
    "sociology", "society", "culture", "politics", "government", "policy", "law",
    "anthropology", "history", "analysis",
    "nursing", "healthcare", "patient-care", "health", "community", "epidemiology"
]

# Generate 500 synthetic users, each a tuple of
# (user_id, interests_text, math_grade, science_grade, language_grade).
num_users = 500
users = []

for i in range(num_users):
    # Each user gets 4-8 distinct interests, sampled without replacement.
    num_interests = random.randint(4, 8)
    interests = random.sample(interest_keywords, num_interests)
    interests_text = " ".join(interests)

    # Grades are drawn from N(75, 12), clipped to [40, 100] and truncated to
    # int. The three draws happen in the fixed order math -> science ->
    # language, so a seeded run consumes the RNG stream in the same order.
    math_grade, science_grade, language_grade = (
        int(np.clip(np.random.normal(75, 12), 40, 100)) for _ in range(3)
    )

    users.append((
        f"u_{i:04d}",  # zero-padded id: u_0000 ... u_0499
        interests_text,
        math_grade,
        science_grade,
        language_grade,
    ))

users_df = pd.DataFrame(
    users,
    columns=["user_id", "interests_text", "math_grade", "science_grade", "language_grade"],
)

# NOTE: the check mark was previously stored as mojibake; restored here.
print(f"✓ Generated {len(users_df)} users")
print(f"  Avg interests per user: {users_df['interests_text'].str.split().str.len().mean():.1f}")
users_df.head()

In [None]:
# Score every (user, program) pair by content similarity. These scores drive
# interaction sampling, which creates patterns collaborative filtering can
# learn from.

# TF-IDF over the tag vocabulary.
# The default token pattern treats "-" as a separator, which would split
# compound tags such as "neural-networks" into "neural" + "networks" and make
# them spuriously match unrelated tags (e.g. the plain "networks" tag). The
# custom pattern keeps hyphenated tags as single tokens while still requiring
# at least two characters per token, as the default does.
vectorizer = TfidfVectorizer(
    stop_words='english',
    token_pattern=r"(?u)\b\w[\w-]+\b",
)

# Fit on program tags and user interests together so both are projected into
# one shared vocabulary / IDF weighting.
all_text = list(programs_df['tags_text']) + list(users_df['interests_text'])
vectorizer.fit(all_text)

program_vectors = vectorizer.transform(programs_df['tags_text'])
user_vectors = vectorizer.transform(users_df['interests_text'])

# Cosine similarity with users as rows and programs as columns:
# similarity_matrix[u, p] is the score of user u against program p.
similarity_matrix = cosine_similarity(user_vectors, program_vectors)

# NOTE: the check mark was previously stored as mojibake; restored here.
print("✓ Calculated user-program similarity matrix")
print(f"  Shape: {similarity_matrix.shape}")
print(f"  Avg similarity: {similarity_matrix.mean():.3f}")

In [None]:
# Generate interactions with realistic patterns.
# Users are more likely to interact with programs similar to their interests,
# which creates learnable patterns for collaborative filtering.

interactions = []

# Loop-invariant lookups, hoisted out of the per-user loop.
program_ids = programs_df['program_id'].values
num_programs = len(program_ids)

# Only the user id is needed per row, so iterate that column directly instead
# of materialising a Series per row with DataFrame.iterrows(). Row order is
# unchanged, so user_idx still indexes the matching similarity_matrix row.
for user_idx, user_id in enumerate(users_df['user_id']):
    # Similarity of this user to every program.
    scores = similarity_matrix[user_idx]

    # Number of interactions: 12-20 per user (realistic engagement), capped at
    # the catalog size because sampling below is without replacement.
    num_interactions = min(random.randint(12, 20), num_programs)

    # Probabilistic selection: higher similarity = higher probability.
    # Squaring emphasises high similarities.
    weights = scores ** 2
    weight_total = weights.sum()
    if weight_total > 0:
        probabilities = weights / weight_total
    else:
        # A user with zero similarity to every program would otherwise yield
        # NaN probabilities (0 / 0) and make np.random.choice raise; fall back
        # to a uniform distribution for that user.
        probabilities = np.full(num_programs, 1.0 / num_programs)

    # Add noise so selection is not purely content-based:
    # 70% similarity-driven, 30% random exploration.
    noise = np.random.random(num_programs)
    combined_scores = 0.7 * probabilities + 0.3 * (noise / noise.sum())
    # Re-normalise to absorb floating-point drift before sampling.
    combined_scores = combined_scores / combined_scores.sum()

    # Select distinct programs for this user (without replacement).
    selected_programs = np.random.choice(
        program_ids,
        size=num_interactions,
        replace=False,
        p=combined_scores,
    )

    # Record interactions (all positive for this dataset).
    interactions.extend((user_id, program_id, 1) for program_id in selected_programs)

interactions_df = pd.DataFrame(interactions, columns=["user_id", "program_id", "interaction"])

# NOTE: the check mark was previously stored as mojibake; restored here.
print(f"✓ Generated {len(interactions_df)} interactions")
print(f"  Avg interactions per user: {len(interactions_df) / len(users_df):.1f}")
print(f"  Avg interactions per program: {len(interactions_df) / len(programs_df):.1f}")

# Density = share of user-program pairs with an interaction; sparsity is its
# complement. Both are expressed as percentages.
total_possible = len(users_df) * len(programs_df)
sparsity = (1 - len(interactions_df) / total_possible) * 100
density = (len(interactions_df) / total_possible) * 100

print(f"  Density: {density:.2f}%")
print(f"  Sparsity: {sparsity:.2f}%")

interactions_df.head(10)

## Save Datasets

Save the improved synthetic data with:
- **500 users** with diverse interests and realistic grades
- **50 programs** across multiple fields  
- **~8,000 interactions** with learnable patterns
- **Low sparsity** (~68%) for better CF performance

In [None]:
# Save datasets to CSV
from pathlib import Path

# Output directory, relative to the notebook's working directory. Create it
# (and any missing parents) first so to_csv does not fail on a fresh checkout.
raw_dir = Path("../data/raw")
raw_dir.mkdir(parents=True, exist_ok=True)

users_df.to_csv(raw_dir / "users.csv", index=False)
programs_df.to_csv(raw_dir / "programs.csv", index=False)
interactions_df.to_csv(raw_dir / "interactions.csv", index=False)

# Final summary. The emoji below were previously stored as mojibake and are
# restored to the intended characters; `sparsity` is the percentage computed
# when the interactions were generated.
print("\n" + "=" * 60)
print("✅ DATA GENERATION COMPLETE!")
print("=" * 60)
print("\n📊 Dataset Summary:")
print(f"  Users: {len(users_df)}")
print(f"  Programs: {len(programs_df)}")
print(f"  Interactions: {len(interactions_df)}")
print(f"  Sparsity: {sparsity:.2f}%")
print("\n💾 Files saved:")
print("  - data/raw/users.csv")
print("  - data/raw/programs.csv")
print("  - data/raw/interactions.csv")
print("\n🚀 Next Steps:")
print("  1. Run notebook 02 to train content-based model")
print("  2. Run notebook 03 to train collaborative filtering model")
print("  3. Run notebook 04 to create hybrid model")
print("  4. Run notebook 05 to evaluate with 80/20 split")