In [20]:
import pandas as pd
import numpy as np
from scipy.stats import truncnorm
from typing import List, Dict

In [21]:
# Output schema, in the exact order rows are emitted by the generator.
columns = [
    "user", "course", "course_grade", "course_topic",  # who / which course and topic
    "easy_correct", "medium_correct", "hard_correct",  # per-difficulty correctness scores
    "upcoming_assignment", "days_to_deadline",  # deadline-related features
    "target",  # label column, filled in after generation
]

# Start from an empty frame so the CSV is reset to nothing but its header row.
df = pd.DataFrame(columns=columns)
df.to_csv("synthetic_data.csv", index=False, mode="w")

# Course codes to generate data for; each one is looked up as a key of
# `course_topics` by generate_synthetic_data.
courses = ["CMSC320", "CMSC351", "COMM107", "MATH240", "MATH241", "MATH246"]

# Course code -> list of topic names. The generator emits one row per
# (user, course, topic), so the list lengths determine the dataset size.
course_topics = {
    "CMSC320": [
        "Introduction to Data Science and Data Type",
        "Experiment Design",
        "Pandas and SQL",
        "Probability, Distributions, and Summary Stats",
        "Hypothesis Testing",
        "Data Visualization",
        "Data Exploration",
        "Data Cleaning",
        "Introduction to Machine Learning",
        "Feature Engineering",
        "Decision Tree",
        "Classifications",
        "Regression",
        "Unsupervised Learning & Dimensionality Reduction",
        "Introduction to Neural Network",
        "Image Processing",
        "Intro to Natural Language Processing",
        "Introduction to Graph Theory",
        "Recommender System",
        "Data Ethics",
    ],
    "CMSC351": [
        "Introduction",
        "Coin Changing",
        "Big Notation",
        "Rigorous Time",
        "Maximum Contiguous Sum",
        "BubbleSort",
        "SelectionSort",
        "InsertionSort",
        "Binary Search",
        "Recurrence Relations, Trees, Digging Down",
        "Master Theorem",
        "MergeSort",
        "Heaps and HeapSort",
        "QuickSort",
        "Limitations on Comparison-Based Sorting",
        "CountingSort",
        "RadixSort",
        "Integer Multiplication",
        "Graphs",
        "Shortest Path",
        "Breadth-First Traverse",
        "Depth-First Traverse",
        "Dijkstras Algorithm",
        "Floyds Algorithm",
        "Spanning Trees",
        "Minimax Algorithm",
        "Huffmans Algorithm",
        "P and NP",
    ],
    "COMM107": [
        "Informative Presentation",
        "Persuasive Presentation",
        "Small Group Presentation",
        "Final Presentation",
    ],
    "MATH240": [
        "Systems of Linear Equations",
        "Row Reduction and Echelon Forms",
        "Vector Equations",
        "The Matrix Equation",
        "Solution Sets of Linear Systems",
        "Linear Independence",
        "Introduction to Linear Transformations",
        "The Matrix of a Linear Transformation",
        "Matrix Operations",
        "The Inverse of a Matrix",
        "Characterizations of Invertible Matrices",
        "Subspaces of Rn",
        "Introduction to Determinants",
        "Properties of Determinants",
        "Cramers Rule, Volume, and Linear Transformations",
        "Vector Spaces and Subspaces",
        "Null Spaces, Column Spaces, Row Spaces, and Linear Transformations",
        "Linearly Independent Sets and Bases",
        "Coordinate Systems",
        "The Dimension of a Vector Space",
        "Change of Basis",
        "Eigenvectors and Eigenvalues",
        "The Characteristic Equation",
        "Diagonalization",
        "Eigenvectors and Linear Transformations",
        "Complex Eigenvalues",
        "Inner Product, Length, and Orthogonality",
        "Orthogonal Sets",
        "Orthogonal Projections",
        "The Gram-Schmidt Process",
        "Least-Squares Problems",
        "Machine Learning and Linear Models",
        "Inner Product Spaces",
        "Diagonalization of Symmetric Matrices",
        "The Singular Value Decomposition",
    ],
    "MATH241": [
        "3D",
        "Points, Axes, Spheres, Distance",
        "Vectors",
        "Dot Product",
        "Cross Product",
        "Equations of Lines",
        "Equations of Planes",
        "Basics of Vector Valued Functions",
        "Limits of Vector Valued Functions",
        "Derivatives and Integrals of Vector Valued Functions",
        "Curves and Associated Definitions",
        "Tangents and Normals to Curves",
        "Functions of Several Variables",
        "Partial Derivatives",
        "The Chain Rule",
        "The Directional Derivative",
        "The Gradient",
        "Tangent Plane Approximation",
        "Maxima and Minima",
        "Lagrange Multipliers",
        "Double Integrals",
        "Double Integrals in Polar",
        "Triple Integrals in Rectangular",
        "Triple Integrals in Polar",
        "Triple Integrals in Spherical",
        "Change of Variables",
        "Parametrization of Surfaces",
        "Vector Fields",
        "Line Integrals of Functions and VFs",
        "The Fundamental Theorem of Line Integrals",
        "Greens Theorem",
        "Surface Integrals of Functions",
        "Surface Integrals of Vector Fields",
        "Stokes Theorem",
        "The Divergence Theorem (Gauss's theorem)",
    ],
    "MATH246": [
        "Introduction to First-Order DEs",
        "Linear First-Order DEs",
        "Separable DEs",
        "Graphical Methods",
        "Applications",
        "Approximation Methods",
        "Exact Differential Equations",
        "Introduction to Higher Order Linear Differential Equations",
        "Matrices and Determinants",
        "Homogeneous DEs - Method and Theory",
        "Homogeneous DEs with Constant Coefficients",
        "Nonhomogeneous DEs - Method and Theory",
        "The Method of Undetermined Coefficients",
        "Variation of Parameters",
        "Mechanical Vibrations",
        "Laplace Transforms",
        "Introduction to First Order Systems",
        "Matrix and Vector Essentials",
        "Notations and Theory for Systems",
        "Using Eigenpairs to Construct Solutions",
        "Graphing Solutions",
        "Hamiltonian Systems",
        "Linearization of Nonlinear Systems",
        "Population Dynamics",
    ],
}

In [22]:
def get_truncated_normal(mean: float, sd: float, low: float, upp: float) -> float:
    """Draw a single sample from a normal(mean, sd) restricted to [low, upp].

    scipy's ``truncnorm`` takes its clipping bounds in standard-deviation
    units relative to ``loc``, so the raw bounds are standardised first.
    """
    lower_z = (low - mean) / sd
    upper_z = (upp - mean) / sd
    bounded_normal = truncnorm(lower_z, upper_z, loc=mean, scale=sd)
    return bounded_normal.rvs()


def generate_synthetic_data(
    num_users: int,
    columns: List[str],
    courses: List[str],
    course_topics: Dict[str, List[str]],
    seed=None,
) -> pd.DataFrame:
    """Build one row of synthetic performance data per (user, course, topic).

    Each user gets a base performance level drawn uniformly from [0.6, 1.0).
    That level times 100 is the mean of the truncated-normal draws for the
    course grade (one draw per course) and for the easy/medium/hard
    correctness scores (one set per topic). About 70% of rows are flagged as
    having an upcoming assignment and receive a ``days_to_deadline`` value
    drawn uniformly from [0, 1); the remaining rows get NaN. ``target`` is
    always NaN here and is meant to be filled in afterwards.

    Args:
        num_users: Number of users to simulate; ids run from 1 to ``num_users``.
        columns: Names for the ten output columns, in emission order (user,
            course, course grade, topic, easy, medium, hard, upcoming
            assignment, days to deadline, target).
        courses: Course codes to generate rows for.
        course_topics: Maps each course code to its list of topic names.
        seed: Optional integer passed to ``np.random.seed`` before any draw so
            a run can be reproduced. ``None`` (the default) leaves the global
            random state untouched, matching the previous behaviour.

    Returns:
        A DataFrame with one row per (user, course, topic).

    Raises:
        KeyError: If a course in ``courses`` has no entry in ``course_topics``.
    """
    if seed is not None:
        # np.random.* and scipy's rvs() (when no random_state is given) both
        # draw from NumPy's global RandomState, so seeding it once here makes
        # the entire dataset repeatable.
        np.random.seed(seed)

    data = []

    for user in range(1, num_users + 1):
        # Base performance for the user, shared by all of their courses/topics.
        user_performance = np.random.uniform(0.6, 1.0)
        mean_score = user_performance * 100

        for course in courses:
            # One grade per (user, course); repeated on every topic row.
            course_grade = get_truncated_normal(mean_score, 10, 60, 100)

            for topic in course_topics[course]:
                # Harder questions get a wider spread and a lower floor.
                easy_correct = get_truncated_normal(mean_score, 15, 60, 100)
                medium_correct = get_truncated_normal(mean_score, 20, 50, 100)
                hard_correct = get_truncated_normal(mean_score, 25, 40, 100)

                upcoming_assignment = np.random.choice([0, 1], p=[0.3, 0.7])
                # A deadline value only exists when an assignment is upcoming.
                days_to_deadline = (
                    np.random.uniform(0, 1) if upcoming_assignment else np.nan
                )

                data.append(
                    [
                        user,
                        course,
                        course_grade,
                        topic,
                        easy_correct,
                        medium_correct,
                        hard_correct,
                        upcoming_assignment,
                        days_to_deadline,
                        np.nan,  # target placeholder
                    ]
                )

    return pd.DataFrame(data, columns=columns)

In [42]:
# Generate the unlabelled dataset for 50 synthetic users across all configured
# courses and topics; `target` is still NaN after this step.
df = generate_synthetic_data(50, columns, courses, course_topics)

In [43]:
# Preview the first rows to check the generated schema and value ranges.
df.head()

Unnamed: 0,user,course,course_grade,course_topic,easy_correct,medium_correct,hard_correct,upcoming_assignment,days_to_deadline,target
0,1,CMSC320,89.515972,Introduction to Data Science and Data Type,69.06518,94.121975,64.144431,1,0.284417,
1,1,CMSC320,89.515972,Experiment Design,92.452853,54.696491,88.512116,1,0.201891,
2,1,CMSC320,89.515972,Pandas and SQL,99.738995,84.056038,99.828937,1,0.207723,
3,1,CMSC320,89.515972,"Probability, Distributions, and Summary Stats",82.686111,80.484904,99.596845,0,,
4,1,CMSC320,89.515972,Hypothesis Testing,88.862974,99.901244,68.752573,0,,


In [44]:
# Summary statistics of the numeric columns before the target is computed.
df.describe()

Unnamed: 0,user,course_grade,easy_correct,medium_correct,hard_correct,upcoming_assignment,days_to_deadline,target
count,7300.0,7300.0,7300.0,7300.0,7300.0,7300.0,5062.0,0.0
mean,25.5,81.211628,80.685094,77.586944,74.527489,0.693425,0.497603,
std,14.431858,10.951696,10.901934,13.371497,15.756425,0.461103,0.293077,
min,1.0,60.046291,60.022478,50.018703,40.035731,0.0,5e-05,
25%,13.0,72.869084,71.850204,67.136178,62.535673,0.0,0.235668,
50%,25.5,81.174498,80.985069,78.369867,76.073629,1.0,0.498647,
75%,38.0,90.007077,89.708456,88.886129,87.786743,1.0,0.752928,
max,50.0,99.863903,99.99879,99.987762,99.988508,1.0,0.999947,


In [89]:
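# Earlier linear variant of calculate_target, kept commented out for reference
# only; the active definition follows this commented block.
# def calculate_target(row, weights):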
#     dynamic_weights = {
#         key: value * np.random.uniform(0.8, 1.2) for key, value in weights.items()
#     }
#     weights = dynamic_weights
#     base_target = (
#         weights['course_grade'] * row['course_grade'] / 100 +
#         weights['easy_correct'] * row['easy_correct'] / 100 +
#         weights['medium_correct'] * row['medium_correct'] / 100 +
#         weights['hard_correct'] * row['hard_correct'] / 100
#     )

#     if row['upcoming_assignment'] == 1:
#         base_target += weights['upcoming_assignment']
#         if row['days_to_deadline']!= 'NA':
#             base_target += weights['days_to_deadline'] * (1 - float(row['days_to_deadline']))
    
#     noise = np.random.normal(0, 0.1)
#     base_target += noise
#     additional_noise = np.random.uniform(-0.1, 0.1)
#     base_target += additional_noise
#     return min(max(base_target, 0), 1) 

def calculate_target(row, weights):
    """Compute a noisy target score in [0, 1] for one row of the dataset.

    The course grade enters through a square-root transform and the three
    per-difficulty correctness values through a log transform; each is shifted
    by a small negative bias and scaled by its weight. Rows with an upcoming
    assignment get a flat bonus plus a term that grows as ``days_to_deadline``
    approaches 0. Gaussian noise (mean 0, sd 0.1) is added before clamping.

    Args:
        row: Mapping-like record (for example a DataFrame row) with the keys
            'course_grade', 'easy_correct', 'medium_correct', 'hard_correct',
            'upcoming_assignment' and 'days_to_deadline'.
        weights: Mapping with the same six keys giving each term's coefficient.

    Returns:
        The score clamped to the closed interval [0, 1].
    """
    # Apply non-linear transformations with a small negative bias
    course_grade_transformed = np.sqrt(row['course_grade']) / 10 - 0.05
    easy_correct_transformed = np.log(row['easy_correct'] + 1) / 100 - 0.01
    medium_correct_transformed = np.log(row['medium_correct'] + 1) / 100 - 0.01
    hard_correct_transformed = np.log(row['hard_correct'] + 1) / 100 - 0.01

    base_target = (
        weights['course_grade'] * course_grade_transformed +
        weights['easy_correct'] * easy_correct_transformed +
        weights['medium_correct'] * medium_correct_transformed +
        weights['hard_correct'] * hard_correct_transformed
    )

    if row['upcoming_assignment'] == 1:
        base_target += weights['upcoming_assignment']
        days_to_deadline = row['days_to_deadline']
        # Missing deadlines are stored as NaN, not the string 'NA'. The string
        # comparison alone let NaN through, which turned the whole score into
        # NaN (and NaN also survives the min/max clamp below). The 'NA' check
        # is kept so string-encoded data keeps working.
        if days_to_deadline != 'NA' and pd.notna(days_to_deadline):
            base_target += weights['days_to_deadline'] * (1 - float(days_to_deadline))

    noise = np.random.normal(0, 0.1)
    base_target += noise

    return min(max(base_target, 0), 1)

def apply_targets(df: pd.DataFrame, weights: Dict[str, float]) -> pd.DataFrame:
    """Fill the 'target' column of ``df`` row by row and return the same frame.

    The frame is modified in place; the return value is that same object, so
    it can be used either for its side effect or in an assignment.
    """
    def _score(row):
        # Each row is scored independently with the shared weights.
        return calculate_target(row, weights)

    row_scores = df.apply(_score, axis=1)
    df['target'] = row_scores
    return df

In [94]:
# Coefficients for the terms of the target score, keyed by the column each
# one applies to.
weights = dict(
    course_grade=0.35,
    easy_correct=0.15,
    medium_correct=0.25,
    hard_correct=0.3,
    upcoming_assignment=0.15,
    days_to_deadline=0.15,
)

In [47]:
# Compute the `target` column for every row using the configured weights.
# NOTE(review): this cell's execution count is lower than those of the cells
# that define calculate_target and weights, so the outputs displayed after it
# were probably produced by older definitions — re-run top to bottom to confirm.
df = apply_targets(df, weights)

In [48]:
# Preview the first rows again, now with `target` populated.
df.head()

Unnamed: 0,user,course,course_grade,course_topic,easy_correct,medium_correct,hard_correct,upcoming_assignment,days_to_deadline,target
0,1,CMSC320,89.515972,Introduction to Data Science and Data Type,69.06518,94.121975,64.144431,1,0.284417,0.828643
1,1,CMSC320,89.515972,Experiment Design,92.452853,54.696491,88.512116,1,0.201891,0.849881
2,1,CMSC320,89.515972,Pandas and SQL,99.738995,84.056038,99.828937,1,0.207723,0.923256
3,1,CMSC320,89.515972,"Probability, Distributions, and Summary Stats",82.686111,80.484904,99.596845,0,,0.671155
4,1,CMSC320,89.515972,Hypothesis Testing,88.862974,99.901244,68.752573,0,,0.644768


In [50]:
# Summary statistics including the newly computed `target` column.
df.describe()

Unnamed: 0,user,course_grade,easy_correct,medium_correct,hard_correct,upcoming_assignment,days_to_deadline,target
count,7300.0,7300.0,7300.0,7300.0,7300.0,7300.0,5062.0,7300.0
mean,25.5,81.211628,80.685094,77.586944,74.527489,0.693425,0.497603,0.728607
std,14.431858,10.951696,10.901934,13.371497,15.756425,0.461103,0.293077,0.114738
min,1.0,60.046291,60.022478,50.018703,40.035731,0.0,5e-05,0.412379
25%,13.0,72.869084,71.850204,67.136178,62.535673,0.0,0.235668,0.642173
50%,25.5,81.174498,80.985069,78.369867,76.073629,1.0,0.498647,0.750892
75%,38.0,90.007077,89.708456,88.886129,87.786743,1.0,0.752928,0.819831
max,50.0,99.863903,99.99879,99.987762,99.988508,1.0,0.999947,0.980245


In [38]:
# Write the labelled dataset to disk, replacing any existing file; the row
# index is not written.
df.to_csv("synthetic_data.csv", mode="w", index=False)

In [95]:
# Regenerate the dataset from scratch, label it, and summarise the result.
# NOTE(review): this repeats the generate / apply / describe steps from the
# earlier cells; the random draws are new, so the statistics will differ
# from the earlier outputs.
df = generate_synthetic_data(50, columns, courses, course_topics)
df = apply_targets(df, weights)
df.describe()

Unnamed: 0,user,course_grade,easy_correct,medium_correct,hard_correct,upcoming_assignment,days_to_deadline,target
count,7300.0,7300.0,7300.0,7300.0,7300.0,7300.0,5047.0,7300.0
mean,25.5,78.854356,79.209022,76.340501,73.096683,0.69137,0.507029,0.471289
std,14.431858,10.340504,10.785782,13.391382,15.890059,0.46196,0.288277,0.149529
min,1.0,60.011168,60.001141,50.004627,40.030664,0.0,0.000369,0.0
25%,13.0,70.299855,70.147568,65.764839,60.835293,0.0,0.263418,0.364784
50%,25.5,78.229828,78.973963,76.806423,74.163785,1.0,0.504142,0.483302
75%,38.0,87.716967,88.028858,87.629554,86.106202,1.0,0.757019,0.58145
max,50.0,99.872027,99.996753,99.978954,99.997927,1.0,0.999994,0.919125


In [96]:
# Save the regenerated, labelled dataset, overwriting the file written earlier.
df.to_csv("synthetic_data.csv", mode="w", index=False)