In [2]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
 

In [6]:
# lets set random seed for reproducibility
np.random.seed(42)


In [13]:
# Lets simulate a  dataset for AI Engineering students
#1. Traditional Learning -classroom-based
# - lets create it with a 25 hours/week and standard deviation of 5 hours
traditional_study_hours=np.random.normal(25,5,100)

#. 2. Accelerated Learning (project-based and Hands-on style)
# This one will be a 35 hours/week and a standard deviation of 8 hours
accelerated_study_hours = np.random.normal(35, 8, 100)

# Lets generate corresponding performance scores between 0-100
# We would let the performance correlate with study hours but has some randomness
traditional_scores= np.random.normal(75, 12,100)  # mean of 75, SD of 12
accelerated_scores= np.random.normal(82, 15, 100)  # Mean of 82, SD of 15

# Lets generate project completion counts
traditional_projects = np.random.poisson(8, 100)  # Average 8 projects
accelerated_projects= np.random.poisson(12,100)  # average 12 projects



In [14]:
# Now, lets create the DataFrame
data = pd.DataFrame({
    'Study_Hours_Per_Week': np.concatenate([traditional_study_hours, accelerated_study_hours]),
    'Performance_Score': np.concatenate([traditional_scores, accelerated_scores]),
    'Projects_Completed': np.concatenate([traditional_projects, accelerated_projects]),
    'Learning_Track': ['Traditional'] * 100 + ['Accelerated'] * 100
})

In [16]:
data

Unnamed: 0,Study_Hours_Per_Week,Performance_Score,Projects_Completed,Learning_Track
0,25.922757,74.921744,14,Traditional
1,30.088307,82.185533,8,Traditional
2,26.357477,83.020086,10,Traditional
3,27.757381,66.189914,6,Traditional
4,26.702946,75.983947,9,Traditional
...,...,...,...,...
195,42.176530,95.730854,11,Accelerated
196,45.199000,73.756896,10,Accelerated
197,40.212176,80.248511,12,Accelerated
198,25.898515,72.466632,17,Accelerated


In [17]:
# clean the data - to keep it in range or in realistic bounds by cliping it 
data['Study_Hours_Per_Week']=round(np.clip(data['Study_Hours_Per_Week'],10,60),1) # 10 -60 hours/week
data['Performance_Score']=round(np.clip(data['Performance_Score'],0,100),1) # 0-100 score
data['Projects_Completed']=np.clip(data['Projects_Completed'], 1,25) # 1-25 projects

In [19]:
data

Unnamed: 0,Study_Hours_Per_Week,Performance_Score,Projects_Completed,Learning_Track
0,25.9,74.9,14,Traditional
1,30.1,82.2,8,Traditional
2,26.4,83.0,10,Traditional
3,27.8,66.2,6,Traditional
4,26.7,76.0,9,Traditional
...,...,...,...,...
195,42.2,95.7,11,Accelerated
196,45.2,73.8,10,Accelerated
197,40.2,80.2,12,Accelerated
198,25.9,72.5,17,Accelerated


In [None]:
# Lets take a snapshot of the data
print(f"Total AI Engineering Students: {len(data)}")
print(f"Learning Tracks: {data['Learning_Track'].unique()}")
print("\nFirst 10 students in our dataset:")
print(data.head(10).round(1)) # you can you round as a method, just by the way

Total AI Engineering Students: 200
Learning Tracks: ['Traditional' 'Accelerated']

First 10 students in our dataset:
   Study_Hours_Per_Week  Performance_Score  Projects_Completed Learning_Track
0                  25.9               74.9                  14    Traditional
1                  30.1               82.2                   8    Traditional
2                  26.4               83.0                  10    Traditional
3                  27.8               66.2                   6    Traditional
4                  26.7               76.0                   9    Traditional
5                  27.0               80.5                   7    Traditional
6                  18.4               92.5                   3    Traditional
7                  30.2               83.5                   7    Traditional
8                  30.8               84.5                   8    Traditional
9                  23.9               76.0                   8    Traditional


 - Descriptive statistics for this data

In [25]:
# lets get the description for each learning track by study hours

data.groupby('Learning_Track')['Study_Hours_Per_Week'].describe().round(2)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Learning_Track,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Accelerated,100.0,34.78,8.64,13.9,29.12,34.6,39.92,58.9
Traditional,100.0,25.48,4.63,15.4,22.08,25.3,28.42,39.7
