# Statistics

In [2]:
import numpy as np
import pandas as pd
import matplotlib as plt
# import scipy.stats as stats

In [3]:
#Setting a random seed for reproducibility
np.random.seed(2)

In [4]:
#Simulating a dataset for AI Engineering students

#1. Traditional learning: classroom based
# Creating a 25 hours/week and standard deviation of 5 hours
traditional_study_hours = np.random.normal(25, 5, 100) #creates 100 datapoints

#2. Accelerated learning (project-based and hands-on style)
#This will be a 35 hours/week and a standard deviation 8 hours
accelerated_study_hours = np.random.normal(35, 8, 100) #this creates a normal distribution with the given datapoints

#Generating  corresponding performance scores between 0 - 100
#Letting the performance coorelate with study hours but has some randomness

traditional_scores = np.random.normal(75, 12, 100)
accelerated_scores = np.random.normal(35, 8, 100)

# Generating project completion counts
traditional_projects = np.random.poisson(8, 100)
accelerated_projects = np.random.poisson(12, 100)

In [5]:
#Creating a DataFrame

data = pd.DataFrame({
    "Study_Hours_Per_Week": np.concatenate([traditional_study_hours, accelerated_study_hours]),
    'Performance_Score': np.concatenate([traditional_scores, accelerated_scores]),
    "Projects_Completed": np.concatenate([traditional_projects, accelerated_projects]),
    "Learning_Track": ["Traditional"]*100 + ["Accelerated"]*100
})

In [6]:
data.head()

Unnamed: 0,Study_Hours_Per_Week,Performance_Score,Projects_Completed,Learning_Track
0,22.916211,61.187965,7,Traditional
1,24.718666,69.88173,12,Traditional
2,14.31902,73.222234,7,Traditional
3,33.201354,93.017243,6,Traditional
4,16.032822,85.435178,8,Traditional


In [7]:
data.tail()

Unnamed: 0,Study_Hours_Per_Week,Performance_Score,Projects_Completed,Learning_Track
195,49.435813,40.701466,11,Accelerated
196,36.443278,33.330371,14,Accelerated
197,39.425314,36.100944,11,Accelerated
198,43.264233,34.173183,8,Accelerated
199,32.367981,41.786805,15,Accelerated


In [8]:
# Cleaning the data to keep it in range or in realistic bounds by clippping it
#this clips the study hours to a range of 10 to 60 and keeps it a d.p
data["Study_Hours_Per_Week"] = round(np.clip(data["Study_Hours_Per_Week"], 10, 60), 1)

data["Performance_Score"] = round(np.clip(data["Performance_Score"], 0, 100), 1)
data["Projects_Completed"] = np.clip(data["Projects_Completed"], 1, 25)
data.tail()

Unnamed: 0,Study_Hours_Per_Week,Performance_Score,Projects_Completed,Learning_Track
195,49.4,40.7,11,Accelerated
196,36.4,33.3,14,Accelerated
197,39.4,36.1,11,Accelerated
198,43.3,34.2,8,Accelerated
199,32.4,41.8,15,Accelerated


In [None]:
# Taking a snapshot of the data 

print(f'Total AI Engineering Students: {len(data)}')
print(f"Learning Tracks: {data["Learning_Track"].unique()}")
print(f"\nFirst 10 students in our dataset:")
print(data.head(10))

Total AI Engineering Students: 200
Learning Tracks: ['Traditional' 'Accelerated']

First 10 students in our dataset:
   Study_Hours_Per_Week  Performance_Score  Projects_Completed Learning_Track
0                  22.9               61.2                   7    Traditional
1                  24.7               69.9                  12    Traditional
2                  14.3               73.2                   7    Traditional
3                  33.2               93.0                   6    Traditional
4                  16.0               85.4                   8    Traditional
5                  20.8               62.0                   6    Traditional
6                  27.5               83.0                   6    Traditional
7                  18.8               83.8                   7    Traditional
8                  19.7               62.3                   7    Traditional
9                  20.5               73.7                  10    Traditional


## Descriptive Statistics for the data

In [None]:
# Getting the description for each learning track by study hours
data.groupby("Learning_Track")["Study_Hours_Per_Week"].describe().round(2)