# Statistics

In [1]:
import numpy as np
import pandas as pd
import matplotlib as plt
# import scipy.stats as stats

In [2]:
#Setting a random seed for reproducibility
np.random.seed(2)

In [11]:
#Simulating a dataset for AI Engineering students

#1. Traditional learning: classroom based
# Creating a 25 hours/week and standard deviation of 5 hours
traditional_study_hours = np.random.normal(25, 5, 100) #creates 100 datapoints

#2. Accelerated learning (project-based and hands-on style)
#This will be a 35 hours/week and a standard deviation 8 hours
accelerated_study_hours = np.random.normal(35, 8, 100) #this creates a normal distribution with the given datapoints

#Generating  corresponding performance scores between 0 - 100
#Letting the performance coorelate with study hours but has some randomness

traditional_scores = np.random.normal(75, 12, 100)
accelerated_scores = np.random.normal(35, 8, 100)

# Generating project completion counts
traditional_projects = np.random.poisson(8, 100)
accelerated_projects = np.random.poisson(12, 100)

In [12]:
#Creating a DataFrame

data = pd.DataFrame({
    "Study_Hours_Per_Week": np.concatenate([traditional_study_hours, accelerated_study_hours]),
    'Performance_Score': np.concatenate([traditional_scores, accelerated_scores]),
    "Projects_Completed": np.concatenate([traditional_projects, accelerated_projects]),
    "Learning_Track": ["Traditional"]*100 + ["Accelerated"]*100
})

In [13]:
data.head()

Unnamed: 0,Study_Hours_Per_Week,Performance_Score,Projects_Completed,Learning_Track
0,25.014437,73.832965,11,Traditional
1,10.016192,87.24014,7,Traditional
2,16.606451,75.694567,5,Traditional
3,30.609433,67.958785,7,Traditional
4,18.544255,73.055766,8,Traditional


In [14]:
data.tail()

Unnamed: 0,Study_Hours_Per_Week,Performance_Score,Projects_Completed,Learning_Track
195,29.889402,27.578567,15,Accelerated
196,45.258921,38.564314,10,Accelerated
197,37.793261,32.408612,15,Accelerated
198,38.044391,49.434497,22,Accelerated
199,22.181486,35.771171,12,Accelerated


In [15]:
# Cleaning the data to keep it in range or in realistic bounds by clippping it
#this clips the study hours to a range of 10 to 60 and keeps it a d.p
data["Study_Hours_Per_Week"] = round(np.clip(data["Study_Hours_Per_Week"], 10, 60), 1)

data["Performance_Score"] = round(np.clip(data["Performance_Score"], 0, 100), 1)
data["Projects_Completed"] = np.clip(data["Projects_Completed"], 1, 25)
data.tail()

Unnamed: 0,Study_Hours_Per_Week,Performance_Score,Projects_Completed,Learning_Track
195,29.9,27.6,15,Accelerated
196,45.3,38.6,10,Accelerated
197,37.8,32.4,15,Accelerated
198,38.0,49.4,22,Accelerated
199,22.2,35.8,12,Accelerated
