In [406]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import norm, t

import config

In [407]:
# seaborn plotting style used by this notebook
sns.set_style("whitegrid")
# base directory used to locate the data files, read from the local config module
ROOT_DIR = config.ROOT_DIR

In [408]:
# Load the student grades table. The path is assembled with pathlib so the
# separator matches the host OS (a hard-coded "\" only resolves on Windows).
student_grades = pd.read_csv(Path(ROOT_DIR) / "data" / "student_grades.csv")

In [409]:
# preview the loaded table
student_grades

Unnamed: 0,Student ID,Undergrad Degree,Undergrad Grade,MBA Grade,Work Experience,Employability (Before),Employability (After),Status,Annual Salary
0,1,Business,68.4,90.2,No,252,276,Placed,111000.0
1,2,Business,62.1,92.8,No,423,410,Not Placed,
2,3,Computer Science,70.2,68.7,Yes,101,119,Placed,107000.0
3,4,Engineering,75.1,80.7,No,288,334,Not Placed,
4,5,Finance,60.9,74.9,No,248,252,Not Placed,
...,...,...,...,...,...,...,...,...,...
90,91,Business,76.0,77.9,No,326,369,Placed,99500.0
91,92,Computer Science,67.7,86.1,No,421,457,Placed,107000.0
92,93,Engineering,75.3,89.9,No,368,421,Not Placed,
93,94,Engineering,68.1,83.1,No,279,282,Placed,84000.0


#### Estimate employability of students before they graduate with alpha 0.05

In [410]:
# Sample statistics for the employability score recorded before graduation
employability = student_grades[["Student ID", "Employability (Before)"]]
before_scores = employability["Employability (Before)"]
sample_size = before_scores.count()
sample_std = before_scores.std()  # pandas default: sample std (ddof=1)
sample_mean = before_scores.mean()
sample_mean, sample_std, sample_size

(239.90526315789472, 85.94048827645774, 95)

In [411]:
# Critical t value for a two-sided interval: alpha = 0.05 leaves 0.025 in each tail
alpha = 0.05
t_stat = t.ppf(1 - alpha / 2, df=sample_size - 1)
t_stat

1.9855234417658298

In [412]:
# Margin of error = critical value x standard error; interval = mean +/- margin
standard_error = sample_std / np.sqrt(sample_size)
margin_of_error = t_stat * standard_error
lower_bound = sample_mean - margin_of_error
upper_bound = sample_mean + margin_of_error
confidence_interval = (lower_bound, upper_bound)
confidence_interval

(222.3982913235513, 257.41223499223815)

In [413]:
# Cross-check: scipy builds the t-based interval for the mean in a single call
standard_error = sample_std / np.sqrt(sample_size)
t.interval(
    confidence=0.95,
    df=sample_size - 1,
    loc=sample_mean,
    scale=standard_error,
)

(222.3982913235513, 257.41223499223815)

#### Estimate employability of students after they graduate with alpha 0.1

In [414]:
# Keep only the student id and the employability score recorded after graduation
employability_after = student_grades[["Student ID", "Employability (After)"]]
employability_after

Unnamed: 0,Student ID,Employability (After)
0,1,276
1,2,410
2,3,119
3,4,334
4,5,252
...,...,...
90,91,369
91,92,457
92,93,421
93,94,282


In [415]:
# Sample mean, sample standard deviation and size for the after-graduation score
after_scores = employability_after["Employability (After)"]
sample_std = after_scores.std()  # pandas default: sample std (ddof=1)
sample_size = after_scores.count()
sample_mean = after_scores.mean()
sample_mean, sample_std, sample_size

(289.34736842105264, 93.52104876523815, 95)

In [416]:
# Critical t value for a two-sided interval: alpha = 0.1 leaves 0.05 in each tail
alpha = 0.1
t_stat = t.ppf(1 - alpha / 2, df=sample_size - 1)
t_stat

1.6612258552697985

In [417]:
# Half-width of the interval (critical value x standard error), then mean +/- half-width
standard_error = sample_std / np.sqrt(sample_size)
margin_of_error = t_stat * standard_error
lower_bound, upper_bound = sample_mean - margin_of_error, sample_mean + margin_of_error
confidence_interval = (lower_bound, upper_bound)
confidence_interval

(273.4078114011075, 305.2869254409978)

In [418]:
# Cross-check with scipy, recomputing every input straight from the column
after_scores = employability_after["Employability (After)"]
sample_size = after_scores.count()
t.interval(
    confidence=0.9,
    df=sample_size - 1,
    loc=after_scores.mean(),
    scale=after_scores.std() / np.sqrt(sample_size),
)

(273.4078114011075, 305.2869254409978)

#### Estimate annual salary of graduates with alpha 0.05. No population parameters are given; the client's ask is whether we have enough graduates in our program to estimate the mean graduate salary with 95% confidence

In [419]:
# Keep only the student id and the annual salary (missing for some students)
salary = student_grades[["Student ID", "Annual Salary"]]
salary

Unnamed: 0,Student ID,Annual Salary
0,1,111000.0
1,2,
2,3,107000.0
3,4,
4,5,
...,...,...
90,91,99500.0
91,92,107000.0
92,93,
93,94,84000.0


In [420]:
# Sample statistics for annual salary. pandas skips missing values here, so
# sample_size is the number of students with a recorded salary, not the row count.
annual_salary = salary["Annual Salary"]
sample_size = annual_salary.count()
sample_mean = annual_salary.mean()
sample_std = annual_salary.std()  # pandas default: sample std (ddof=1)
sample_mean, sample_std, sample_size

(119386.7924528302, 45546.95817647072, 53)

In [421]:
# Critical t value for a two-sided 95% interval (alpha = 0.05, 0.025 per tail)
alpha = 0.05
t_stat = t.ppf(1 - alpha / 2, df=sample_size - 1)
t_stat

2.0066468031022113

In [422]:
# Interval for the mean salary: mean +/- (critical value x standard error)
standard_error = sample_std / np.sqrt(sample_size)
margin_of_error = t_stat * standard_error
confidence_interval = tuple(
    sample_mean + sign * margin_of_error for sign in (-1, 1)
)
confidence_interval

(106832.49596474825, 131941.08894091216)

In [423]:
# Cross-check: the same salary interval from scipy in one call
standard_error = sample_std / np.sqrt(sample_size)
t.interval(
    confidence=0.95,
    df=sample_size - 1,
    loc=sample_mean,
    scale=standard_error,
)

(106832.49596474825, 131941.08894091216)

In [424]:
# Encode work experience as a Bernoulli variable: 1 = "Yes" (success), 0 otherwise.
# Testing for the explicit "Yes" label (the same way "Placed" is tested for the
# Status column) keeps missing or unexpected values from being counted as successes.
student_grades["Work Experience Bernoulli"] = np.where(student_grades["Work Experience"] == "Yes", 1, 0)

#### Find proportion of students recruited into the program with work experience with alpha 0.1

In [425]:
# Assuming a binomial model, compute the sample statistics:
#   p           - proportion of students with work experience (probability of success)
#   q           - probability of failure, 1 - p
#   sample_size - number of students observed (n)
experience_flags = student_grades["Work Experience Bernoulli"]
sample_size = experience_flags.count()
p = experience_flags.sum() / sample_size
q = 1 - p
p, q, sample_size

(0.24210526315789474, 0.7578947368421053, 95)

In [426]:
# Critical z value for a two-sided 90% interval (alpha = 0.1, 0.05 per tail) and the
# standard error of the sample proportion, sqrt(p*q/n).
# Note: despite its name, sample_std holds that standard error here.
alpha = 0.1
z_score = norm.ppf(1 - alpha / 2)
sample_std = np.sqrt((p * q) / sample_size)
z_score, sample_std

(1.6448536269514722, 0.04394857098558691)

In [427]:
# 90% confidence interval for the proportion of students with work experience
margin_of_error = z_score * sample_std
lower_bound, upper_bound = p - margin_of_error, p + margin_of_error
confidence_interval = (lower_bound, upper_bound)
confidence_interval

(0.16981629677291787, 0.3143942295428716)

#### Find proportion of students placed out of program with work experience with alpha 0.05

In [428]:
# list the distinct labels present in the Status column
student_grades["Status"].unique()

array(['Placed', 'Not Placed'], dtype=object)

In [429]:
# Bernoulli encoding of the Status column: "Placed" is the success (1), anything else is 0
student_grades["Status Bernoulli"] = (student_grades["Status"] == "Placed").astype(int)

In [430]:
# show the 0/1 encoding next to the original labels as a visual check
student_grades.loc[:, ["Status", "Status Bernoulli"]]

Unnamed: 0,Status,Status Bernoulli
0,Placed,1
1,Not Placed,0
2,Placed,1
3,Not Placed,0
4,Not Placed,0
...,...,...
90,Placed,1
91,Placed,1
92,Not Placed,0
93,Placed,1


In [431]:
# p and q for the Bernoulli "placed" indicator, plus the success/failure counts.
# NOTE(review): the section heading mentions work experience, but this uses every
# student in the frame - confirm which population is intended.
status_flags = student_grades["Status Bernoulli"]
sample_size = status_flags.count()
n_placed = int(status_flags.sum())
n_not_placed = int(sample_size) - n_placed
p = n_placed / sample_size
q = 1 - p
# If both counts shown below are greater than 10, the Central Limit Theorem lets us
# treat the sampling distribution of the sample proportion as normal.
# The counts come straight from the data rather than from n*p and n*q, which pick up
# floating-point noise (e.g. 52.99999999999999 instead of 53).
n_placed, n_not_placed

(52.99999999999999, 42.00000000000001)

In [432]:
# 95% interval for the placed proportion: p +/- (critical z x standard error of p)
alpha = 0.05
z_score = norm.ppf(1 - alpha / 2)
standard_error = np.sqrt((p * q) / sample_size)
margin_of_error = z_score * standard_error
confidence_interval = (p - margin_of_error, p + margin_of_error)
confidence_interval

(0.458026987378912, 0.6577624863052984)