# Descriptive Statistics

### Read a CSV file

In [1]:
import pandas as pd
import numpy as np

# Read the student performance records from the CSV in the working directory
df = pd.read_csv(filepath_or_buffer='StudentsPerformance.csv')

# Preview the leading rows (five is the default for head)
df.head(n=5)


Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


### Calculate mean, median, variance, and standard deviation for all three scores

In [7]:
# Columns holding the three exam scores
score_columns = ['math score', 'reading score', 'writing score']

# Build a table with one row per statistic and one column per exam.
# pandas' 'var' and 'std' use the sample estimators (ddof=1) by default.
summary_funcs = ['mean', 'median', 'var', 'std']
score_frame = df[score_columns]
desc_stats = score_frame.agg(summary_funcs)

# Caption first, then the table on the following line
print("Descriptive Statistics for Math, Reading, and Writing Scores:", desc_stats, sep="\n")


Descriptive Statistics for Math, Reading, and Writing Scores:
        math score  reading score  writing score
mean     66.089000      69.169000      68.054000
median   66.000000      70.000000      69.000000
var     229.918998     213.165605     230.907992
std      15.163080      14.600192      15.195657


### Create summary tables showing average scores per category (e.g., lunch type, test prep)

In [8]:
# Exam score columns summarised in this cell
score_columns = ['math score', 'reading score', 'writing score']

# Mean exam scores for every level of each grouping column
avg_by_lunch, avg_by_test_prep = (
    df.groupby(group_key)[score_columns].mean()
    for group_key in ('lunch', 'test preparation course')
)

# Lunch-type table; the trailing "\n" argument leaves two blank lines after it
print("Average Scores by Lunch Type:", avg_by_lunch, "\n", sep="\n")

# Test-preparation table
print("Average Scores by Test Preparation Course:", avg_by_test_prep, sep="\n")


Average Scores by Lunch Type:
              math score  reading score  writing score
lunch                                                 
free/reduced   58.921127      64.653521      63.022535
standard       70.034109      71.654264      70.823256


Average Scores by Test Preparation Course:
                         math score  reading score  writing score
test preparation course                                          
completed                 69.695531      73.893855      74.418994
none                      64.077882      66.534268      64.504673


### Calculate coefficient of variation for math, reading, and writing scores.

In [9]:
# The coefficient of variation expresses spread relative to the mean:
# CV = standard deviation / mean, computed per score column
score_means = df[score_columns].mean()
score_stds = df[score_columns].std()
cv_scores = score_stds.div(score_means)

# Scale the ratio to a percentage so the values are easier to compare
cv_percent = cv_scores.mul(100)

print("Coefficient of Variation for Scores (in %):", cv_percent.round(2), sep="\n")


Coefficient of Variation for Scores (in %):
math score       22.94
reading score    21.11
writing score    22.33
dtype: float64


### Rank top 3 factors associated with high performance (based on group averages).

In [10]:
# Step 1: per-student mean of the three exam scores, stored as a new column
exam_columns = ['math score', 'reading score', 'writing score']
df['average score'] = df[exam_columns].mean(axis=1)

# Step 2: categorical columns whose groups are compared
categorical_features = ['gender', 'race/ethnicity', 'parental level of education', 'lunch', 'test preparation course']

# Step 3: for every feature, average the per-student score within each of its
# groups and keep only the highest of those group means
group_max_scores = {}
for feature in categorical_features:
    group_means = df.groupby(feature)['average score'].mean()
    group_max_scores[feature] = group_means.max()

# Step 4: order features from the highest best-group mean to the lowest
sorted_factors = sorted(group_max_scores.items(), key=lambda item: item[1], reverse=True)

# Step 5: report the three leading features
print("Top 3 Factors Associated with High Performance:")
for rank, (feature, score) in enumerate(sorted_factors[:3], start=1):
    print(f"{rank}. {feature} — Highest Group Avg Score: {score:.2f}")


Top 3 Factors Associated with High Performance:
1. parental level of education — Highest Group Avg Score: 73.60
2. race/ethnicity — Highest Group Avg Score: 72.75
3. test preparation course — Highest Group Avg Score: 72.67


### Determine which feature (e.g., lunch type, gender) has the largest score variance.

In [11]:
# Create the per-student average only if it is not already present on the frame
if 'average score' not in df.columns:
    df['average score'] = df[['math score', 'reading score', 'writing score']].mean(axis=1)

# Categorical columns whose groups are compared
categorical_features = ['gender', 'race/ethnicity', 'parental level of education', 'lunch', 'test preparation course']

# For every feature, take the variance of the average score inside each of its
# groups and keep the largest of those within-group variances
feature_variances = {}
for feature in categorical_features:
    within_group_var = df.groupby(feature)['average score'].var()
    feature_variances[feature] = within_group_var.max()

# Select the (feature, variance) pair with the greatest value; the first one wins on ties
max_variance_feature, max_variance_value = max(
    feature_variances.items(), key=lambda pair: pair[1]
)

# Report the winner
print(
    "Feature with the Largest Score Variance:",
    f"{max_variance_feature} — Max Group Variance: {max_variance_value:.2f}",
    sep="\n",
)


Feature with the Largest Score Variance:
parental level of education — Max Group Variance: 224.52
