In [126]:
"""
Student Performance Dataset Analysis
Dataset Information
Student Performance Dataset (StudentsPerformance.csv)

1000 students with academic performance data
8 columns: gender, race/ethnicity, parental level of education, lunch, test preparation course, math score, reading score, writing score
Score Range: 0-100 for each subject
Demographics: Male/Female, 5 ethnic groups, 6 education levels, standard/free lunch, test prep completed/none
"""

'\nStudent Performance Dataset Analysis\nDataset Information\nStudent Performance Dataset (StudentsPerformance.csv)\n\n1000 students with academic performance data\n8 columns: gender, race/ethnicity, parental level of education, lunch, test preparation course, math score, reading score, writing score\nScore Range: 0-100 for each subject\nDemographics: Male/Female, 5 ethnic groups, 6 education levels, standard/free lunch, test prep completed/none\n'

In [127]:
#Task 1.1: Data Loading and Exploration
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from google.colab import drive
import warnings
warnings.filterwarnings('ignore')

# Load the dataset
drive.mount('/content/drive')
file_path ='/content/drive/MyDrive/dataset/StudentsPerformance.csv'
columns = ['gender', 'race/ethnicity', 'parental level of education', 'lunch', 'test preparation course', 'math score','reading score', 'writing score']
df = pd.read_csv(file_path, names=columns)
print('Dataset loaded successfully, Dataset:\n', df.shape)

# Your tasks:
# 1. Display first 10 rows
# 2. Show dataset shape and info()
# 3. Display unique values in each categorical column
# 4. Show basic statistics for all score columns

# Your code here:

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Dataset loaded successfully, Dataset:
 (1001, 8)


In [128]:
#first 10 row
df.head(10)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
1,female,group B,bachelor's degree,standard,none,72,72,74
2,female,group C,some college,standard,completed,69,90,88
3,female,group B,master's degree,standard,none,90,95,93
4,male,group A,associate's degree,free/reduced,none,47,57,44
5,male,group C,some college,standard,none,76,78,75
6,female,group B,associate's degree,standard,none,71,83,78
7,female,group B,some college,standard,completed,88,95,92
8,male,group B,some college,free/reduced,none,40,43,39
9,male,group D,high school,free/reduced,completed,64,64,67


In [129]:
#2. Shape
df.shape

(1001, 8)

In [130]:
#info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1001 entries, 0 to 1000
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1001 non-null   object
 1   race/ethnicity               1001 non-null   object
 2   parental level of education  1001 non-null   object
 3   lunch                        1001 non-null   object
 4   test preparation course      1001 non-null   object
 5   math score                   1001 non-null   object
 6   reading score                1001 non-null   object
 7   writing score                1001 non-null   object
dtypes: object(8)
memory usage: 62.7+ KB


In [131]:
#4.Basic statistics(mean, median, std, min, max)
df = df.iloc[1:].reset_index(drop=True)
df['math score']= df['math score'].astype(int)
df['reading score']= df['reading score'].astype(int)
df['writing score']= df['writing score'].astype(int)

math = df['math score'].values
read = df['reading score'].values
write = df['writing score'].values

df.dtypes
df.head()

print('Statistics math-mean:', round(np.mean(math),2))
print('Statistics math-median:', round(np.median(math),2))
print('Statistics math-std:', round(np.std(math),2))
print('Statistics math-min/max:', np.min(math), np.max(math))

print('Statistics read-mean:', round(np.mean(read),2))
print('Statistics read-median:', round(np.median(read),2))
print('Statistics read-std:', round(np.std(read),2))
print('Statistics math-min/max:', np.min(read), np.max(read))

print('Statistics write-mean:', round(np.mean(write),2))
print('Statistics write-median:', round(np.median(write),2))
print('Statistics write-std:', round(np.std(write),2))
print('Statistics math-min/max:', np.min(write), np.max(write))

Statistics math-mean: 66.09
Statistics math-median: 66.0
Statistics math-std: 15.16
Statistics math-min/max: 0 100
Statistics read-mean: 69.17
Statistics read-median: 70.0
Statistics read-std: 14.59
Statistics math-min/max: 17 100
Statistics write-mean: 68.05
Statistics write-median: 69.0
Statistics write-std: 15.19
Statistics math-min/max: 10 100


In [132]:
#Task 1.2: NumPy Array Operations
# Convert score columns to numpy arrays
math_scores = df['math score'].values
reading_scores = df['reading score'].values
writing_scores = df['writing score'].values

# Your tasks:
# 1. Calculate mean, median, std, min, max for each subject using NumPy
# 2. Find total students who scored above 80 in math
# 3. Find total students who scored below 50 in any subject
# 4. Calculate the overall average score across all three subjects

# Expected output example:
# Math - Mean: 66.1, Median: 66.0, Std: 15.2
# Students with math > 80: 132
# Students with any score < 50: 178

# Your code here:


In [133]:
#1.2 students who score more than 80 in math
high_math_scores = (math> 80)
print('student who has high math scores:', np.sum(high_math_scores))

#poorest student who have less than 50 numbers in any subject
below_numbers = (math<50) & (read<50) & (write<50)
print('student who have less than 50 numbers in any subject:', np.sum(below_numbers))

#Average score
average_score = np.array([math,read,write])
mean_three_columns = np.mean(average_score)
print('Average mean of read, write and math is:', round(mean_three_columns, 2))

student who has high math scores: 176
student who have less than 50 numbers in any subject: 55
Average mean of read, write and math is: 67.77


In [134]:
# Task 2.1: Identify and Handle Missing Data
# Check the dataset for any missing values
# Your tasks:
# 1. Check for null values in each column using .isnull().sum()
# 2. Check for any impossible values (scores > 100 or < 0)
# 3. Display summary of data quality

# Note: This dataset is clean, but let's verify!

# Your code here:


In [135]:
#2.1 checking missing values
print(df.isnull().sum())

gender                         0
race/ethnicity                 0
parental level of education    0
lunch                          0
test preparation course        0
math score                     0
reading score                  0
writing score                  0
dtype: int64


In [136]:
#2.2 checking impossible values
imp_values_math = (math>100) | (math<0)
print('Impossible values for math:',(imp_values_math).sum())

imp_values_read = (read>100) | (read<0)
print('Impossible values for read:',(imp_values_read).sum())

imp_values_write = (write>100) | (write<0)
print('Impossible values for write:',(imp_values_write).sum())

Impossible values for math: 0
Impossible values for read: 0
Impossible values for write: 0


In [137]:
#checking data quality
print(df.duplicated().sum())

0


In [138]:
#Task 2.2: Create and Fix Missing Value
# Artificially create missing data for practice
import numpy as np

# Your tasks:
# 1. Make a copy of the dataset: df_practice = df.copy()
# 2. Randomly set 50 math scores to np.nan
# 3. Randomly set 30 writing scores to np.nan
# 4. Fill missing math scores with the MEDIAN math score
# 5. Fill missing writing scores with the MEAN writing score
# 6. Verify no missing values remain

# Steps to follow:
# Step 1: Create copy and show original missing values
df_practice = df.copy()
print("Original missing values:", df_practice.isnull().sum())

# Step 2: Create missing data
np.random.seed(42)  # For reproducible results
missing_math_idx = np.random.choice(df_practice.index, 50, replace=False)
missing_writing_idx = np.random.choice(df_practice.index, 30, replace=False)

# Continue the task...
# Your code here:


Original missing values: gender                         0
race/ethnicity                 0
parental level of education    0
lunch                          0
test preparation course        0
math score                     0
reading score                  0
writing score                  0
dtype: int64


In [139]:
#Create copy and show original missing values
df_practice = df.copy()
print("Original missing values:", df_practice.isnull().sum())

Original missing values: gender                         0
race/ethnicity                 0
parental level of education    0
lunch                          0
test preparation course        0
math score                     0
reading score                  0
writing score                  0
dtype: int64


In [140]:
#Create missing data
np.random.seed(42)  # For reproducible results
missing_math_idx = np.random.choice(df_practice.index, 4, replace=False)
missing_writing_idx = np.random.choice(df_practice.index, 30, replace=False)

In [141]:
df['math score'] = df['math score'].replace(0, np.nan)
df['writing score'] = df['writing score'].replace(0, np.nan)
print("Missing values after replacement:", df['math score'].isnull().sum())
print("Missing values after replacement:", df['writing score'].isnull().sum())

Missing values after replacement: 1
Missing values after replacement: 0


In [142]:
#Task 3.1:Answer Simple Questions About Student Performance
# Your tasks:
# 1. Display first 5 rows using .head()
# 2. Display last 5 rows using .tail()
# 3. Show dataset shape using .shape
# 4. Show column names using .columns
# 5. Show data types using .dtypes
# 6. Show basic info using .info()

# Your code here:
print(df_practice.head())
print(df_practice.tail())
print(df_practice.shape)
print(df_practice.columns)
print(df_practice.dtypes)
print(df_practice.info())

   gender race/ethnicity parental level of education         lunch  \
0  female        group B           bachelor's degree      standard   
1  female        group C                some college      standard   
2  female        group B             master's degree      standard   
3    male        group A          associate's degree  free/reduced   
4    male        group C                some college      standard   

  test preparation course  math score  reading score  writing score  
0                    none          72             72             74  
1               completed          69             90             88  
2                    none          90             95             93  
3                    none          47             57             44  
4                    none          76             78             75  
     gender race/ethnicity parental level of education         lunch  \
995  female        group E             master's degree      standard   
996    male    

In [151]:
#Task 3.2: Basic Groupby Analysis
# Comprehensive groupby analysis
# Your tasks:

# 1. Group by 'gender' and calculate:
#    - Average scores in all subjects
#    - Count of students

# 2. Group by 'test preparation course' and show:
#    - Mean scores for each subject
#    - Show which subject benefits most from test prep

# 3. Group by 'parental level of education' and calculate:
#    - Average math score for each education level
#    - Rank education levels by math performance

# 4. Group by 'lunch' type and show:
#    - Average scores for each subject
#    - Count of students in each lunch category

# Your code here:
math_scores = df['math score'].values
reading_scores = df['reading score'].values
writing_scores = df['writing score'].values
summary = df.groupby('gender')[['math score', 'reading score', 'writing score']].mean()
print("Summary by gender:")
print(round(summary,2))

Summary by gender:
        math score  reading score  writing score
gender                                          
female       63.76          72.61          72.47
male         68.73          65.47          63.31


In [169]:
#1 Average score
average_score = np.array([math,read,write])
mean_three_columns = np.mean(average_score)
print('Average scores of read, write and math is:', round(mean_three_columns, 2))

#count
gender = df['gender'].count()
print('Total number of students:', gender)

Average scores of read, write and math is: 67.77
Total number of students: 1000


Total number of students: 1000


In [145]:
#Task 3.3: Use your data analysis skills to answer these easy questions:

# 1. Who performs better in math - males or females?
#    - Calculate average math score by gender
#    - Show the difference between male and female average

# 2. Which subject do students perform best in overall?
#    - Calculate the overall average for math, reading, and writing
#    - Rank the subjects from highest to lowest average score

# 3. What's the impact of parental education?
#    - Find the average total score for each parental education level
#    - Identify which education level leads to highest student performance
