In [1]:
import pandas as pd

# Read the student performance CSV (path is relative to the working directory)
csv_path = "StudentsPerformance.csv"
df = pd.read_csv(csv_path)

# Preview the first five rows
df.head(5)


Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [2]:
# Print the frame's schema summary: index range, per-column non-null counts
# and dtypes, and approximate memory usage
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race/ethnicity               1000 non-null   object
 2   parental level of education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test preparation course      1000 non-null   object
 5   math score                   1000 non-null   int64 
 6   reading score                1000 non-null   int64 
 7   writing score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB


In [3]:
# Number of missing values in each column
df.isna().sum()


gender                         0
race/ethnicity                 0
parental level of education    0
lunch                          0
test preparation course        0
math score                     0
reading score                  0
writing score                  0
dtype: int64

In [4]:
# Flag every row that exactly repeats an earlier row, then count the flags
is_repeat = df.duplicated()
duplicate_count = is_repeat.sum()
print("Duplicate rows:", duplicate_count)


Duplicate rows: 0


In [5]:
# Recode two categorical columns:
#   - parental education: six original levels collapsed into three tiers
#   - race/ethnicity: lettered groups relabelled with numbers
recodings = {
    'parental level of education': {
        "some high school": "High School",
        "high school": "High School",
        "some college": "College",
        "associate's degree": "College",
        "bachelor's degree": "University",
        "master's degree": "University",
    },
    'race/ethnicity': {
        "group A": "Group 1",
        "group B": "Group 2",
        "group C": "Group 3",
        "group D": "Group 4",
        "group E": "Group 5",
    },
}

# Values that are not keys of a mapping pass through replace() unchanged
for column, mapping in recodings.items():
    df[column] = df[column].replace(mapping)

# Preview the recoded columns
df[['race/ethnicity', 'parental level of education']].head()


Unnamed: 0,race/ethnicity,parental level of education
0,Group 2,University
1,Group 3,College
2,Group 2,University
3,Group 1,College
4,Group 3,College


In [6]:
# Row-wise mean of the three subject scores, stored as a new column
score_columns = ['math score', 'reading score', 'writing score']
df['average score'] = df[score_columns].mean(axis='columns')
df[score_columns + ['average score']].head()


Unnamed: 0,math score,reading score,writing score,average score
0,72,72,74,72.666667
1,69,90,88,82.333333
2,90,95,93,92.666667
3,47,57,44,49.333333
4,76,78,75,76.333333


In [None]:
# Define performance category
def categorize(avg, high=80, medium=60):
    """Label an average score as 'High', 'Medium' or 'Low'.

    Args:
        avg: The score to classify (a single number).
        high: Inclusive lower bound of the 'High' band. Defaults to 80.
        medium: Inclusive lower bound of the 'Medium' band. Defaults to 60.

    Returns:
        'High' when ``avg >= high``, 'Medium' when ``medium <= avg < high``,
        and 'Low' otherwise.

    Raises:
        ValueError: If ``medium`` is greater than ``high``, which would make
            the 'Medium' band unreachable.

    Note:
        A NaN score fails both comparisons and is therefore labelled 'Low'.
    """
    if medium > high:
        raise ValueError("medium threshold must not exceed high threshold")

    if avg >= high:
        return 'High'
    elif avg >= medium:
        return 'Medium'
    else:
        return 'Low'

# Label each student by passing their average score through categorize()
averages = df['average score']
df['performance category'] = averages.map(categorize)
df.loc[:, ['average score', 'performance category']].head()


Unnamed: 0,average score,performance category
0,72.666667,Medium
1,82.333333,High
2,92.666667,High
3,49.333333,Low
4,76.333333,Medium


In [8]:
# Mean of the average score for each test-preparation status
scores_by_prep = df.groupby('test preparation course')['average score']
prep_effectiveness = scores_by_prep.mean()
print("Preparation Effectiveness:\n", prep_effectiveness)


Preparation Effectiveness:
 test preparation course
completed    72.669460
none         65.038941
Name: average score, dtype: float64
