In [47]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [48]:
# Load the raw student-performance data. The path uses forward slashes so it
# resolves on Windows, Linux and macOS alike and contains no backslash escape
# sequences (same separator style as the export path at the end of the notebook).
df = pd.read_csv("Dataset/StudentsPerformance.csv")

## 1. Dataset Information

5 columns : String <br>
3 columns : Integer


- gender : (female/male)
- race/ethnicity : Groups (A, B, C, D, E)	
- parental level of education : (bachelor's degree, some college, master's degree, associate's degree, high school, some high school)
- lunch : (standard, free/reduced)
- test preparation course : (none/completed)	
- math score	
- reading score	
- writing score

In [49]:
# Preview the first five rows of the loaded frame.
df.head(5)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [50]:
# (rows, columns) of the loaded frame.
df.shape

(1000, 8)

### 1.1 Changing the names of the columns for better EDA

In [51]:
# Map every header that contains a space or "/" to a snake_case name;
# columns that are not listed keep their current names.
snake_case_names = {
    'race/ethnicity': 'race_ethnicity',
    'parental level of education': 'parental_level_of_education',
    'test preparation course': 'test_preparation_course',
    'math score': 'math_score',
    'reading score': 'reading_score',
    'writing score': 'writing_score',
}
df = df.rename(columns=snake_case_names)

In [52]:
# Confirm the column labels after renaming.
df.columns

Index(['gender', 'race_ethnicity', 'parental_level_of_education', 'lunch',
       'test_preparation_course', 'math_score', 'reading_score',
       'writing_score'],
      dtype='object')

## 2. Data Checks

- Check missing value
- Check duplicates	
- Check data types	
- Check the number of unique values of each column
- Check statistics of dataset
- Check various categories present in the different categorical column

### 2.1 Check Missing Values

In [53]:
# Count the missing entries in each column.
df.isnull().sum()

gender                         0
race_ethnicity                 0
parental_level_of_education    0
lunch                          0
test_preparation_course        0
math_score                     0
reading_score                  0
writing_score                  0
dtype: int64

There are no missing values present in the dataset

### 2.2 Check Duplicates

In [54]:
# Count the rows that repeat an earlier row in full.
df.duplicated().sum()

np.int64(0)

There are no duplicate records present in the dataset.

### 2.3 Check for datatypes and Null values

In [55]:
# Summarise each column's dtype and non-null count, plus memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race_ethnicity               1000 non-null   object
 2   parental_level_of_education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test_preparation_course      1000 non-null   object
 5   math_score                   1000 non-null   int64 
 6   reading_score                1000 non-null   int64 
 7   writing_score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB


### 2.4 Check the number of unique values for each column

In [56]:
# Number of distinct values in each column.
df.nunique()

gender                          2
race_ethnicity                  5
parental_level_of_education     6
lunch                           2
test_preparation_course         2
math_score                     81
reading_score                  72
writing_score                  77
dtype: int64

In [57]:
# Distinct values recorded in the parental_level_of_education column.
df['parental_level_of_education'].unique()

array(["bachelor's degree", 'some college', "master's degree",
       "associate's degree", 'high school', 'some high school'],
      dtype=object)

In [58]:
# Distinct values recorded in the lunch column.
df['lunch'].unique()

array(['standard', 'free/reduced'], dtype=object)

In [59]:
# Distinct values recorded in the race_ethnicity column.
df['race_ethnicity'].unique()

array(['group B', 'group C', 'group A', 'group D', 'group E'],
      dtype=object)

In [60]:
# Distinct values recorded in the test_preparation_course column.
df['test_preparation_course'].unique()

array(['none', 'completed'], dtype=object)

In [61]:
# Distinct values recorded in the gender column.
df['gender'].unique()

array(['female', 'male'], dtype=object)

### 2.5 Check Statistics of Data

In [62]:
# Split the columns into numerical and categorical feature lists.
# select_dtypes is used instead of comparing each dtype to 'O' so that text
# columns stored with pandas' string dtype (rather than object) are still
# treated as categorical instead of falling through to the numerical list.
# Anything that is not a numeric dtype is reported as categorical; column
# order follows the frame's own order.
numerical_columns = df.select_dtypes(include='number').columns.tolist()
categorical_columns = df.select_dtypes(exclude='number').columns.tolist()

print(f"We have {len(numerical_columns)} numerical features : {numerical_columns}")
print(f"We have {len(categorical_columns)} categorical features : {categorical_columns}")

We have 3 numerical features : ['math_score', 'reading_score', 'writing_score']
We have 5 categorical features : ['gender', 'race_ethnicity', 'parental_level_of_education', 'lunch', 'test_preparation_course']


### 2.6 Adding columns for total score and average

In [63]:
# Combined mark over the three subjects, computed once and reused for the mean.
score_sum = df['math_score'].add(df['reading_score']).add(df['writing_score'])

df['total_score'] = score_sum
# Three subjects per student, hence the division by 3.
df['average_score'] = score_sum.div(3)

In [64]:
# Check that total_score and average_score were appended.
df.head(5)

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score,total_score,average_score
0,female,group B,bachelor's degree,standard,none,72,72,74,218,72.666667
1,female,group C,some college,standard,completed,69,90,88,247,82.333333
2,female,group B,master's degree,standard,none,90,95,93,278,92.666667
3,male,group A,associate's degree,free/reduced,none,47,57,44,148,49.333333
4,male,group C,some college,standard,none,76,78,75,229,76.333333


## 3. EDA

Use graphs to gain insights from the data

## 4. Export Data to CSV

In [65]:
# Keep only the aggregate scores: remove the three per-subject columns.
subject_score_columns = ['math_score', 'reading_score', 'writing_score']
df = df.drop(columns=subject_score_columns)

In [66]:
# Preview the frame that is about to be exported.
df.head(5)

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,total_score,average_score
0,female,group B,bachelor's degree,standard,none,218,72.666667
1,female,group C,some college,standard,completed,247,82.333333
2,female,group B,master's degree,standard,none,278,92.666667
3,male,group A,associate's degree,free/reduced,none,148,49.333333
4,male,group C,some college,standard,none,229,76.333333


In [67]:
# Write the reduced frame to disk, leaving the row index out of the file.
output_csv = 'Dataset/StudentsPerformance_modified.csv'
df.to_csv(output_csv, index=False)