In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

## Data issues

* Duplicates and unwanted observations
* Missing values
* Structural issues
* Outliers (maths)

In [6]:
data = pd.read_csv("../Data Analysis/Data/students_data.csv", index_col = 0, sep=",")

In [7]:
data.head()

Unnamed: 0,names,admission number,house,balance,english,kiswahili,mathematics,science,sst/cre,Creative Arts,music
0,"JERIEL NDEDA, OBURA",13259.0,,,81.0,39.0,50.0,30.0,59.0,99%,80%
1,"MUKUHA TIMOTHY, KAMAU",13243.0,,,85.0,74.0,68.0,49.0,78.0,38%,86%
2,"JOB, NGARA",13307.0,,,54.0,49.0,53.0,59.0,72.0,86%,62%
3,"CHEGE DAVID, KAMAU",13258.0,,,71.0,97.0,92.0,41.0,81.0,77%,80%
4,"RAMADHAN MUSA, TEPO",13363.0,,,40.0,84.0,74.0,82.0,89.0,64%,46%


In [9]:
# brief preview of the data
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 147 entries, 0 to 146
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   names             147 non-null    object 
 1   admission number  124 non-null    float64
 2   house             26 non-null     object 
 3   balance           58 non-null     object 
 4   english           121 non-null    float64
 5   kiswahili         119 non-null    float64
 6   mathematics       130 non-null    float64
 7   science           117 non-null    float64
 8   sst/cre           132 non-null    float64
 9   Creative Arts     143 non-null    object 
 10  music             147 non-null    object 
dtypes: float64(6), object(5)
memory usage: 13.8+ KB


In [10]:
# shape
data.shape

(147, 11)

In [11]:
# columns
data.columns

Index(['names', 'admission number', 'house', 'balance', 'english', 'kiswahili',
       'mathematics', 'science', 'sst/cre', 'Creative Arts', 'music'],
      dtype='object')

###  Duplicates and unwanted observations

In [14]:
# check for any duplicates
data.duplicated(subset=None, keep='first').any()

True

In [15]:
# number of duplicates
data.duplicated().value_counts()

False    139
True       8
dtype: int64

In [16]:
data.drop_duplicates(subset=None, keep='first', inplace=True)

In [17]:
data.shape

(139, 11)

In [18]:
data.drop(columns = ['house'], inplace = True)

## Missing Values

**Handling The Values:**
Always handle missing values column by column not multiple coz for e.g mean can be different.

* Replace:
    ```
    * with actual values
    * with mean for small numbers.
    *  with mode(categorical data)
    ```
* Drop:
    ```
    * - % of missing values is high,
    * unneccesary column
    ```

In [20]:
# checking for any missing values~ alias .isna
data.isnull().any()

names               False
admission number     True
balance              True
english              True
kiswahili            True
mathematics          True
science              True
sst/cre              True
Creative Arts        True
music               False
dtype: bool

In [21]:
data["english"].isnull().any()

True

In [23]:
# number of missing values and sort
data.isnull().sum().sort_values(ascending = False)

balance             82
science             30
kiswahili           28
english             26
admission number    23
mathematics         17
sst/cre             15
Creative Arts        4
music                0
names                0
dtype: int64

In [24]:
(data.isnull().sum()/ len(data)).sort_values(ascending = False)

balance             0.589928
science             0.215827
kiswahili           0.201439
english             0.187050
admission number    0.165468
mathematics         0.122302
sst/cre             0.107914
Creative Arts       0.028777
music               0.000000
names               0.000000
dtype: float64

In [25]:
data.dtypes

names                object
admission number    float64
balance              object
english             float64
kiswahili           float64
mathematics         float64
science             float64
sst/cre             float64
Creative Arts        object
music                object
dtype: object

In [28]:
# Handling missings~assumming the students with NaNs have no balances (0)
# regex= True; treat as string
# np.NaN looking for missing values
data["balance"].replace(to_replace = np.NaN, value = "0", inplace = True, regex = True)

In [29]:
data.isnull().sum()

names                0
admission number    23
balance              0
english             26
kiswahili           28
mathematics         17
science             30
sst/cre             15
Creative Arts        4
music                0
dtype: int64

In [30]:
subjects_num = ['english', 'kiswahili',
       'mathematics', 'science', 'sst/cre']

for col in subjects_num:
    data[col].replace(to_replace = np.NaN, value = data[col].mean(), inplace = True)

In [31]:
data.isnull().sum()

names                0
admission number    23
balance              0
english              0
kiswahili            0
mathematics          0
science              0
sst/cre              0
Creative Arts        4
music                0
dtype: int64

In [35]:
data["Creative Arts"] = data["Creative Arts"].str.strip("%")
data["music"] = data["music"].str.strip("%")

In [36]:
data.head()

Unnamed: 0,names,admission number,balance,english,kiswahili,mathematics,science,sst/cre,Creative Arts,music
0,"JERIEL NDEDA, OBURA",13259.0,0,81.0,39.0,50.0,30.0,59.0,99,80
1,"MUKUHA TIMOTHY, KAMAU",13243.0,0,85.0,74.0,68.0,49.0,78.0,38,86
2,"JOB, NGARA",13307.0,0,54.0,49.0,53.0,59.0,72.0,86,62
3,"CHEGE DAVID, KAMAU",13258.0,0,71.0,97.0,92.0,41.0,81.0,77,80
4,"RAMADHAN MUSA, TEPO",13363.0,0,40.0,84.0,74.0,82.0,89.0,64,46


In [46]:
# missing values in creative arts
data["Creative Arts"].replace(to_replace = np.NaN, value = "0", inplace = True, regex = True)

In [47]:
data["Creative Arts"].astype(int)

0      99
1      38
2      86
3      77
4      64
       ..
142    99
143    56
144    49
145    88
146    76
Name: Creative Arts, Length: 139, dtype: int32

In [48]:
data["Creative Arts"].replace("&","", inplace =True, regex = True)

In [49]:
data["Creative Arts"].astype(int)

0      99
1      38
2      86
3      77
4      64
       ..
142    99
143    56
144    49
145    88
146    76
Name: Creative Arts, Length: 139, dtype: int32

In [50]:
data.isnull().sum()

names                0
admission number    23
balance              0
english              0
kiswahili            0
mathematics          0
science              0
sst/cre              0
Creative Arts        0
music                0
dtype: int64

In [51]:
data.dtypes

names                object
admission number    float64
balance              object
english             float64
kiswahili           float64
mathematics         float64
science             float64
sst/cre             float64
Creative Arts        object
music                object
dtype: object

In [52]:
data["Creative Arts"].replace(to_replace = 0,value = data["Creative Arts"].mean(), inplace =True, regex = True)

In [53]:
data["Creative Arts"]

0      99
1      38
2      86
3      77
4      64
       ..
142    99
143    56
144    49
145    88
146    76
Name: Creative Arts, Length: 139, dtype: object