# Import Libraries

In [1]:
import pandas as pd
import numpy as np

# Loading Dataset

In [2]:
df = pd.read_csv('assessment.csv')

# Seeing basic information about the dataset

In [3]:
df.head()

Unnamed: 0,assessment score 1,assessment score 2
0,37.0,12
1,72.0,9
2,75.0,5
3,79.0,64
4,16.0,1


In [4]:
df.describe()

Unnamed: 0,assessment score 1
count,9996.0
mean,49.433373
std,28.710692
min,0.0
25%,24.0
50%,49.0
75%,74.0
max,99.0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 2 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   assessment score 1  9996 non-null   float64
 1   assessment score 2  10000 non-null  object 
dtypes: float64(1), object(1)
memory usage: 156.4+ KB


In [6]:
df.sample(5,random_state=70)

Unnamed: 0,assessment score 1,assessment score 2
39,83.0,38
1360,37.0,7
3714,84.0,14
5981,83.0,74
1295,66.0,#


# Viewing the missing data in `assessment score 2`

Missing data in this columna re stored as '#'

In [7]:
df.loc[df['assessment score 2'].isin(['#'])]

Unnamed: 0,assessment score 1,assessment score 2
239,17.0,#
396,21.0,#
398,43.0,#
417,7.0,#
484,52.0,#
...,...,...
9357,60.0,#
9507,50.0,#
9638,44.0,#
9816,37.0,#


## Replacing missing data with np.nan

In [8]:
df['assessment score 2'] = df['assessment score 2'].replace({'#':np.nan})
df

Unnamed: 0,assessment score 1,assessment score 2
0,37.0,12
1,72.0,9
2,75.0,5
3,79.0,64
4,16.0,1
...,...,...
9995,44.0,21
9996,71.0,55
9997,80.0,34
9998,11.0,62


### Verifying the missing data

In [9]:
df.isna().sum()

assessment score 1     4
assessment score 2    92
dtype: int64

In [12]:
df.loc[df['assessment score 2'].isin(['#'])]

Unnamed: 0,assessment score 1,assessment score 2


# Option 1: Dropping the missing rows

In [13]:
cleaned_df = df.dropna()
cleaned_df.describe()

Unnamed: 0,assessment score 1
count,9904.0
mean,49.471123
std,28.70159
min,0.0
25%,24.0
50%,49.0
75%,74.0
max,99.0


In [14]:
cleaned_df.isna().sum()

assessment score 1    0
assessment score 2    0
dtype: int64

# Option 2: drop columns

In [16]:
problem_df = pd.read_csv('./assessment_problem.csv')
problem_df.head()

Unnamed: 0,assessment score 1,assessment score 2
0,357,
1,514,
2,686,
3,39,
4,963,


In [17]:
problem_df.isna().sum()

assessment score 1       0
assessment score 2    9890
dtype: int64

98.9% of data in `assessment score 2` is missing, the column can be dropped as it won't serve much purpose in the analysis.

In [18]:
problem_df_cleaned = problem_df.drop('assessment score 2',axis=1)
problem_df_cleaned.head()

Unnamed: 0,assessment score 1
0,357
1,514
2,686
3,39
4,963


In [19]:
problem_df_cleaned.isna().sum()

assessment score 1    0
dtype: int64

# Option 3: impute NaNs

Impuing means replacing missing values with values such as the mean or mode etc.

In [21]:
df = pd.read_csv('assessment.csv')
df['assessment score 2'] = df['assessment score 2'].replace({'#':np.nan})
df['assessment score 2'] = df['assessment score 2'].astype(float)

In [22]:
df.isna().sum()

assessment score 1     4
assessment score 2    92
dtype: int64

## Filling in the missing values with the mean

In [24]:
t_df = df.copy()
t_df['assessment score 2'] = t_df['assessment score 2'].fillna(t_df['assessment score 2'].mean())

In [25]:
t_df.isna().sum()

assessment score 1    4
assessment score 2    0
dtype: int64

checking the stats before imputing the data

In [28]:
cleaned_df.describe()

Unnamed: 0,assessment score 1
count,9904.0
mean,49.471123
std,28.70159
min,0.0
25%,24.0
50%,49.0
75%,74.0
max,99.0


In [29]:
df.describe()

Unnamed: 0,assessment score 1,assessment score 2
count,9996.0,9908.0
mean,49.433373,49.570751
std,28.710692,28.915779
min,0.0,0.0
25%,24.0,24.0
50%,49.0,50.0
75%,74.0,74.0
max,99.0,99.0


# Option 4: Create Bins

Bins basically are ranges which values of a column are placed in, they can be created using the `cut()` function, replacing the value with the range in belongs in.

In [30]:
df['assessment score 1'] = pd.cut(df['assessment score 1'],4)
df['assessment score 2'] = pd.cut(df['assessment score 2'],4)

In [31]:
df['assessment score 2'].value_counts()

assessment score 2
(49.5, 74.25]      2560
(-0.099, 24.75]    2482
(74.25, 99.0]      2475
(24.75, 49.5]      2391
Name: count, dtype: int64

In [32]:
df[df.isnull().any(axis=1)]

Unnamed: 0,assessment score 1,assessment score 2
56,,"(49.5, 74.25]"
152,,"(74.25, 99.0]"
231,,"(-0.099, 24.75]"
239,"(-0.099, 24.75]",
275,,"(24.75, 49.5]"
...,...,...
9357,"(49.5, 74.25]",
9507,"(49.5, 74.25]",
9638,"(24.75, 49.5]",
9816,"(24.75, 49.5]",
