<a href="https://colab.research.google.com/github/MelchiorKaczmarzyk/InternshipTutorialCode/blob/main/basics01/InternshipExcersices02_missingData.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

###Imports

In [160]:
import numpy as np
import pandas as pd
import sklearn

###Defining data

In [161]:
data = {
    "size": ["XL", "L", "M", np.nan, "M", 'M'],
    "color": ["red", "green", "blue", "green", "red", 'green'],
    "gender": ["female", "male", np.nan, "female", "female", 'man'],
    "price": [199.0, 89.0, np.nan, 129.0, 79.0, 89.0],
    "weight": [500, 450, 300, np.nan, 410, np.nan],
    "bought": ["yes", "no", "yes", "no", "yes", 'no']
}

df_raw = pd.DataFrame(data=data)
df_raw

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,yes
1,L,green,male,89.0,450.0,no
2,M,blue,,,300.0,yes
3,,green,female,129.0,,no
4,M,red,female,79.0,410.0,yes
5,M,green,man,89.0,,no


###Getting information about missing values from .info()

In [162]:
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   size    5 non-null      object 
 1   color   6 non-null      object 
 2   gender  5 non-null      object 
 3   price   5 non-null      float64
 4   weight  4 non-null      float64
 5   bought  6 non-null      object 
dtypes: float64(2), object(4)
memory usage: 416.0+ bytes


###Calculating how much data is *missing*

In [163]:
df = df_raw.copy()
# "%" of missing data in each column
df.isnull().sum() / len(df)
# It could be useful to check % of missing data in a row to figure out if we can delete it
# Wait for it... *

size      0.166667
color     0.000000
gender    0.166667
price     0.166667
weight    0.333333
bought    0.000000
dtype: float64

###Using SimpleUmputer with various imputing strategies

In [164]:
from ast import Constant
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
df[['weight']] = imputer.fit_transform(df[['weight']])

In [165]:
imputer = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=99)
df[['price']] == imputer.fit_transform(df[['price']])

Unnamed: 0,price
0,True
1,True
2,False
3,True
4,True
5,True


In [166]:
imputer = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value='L')
df[['size']] = imputer.fit_transform(df[['size']])
df

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,yes
1,L,green,male,89.0,450.0,no
2,M,blue,,,300.0,yes
3,L,green,female,129.0,415.0,no
4,M,red,female,79.0,410.0,yes
5,M,green,man,89.0,415.0,no


In [167]:
df[['size']] = df_raw[['size']].copy()
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imputer.fit_transform(df[['size']])

array([['XL'],
       ['L'],
       ['M'],
       ['M'],
       ['M'],
       ['M']], dtype=object)

###Back to where we started


In [168]:
df = df_raw.copy()
df

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,yes
1,L,green,male,89.0,450.0,no
2,M,blue,,,300.0,yes
3,,green,female,129.0,,no
4,M,red,female,79.0,410.0,yes
5,M,green,man,89.0,,no


###Finding null values in columns


In [169]:
# * IT'S HERE!
pd.isnull(df['weight']).sum() / len(df['weight'])

0.3333333333333333

###Finding only rows with null values in a given column

In [170]:
df[pd.isnull(df['weight'])]

Unnamed: 0,size,color,gender,price,weight,bought
3,,green,female,129.0,,no
5,M,green,man,89.0,,no


In [171]:
df[~pd.isnull(df['weight'])]

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,yes
1,L,green,male,89.0,450.0,no
2,M,blue,,,300.0,yes
4,M,red,female,79.0,410.0,yes


### Simple filling

In [172]:
# Can also be used on specific rows
#inplace=True makes the changes to actually happen to the df object
df.fillna(value='missing!', inplace=True)

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,yes
1,L,green,male,89.0,450.0,no
2,M,blue,missing!,missing!,300.0,yes
3,missing!,green,female,129.0,missing!,no
4,M,red,female,79.0,410.0,yes
5,M,green,man,89.0,missing!,no


### Deleting rows with missing values

In [None]:
# Can be honed using parameters to delete rows with various part of it's data missing
df = df.dropna()