# Notebook showing how to use the functions

In [1]:
import warnings

warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

In [2]:
# Toy dataset: col_2 and col_3 each end with a single missing value (NaN).
data = {
    "col_1": [3, 2, 1, 0],
    "col_2": [4, 5, 6, np.nan],
    "col_3": [4, 5, 6, np.nan],
}
df = pd.DataFrame(data)

In [3]:
# Count the NaNs in each column before any imputation is applied.
print("Number of missing values col_2: ", df["col_2"].isna().sum())
print("Number of missing values col_3: ", df["col_3"].isna().sum())

Number of missing values col_2:  1
Number of missing values col_3:  1


In [4]:
# having the function in the notebook

# function to impute NaNs with mean value
def impute(series: pd.Series) -> pd.Series:
    """Return a new Series with missing values replaced by the series mean.

    Args:
        series: The Series whose NaN entries should be filled.

    Returns:
        The result of ``series.fillna(series.mean())``; the input Series is
        not modified.
    """
    return series.fillna(series.mean())

In [5]:
# Fill col_2 with the in-notebook function; col_3 is not imputed in this cell.
df["col_2"] = impute(df["col_2"])
print("Number of missing values col_2: ", df["col_2"].isna().sum())
print("Number of missing values col_3: ", df["col_3"].isna().sum())

Number of missing values col_2:  0
Number of missing values col_3:  1


### After data cleaning step

Move the functions into Python scripts, then update the notebook to import and call them. This greatly reduces copy-pasting, especially in larger projects with several notebooks.

In [6]:
# Call the function from the script instead of the in-notebook copy.
# The notebook sits in the same folder as the module, so it is imported
# directly by name, with no package prefix such as `scripts.`.
from data_processing import impute_mean

df["col_3"] = impute_mean(df["col_3"])
print("Number of missing values col_2: ", df["col_2"].isna().sum())
print("Number of missing values col_3: ", df["col_3"].isna().sum())

Number of missing values col_2:  0
Number of missing values col_3:  0
