Put all the data cleaning and formatting steps into functions, and create a main function that performs all the cleaning and formatting.

Write these functions in a separate .py file(s). By putting these steps into functions, we can make the code more modular and easier to maintain.

In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame
import clean_data as cd

def main_function(df:str)->DataFrame:
    df1 = pd.read_csv(df)
    df1=cd.change_col_names_to_lower_case(df1) 
    df1=cd.replace_white_spaces_in_col_names(df1)
    df1=cd.rename_col_names(df1,{'customer':'id', 'st':'state'})
    df1 =cd.clean_column_by_mapping(df1)
    string_operations = {
    'customer-lifetime-value': cd.strip_percentage,
    'vehicle-class': cd.map_vehicle_class,}
 # Add more columns and their string operations as needed}

    df1 = cd.clean_column_by_string_operation(df1,string_operations)
    df1 = cd.handle_missing_values(
    df1,
    numerical_columns1_mean=['customer-lifetime-value', 'income', 'monthly-premium-auto', 'number-of-open-complaints', 'total-claim-amount'],
    numerical_columns2_median=['income'],
    categorical_columns_mode=['state', 'gender', 'education', 'customer-lifetime-value',"policy-type","vehicle-class"])
    df1 =cd.duplicate_rows(df1,subset_columns = ['id','income','customer-lifetime-value','monthly-premium-auto','number-of-open-complaints'])
    return df1   

In [2]:
df1=main_function('https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file1.csv')
display(df1.tail(25))

Column 'id' has 2937 null values.
Column 'state' has 2937 null values.
Column 'gender' has 3054 null values.
Column 'education' has 2937 null values.
Column 'income' has 2937 null values.
Column 'monthly-premium-auto' has 2937 null values.
Column 'number-of-open-complaints' has 2937 null values.
Column 'policy-type' has 2937 null values.
Column 'vehicle-class' has 2937 null values.
Column 'total-claim-amount' has 2937 null values.
Duplicate Rows:
id                           2936
state                        2936
gender                       2936
education                    2936
customer-lifetime-value      2936
income                       2936
monthly-premium-auto         2936
number-of-open-complaints    2936
policy-type                  2936
vehicle-class                2936
total-claim-amount           2936
dtype: int64
Duplicate Rows:
id                           0
state                        0
gender                       0
education                    0
customer-lifetime-valu

Unnamed: 0,id,state,gender,education,customer-lifetime-value,income,monthly-premium-auto,number-of-open-complaints,policy-type,vehicle-class,total-claim-amount
3983,9207029e-2346-491d-881b-05a24ddc9fd5,California,F,Bachelor,793690,39295,193,0,Personal Auto,Four-Door Car,404
3984,9207029e-2346-491d-881b-05a24ddc9fd5,California,F,Bachelor,793690,39295,193,0,Personal Auto,Four-Door Car,404
3985,9207029e-2346-491d-881b-05a24ddc9fd5,California,F,Bachelor,793690,39295,193,0,Personal Auto,Four-Door Car,404
3986,9207029e-2346-491d-881b-05a24ddc9fd5,California,F,Bachelor,793690,39295,193,0,Personal Auto,Four-Door Car,404
3987,9207029e-2346-491d-881b-05a24ddc9fd5,California,F,Bachelor,793690,39295,193,0,Personal Auto,Four-Door Car,404
3988,9207029e-2346-491d-881b-05a24ddc9fd5,California,F,Bachelor,793690,39295,193,0,Personal Auto,Four-Door Car,404
3989,9207029e-2346-491d-881b-05a24ddc9fd5,California,F,Bachelor,793690,39295,193,0,Personal Auto,Four-Door Car,404
3990,9207029e-2346-491d-881b-05a24ddc9fd5,California,F,Bachelor,793690,39295,193,0,Personal Auto,Four-Door Car,404
3991,9207029e-2346-491d-881b-05a24ddc9fd5,California,F,Bachelor,793690,39295,193,0,Personal Auto,Four-Door Car,404
3992,9207029e-2346-491d-881b-05a24ddc9fd5,California,F,Bachelor,793690,39295,193,0,Personal Auto,Four-Door Car,404
