# Data Cleaning and Transformation Test

Use file `./ref/sample_user_data.csv` for input

Use `pandas` lib or other lib if you want

Write your code and display the result in specific cell

In [346]:
import pandas as pd

In [347]:
# read csv file

df = pd.read_csv("./ref/sample_user_data.csv")
df.info()
df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29 entries, 0 to 28
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   user_id    29 non-null     int64  
 1   name       29 non-null     object 
 2   age        26 non-null     float64
 3   join_date  27 non-null     object 
 4   salary     29 non-null     object 
dtypes: float64(1), int64(1), object(3)
memory usage: 1.3+ KB


Unnamed: 0,user_id,name,age,join_date,salary
0,1,John,28.0,2023-02-01,50000.0
1,2,Alice,,2023-02-02,52000.0
2,3,Bob,34.0,02/03/2023,49000.0
3,3,Bob,34.0,2023-03-02,49000.0
4,4,Emma,45.0,,47000.0
5,5,Grek,32.0,,23000.32
6,6,allen,26.0,2023-09-23,30000.0
7,7,Allen,26.0,2023-09-23,30000.0
8,8,chris,28.0,2020-11-13,35000.25
9,9,Bob,33.0,2023-04-02,49000.0


### Question 1
There're data quality issues, such as outliers, null values, and incorrect data types.  

1. Validate fields.
2. Clean data by handling missing values.
3. Deduplicate record (if any).
4. Standardize any inconsistent data (e.g., date formats).


There're several conditions,
1.  Replace NaN in age value with "-"
2.  Format join_date to date type.
    -   YYYY-MM-DD i.e. 2024-11-03
    -   DD/MM/YYYY i.e. 03/12/2023
    -   remove NaN value
3. Name should be captitalized.



#### Example Data Quality Checks:
- Identify outliers in numerical fields (e.g., ages > 100).
- Ensure no future dates in a date column.
- user_id not be empty




In [423]:
# Write you code and display question 1 result here.

# copy dataframe to use in cleansing
cleansing_df = df.copy()

# Check user_id is not empty
empty_user_id = cleansing_df[(cleansing_df["user_id"].isnull()) | (cleansing_df["user_id"] == '')]
if not empty_user_id.empty: print("This following data have empty user_id", empty_user_id)

# Identify outliers in ages columns, since there are no instruction for ages to remove or replace
# I will create new column that will use to identify user with ages more than 100 for future use such as cleasning or reinput
cleansing_df["is_valid_age"] = cleansing_df["age"] > 100

# Use fillna function to replace NaN value in age columns with "-"
cleansing_df["age"].fillna("-", inplace=True)

# Remove "," in salary column and change data type to float
cleansing_df["salary"] = cleansing_df["salary"].apply(lambda x: x.replace(',', '')).astype(float)
# Use numpy floor function to remove any decimal and change data type to int
import numpy as np
cleansing_df["salary"] = np.floor(cleansing_df["salary"]).astype(int)

# Use capitalize function to capitalize name columns
cleansing_df["name"] = cleansing_df["name"].str.capitalize()

# use apply function with pandas to_datetime to check format date 
# if found error then convert invalid date to Not a Time (NaT) to compare not pd.Nat to return False if found NaT
cleansing_df["is_valid_date"] = cleansing_df["join_date"].apply(lambda x: pd.to_datetime(x, format="%Y-%m-%d", errors='coerce') is not pd.NaT)

# use is_valid_date column to determine date that is not in right format and not NaN
wrong_date_list = cleansing_df[(cleansing_df["is_valid_date"] == False) & (cleansing_df["join_date"].notna())]["join_date"]
# store index of wrong date format
wrong_date_index = cleansing_df[(cleansing_df["is_valid_date"] == False) & (cleansing_df["join_date"].notna())].index

# for loop if there are more than one date in wrong format
fix_date_list = []
for date_value in wrong_date_list:
    # Assume that wrong format date is DD/MM/YYYY i.e. 03/12/2023
    replace_date = date_value.replace("/","")
    # Remove "/" then check length if exactly 8 which can easy to slice into day, month, year 
    if len(replace_date) == 8:
        day = replace_date[0:2]
        month = replace_date[2:4]
        year = replace_date[4:8]
    else:       
    # find "/" then slice number before "/" and put in day, month, year in sequence
        first_slash = date_value.find("/")
        day = date_value[0:first_slash]
        date_value_2 = date_value[first_slash+1:]
        second_slash = date_value[first_slash+1:].find("/")
        month = date_value_2[0:second_slash]
        year = date_value_2[second_slash+1:]
    
    # print error text when list of date as day, month, year is over than usual value which could be value or input format
    error_text = ""
    if(int(day) > 31): error_text += "day has more than 31 [error value]:" + day
    if(int(month) > 12): error_text += "\nmonth has more than 12 [error value]:" + str(int(month))
    if(len(year) > 4): error_text += "\nyear has more than 4 digits [error value]:" + str(year)
    if(error_text != ""): print("Date: " + date_value + " unable to convert and contains error below" + error_text)
    
    # store list if day, month, year in range and if not store non transform value
    if((int(day) <= 31) & (int(month) <= 12) & (len(year) == 4)):
        fix_date_list.append(pd.Timestamp(year+"-"+str(int(month))+"-"+str(int(day))).date())
    else :
        fix_date_list.append(date_value)
# Replace list of fix date format to wrong date format list
cleansing_df["join_date"].loc[wrong_date_index] = fix_date_list

# After cleansing date then remove NaN value with dropna()
cleansing_df = cleansing_df.dropna()

# Convert join_date column to datetime format
cleansing_df["join_date"] = cleansing_df["join_date"].apply(pd.to_datetime)

# Ensure no future date in date column by using datetime today() check with column join_date to store index
import datetime
future_date_index = cleansing_df[cleansing_df["join_date"] >= pd.Timestamp(datetime.date.today())].index
# Drop row with future date, but if row is requried it can be replace with mode() dealing with DataQuality
cleansing_df = cleansing_df.drop(future_date_index)

# Drop duplicate record with pandas function
cleansing_df = cleansing_df.drop_duplicates(subset=['user_id', 'name', 'age'])

cleansing_df = cleansing_df.reset_index(drop=True)
cleansing_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleansing_df["join_date"].loc[wrong_date_index] = fix_date_list


Unnamed: 0,user_id,name,age,join_date,salary,is_valid_age,is_valid_date
0,1,John,28.0,2023-02-01,50000,False,True
1,2,Alice,-,2023-02-02,52000,False,True
2,3,Bob,34.0,2023-03-02,49000,False,False
3,6,Allen,26.0,2023-09-23,30000,False,True
4,7,Allen,26.0,2023-09-23,30000,False,True
5,8,Chris,28.0,2020-11-13,35000,False,True
6,9,Bob,33.0,2023-04-02,49000,False,True
7,10,Bobb,34.0,2023-03-01,49000,False,True
8,12,Bobby,34.0,2023-03-01,49000,False,True
9,13,Bab,39.0,2023-02-02,49000,False,True


### Question 2
From question 1, display summary statistics on specific columns (mean, median, max, min, average)


In [424]:
# Write you code and display question 2 result here.

# Use describe function on dataframe to display (mean, median, max, min, average)
print(cleansing_df.describe())

# As age contain "-" it cannot use statistics so we need to exclude those record, also age still contain outlier
cleansing_df[["user_id", "age", "salary"]][cleansing_df["age"] != '-'].astype(int).describe()

         user_id         salary
count  19.000000      19.000000
mean   12.789474   66210.526316
std     7.091301   38552.387082
min     1.000000   22000.000000
25%     7.500000   49000.000000
50%    13.000000   49000.000000
75%    18.500000   84500.000000
max    24.000000  149000.000000


Unnamed: 0,user_id,age,salary
count,16.0,16.0,16.0
mean,12.625,96.3125,62875.0
std,6.820313,226.496422,40074.721875
min,1.0,22.0,22000.0
25%,7.75,28.0,45500.0
50%,12.5,34.0,49000.0
75%,17.25,39.0,60750.0
max,24.0,940.0,149000.0


### Question 3
Display user record who has filled all data (data not be NaN, empty or "-")



In [430]:
# Write you code and display question 3 result here.

# If using cleansing dataframe from Question 1 then 
# Drop rows with NaN and Checks each row for "-" and keeps only rows that don't contain "-"
filtered_df = cleansing_df.dropna() 
filtered_df = filtered_df[filtered_df.ne('-').all(axis=1)] 
filtered_df[["user_id", "name", "age", "join_date", "salary"]].reset_index(drop=True)

Unnamed: 0,user_id,name,age,join_date,salary
0,1,John,28.0,2023-02-01,50000
1,3,Bob,34.0,2023-03-02,49000
2,6,Allen,26.0,2023-09-23,30000
3,7,Allen,26.0,2023-09-23,30000
4,8,Chris,28.0,2020-11-13,35000
5,9,Bob,33.0,2023-04-02,49000
6,10,Bobb,34.0,2023-03-01,49000
7,12,Bobby,34.0,2023-03-01,49000
8,13,Bab,39.0,2023-02-02,49000
9,15,Bob,39.0,2024-03-02,69000


In [431]:
# If using non cleansing dataframe then
# Drop rows with NaN and Checks each row for "-" and keeps only rows that don't contain "-"
filtered_ori_df = df.dropna()  # Drop rows with NaN
filtered_ori_df = filtered_ori_df[filtered_ori_df.ne('-').all(axis=1)] 
filtered_ori_df.reset_index(drop=True)

Unnamed: 0,user_id,name,age,join_date,salary
0,1,John,28.0,2023-02-01,50000.0
1,3,Bob,34.0,02/03/2023,49000.0
2,3,Bob,34.0,2023-03-02,49000.0
3,6,allen,26.0,2023-09-23,30000.0
4,7,Allen,26.0,2023-09-23,30000.0
5,8,chris,28.0,2020-11-13,35000.25
6,9,Bob,33.0,2023-04-02,49000.0
7,10,Bobb,34.0,2023-03-01,49000.0
8,9,Bob,33.0,2023-04-02,49000.0
9,12,Bobby,34.0,2023-03-01,49000.0
