In [49]:
import pandas as pd
import numpy as np

In [50]:
df = pd.read_csv("Mine.csv")
df.head()

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
0,60,2023/10/01',110.0,130.0,409.1
1,60,2023/10/02',117.0,145.0,479.0
2,60,2023/10/03',103.0,135.0,340.3
3,45,2023/10/04',109.0,175.0,282.4
4,45,2023/10/05',117.0,150.0,405.1


# 1. Handle Missing Values 

In [52]:
# Lets fill missing pulses with median since it is less affected by outliers
pulse_median = df['Pulse'].median()
df['Pulse'] = df['Pulse'].fillna(pulse_median)
df.head(15)

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
0,60,2023/10/01',110.0,130.0,409.1
1,60,2023/10/02',117.0,145.0,479.0
2,60,2023/10/03',103.0,135.0,340.3
3,45,2023/10/04',109.0,175.0,282.4
4,45,2023/10/05',117.0,150.0,405.1
5,60,2023/10/06',103.0,125.0,300.0
6,60,2023/10/07',110.0,135.0,374.0
7,400,2023/10/08',114.0,133.0,
8,60,2023/10/09',112.0,126.0,193.8
9,30,2023/10/10',102.0,147.0,234.8


In [53]:
# Now we fill missing maxpulses with median as well
maxpulse_median = df['Maxpulse'].median()
df['Maxpulse'] = df['Maxpulse'].fillna(maxpulse_median)
df.head(15)

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
0,60,2023/10/01',110.0,130.0,409.1
1,60,2023/10/02',117.0,145.0,479.0
2,60,2023/10/03',103.0,135.0,340.3
3,45,2023/10/04',109.0,175.0,282.4
4,45,2023/10/05',117.0,150.0,405.1
5,60,2023/10/06',103.0,125.0,300.0
6,60,2023/10/07',110.0,135.0,374.0
7,400,2023/10/08',114.0,133.0,
8,60,2023/10/09',112.0,126.0,193.8
9,30,2023/10/10',102.0,147.0,234.8


In [54]:
# We also fill missing calories with median
calories_median = df['Calories'].median()
df['Calories'] = df['Calories'].fillna(calories_median)
df.head(15)

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
0,60,2023/10/01',110.0,130.0,409.1
1,60,2023/10/02',117.0,145.0,479.0
2,60,2023/10/03',103.0,135.0,340.3
3,45,2023/10/04',109.0,175.0,282.4
4,45,2023/10/05',117.0,150.0,405.1
5,60,2023/10/06',103.0,125.0,300.0
6,60,2023/10/07',110.0,135.0,374.0
7,400,2023/10/08',114.0,133.0,282.4
8,60,2023/10/09',112.0,126.0,193.8
9,30,2023/10/10',102.0,147.0,234.8


# 2. Fix Inconsistent Date Formats 

In [56]:
# Removing trailing apostrophe characters from dates
df['Date'] = df['Date'].str.replace("'", '', regex=True)
df.head(15)

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
0,60,2023/10/01,110.0,130.0,409.1
1,60,2023/10/02,117.0,145.0,479.0
2,60,2023/10/03,103.0,135.0,340.3
3,45,2023/10/04,109.0,175.0,282.4
4,45,2023/10/05,117.0,150.0,405.1
5,60,2023/10/06,103.0,125.0,300.0
6,60,2023/10/07,110.0,135.0,374.0
7,400,2023/10/08,114.0,133.0,282.4
8,60,2023/10/09,112.0,126.0,193.8
9,30,2023/10/10,102.0,147.0,234.8


In [57]:
# Converting to datetime while handling both formats
def parse_date(date_str):
    if pd.isna(date_str):
        return pd.NaT
    try:
        return pd.to_datetime(date_str, format='%Y/%m/%d')
    except:
        try:
            return pd.to_datetime(date_str, format='%Y%m%d')
        except:
            return pd.NaT

df['Date'] = df['Date'].apply(parse_date)
df.head(20)

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
0,60,2023-10-01,110.0,130.0,409.1
1,60,2023-10-02,117.0,145.0,479.0
2,60,2023-10-03,103.0,135.0,340.3
3,45,2023-10-04,109.0,175.0,282.4
4,45,2023-10-05,117.0,150.0,405.1
5,60,2023-10-06,103.0,125.0,300.0
6,60,2023-10-07,110.0,135.0,374.0
7,400,2023-10-08,114.0,133.0,282.4
8,60,2023-10-09,112.0,126.0,193.8
9,30,2023-10-10,102.0,147.0,234.8


In [58]:
# Drop rows with missing dates
df = df.dropna(subset=['Date'])
df

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
0,60,2023-10-01,110.0,130.0,409.1
1,60,2023-10-02,117.0,145.0,479.0
2,60,2023-10-03,103.0,135.0,340.3
3,45,2023-10-04,109.0,175.0,282.4
4,45,2023-10-05,117.0,150.0,405.1
5,60,2023-10-06,103.0,125.0,300.0
6,60,2023-10-07,110.0,135.0,374.0
7,400,2023-10-08,114.0,133.0,282.4
8,60,2023-10-09,112.0,126.0,193.8
9,30,2023-10-10,102.0,147.0,234.8


# 3. Remove Duplicate Rows 

In [60]:
df = df.drop_duplicates()
df

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
0,60,2023-10-01,110.0,130.0,409.1
1,60,2023-10-02,117.0,145.0,479.0
2,60,2023-10-03,103.0,135.0,340.3
3,45,2023-10-04,109.0,175.0,282.4
4,45,2023-10-05,117.0,150.0,405.1
5,60,2023-10-06,103.0,125.0,300.0
6,60,2023-10-07,110.0,135.0,374.0
7,400,2023-10-08,114.0,133.0,282.4
8,60,2023-10-09,112.0,126.0,193.8
9,30,2023-10-10,102.0,147.0,234.8


In [61]:
# Removing exact duplicates (such as the two entries for 2023-10-15)
df = df.drop_duplicates(subset=['Date'], keep='first')
df

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
0,60,2023-10-01,110.0,130.0,409.1
1,60,2023-10-02,117.0,145.0,479.0
2,60,2023-10-03,103.0,135.0,340.3
3,45,2023-10-04,109.0,175.0,282.4
4,45,2023-10-05,117.0,150.0,405.1
5,60,2023-10-06,103.0,125.0,300.0
6,60,2023-10-07,110.0,135.0,374.0
7,400,2023-10-08,114.0,133.0,282.4
8,60,2023-10-09,112.0,126.0,193.8
9,30,2023-10-10,102.0,147.0,234.8


# 4. Fix Wrong Data

In [63]:
# 400 minutes is likely a typo, it is unrealistic
df.loc[df['Duration'] == 400, 'Duration'] = 40
df

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
0,60,2023-10-01,110.0,130.0,409.1
1,60,2023-10-02,117.0,145.0,479.0
2,60,2023-10-03,103.0,135.0,340.3
3,45,2023-10-04,109.0,175.0,282.4
4,45,2023-10-05,117.0,150.0,405.1
5,60,2023-10-06,103.0,125.0,300.0
6,60,2023-10-07,110.0,135.0,374.0
7,40,2023-10-08,114.0,133.0,282.4
8,60,2023-10-09,112.0,126.0,193.8
9,30,2023-10-10,102.0,147.0,234.8


In [64]:
# Fixing cases where Maxpulse is less than Pulse
mask = df['Maxpulse'] < df['Pulse']
df.loc[mask, 'Maxpulse'] = df.loc[mask, 'Pulse'] * 1.2  # Setting the maxpulse to 20% higher than Pulse
df

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
0,60,2023-10-01,110.0,130.0,409.1
1,60,2023-10-02,117.0,145.0,479.0
2,60,2023-10-03,103.0,135.0,340.3
3,45,2023-10-04,109.0,175.0,282.4
4,45,2023-10-05,117.0,150.0,405.1
5,60,2023-10-06,103.0,125.0,300.0
6,60,2023-10-07,110.0,135.0,374.0
7,40,2023-10-08,114.0,133.0,282.4
8,60,2023-10-09,112.0,126.0,193.8
9,30,2023-10-10,102.0,147.0,234.8


# 5. Check for Unnecessary Columns 
All columns seem relevant in my view

In [66]:
# Finally, Data Type Conversions
# Convert multiple columns at once using .loc
df.loc[:, ['Duration', 'Pulse', 'Maxpulse']] = df[['Duration', 'Pulse', 'Maxpulse']].astype(int)

In [67]:
# Resetting index after cleaning
df = df.reset_index(drop=True)

In [68]:
# Data Summary

In [69]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28 entries, 0 to 27
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   Duration  28 non-null     int64         
 1   Date      28 non-null     datetime64[ns]
 2   Pulse     28 non-null     float64       
 3   Maxpulse  28 non-null     float64       
 4   Calories  28 non-null     float64       
dtypes: datetime64[ns](1), float64(3), int64(1)
memory usage: 1.2 KB


In [70]:
df.describe()

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
count,28.0,28,28.0,28.0,28.0
mean,56.607143,2023-10-15 07:42:51.428571392,119.214286,145.392857,301.953571
min,30.0,2023-10-01 00:00:00,94.0,106.0,193.8
25%,60.0,2023-10-07 18:00:00,103.0,129.75,241.25
50%,60.0,2023-10-15 12:00:00,114.5,141.5,282.4
75%,60.0,2023-10-22 12:00:00,134.25,162.5,345.675
max,60.0,2023-10-31 00:00:00,162.0,194.0,479.0
std,7.823702,,17.554903,21.789123,68.648517


In [73]:
# Saving cleaned data
df.to_csv('cleaned_fitness_data.csv', index=False)