In [10]:
import pandas as pd
import numpy as np

# Read the raw data into a DataFrame
df = pd.read_csv('mine.csv')

## 1. Handle Missing Values / Empty Cells
# These columns are expected to hold numbers. Each one is processed in a
# single pass: values that cannot be read as numbers are coerced to NaN,
# and every NaN is then replaced by that column's median.
numeric_cols = ['Duration', 'Pulse', 'Maxpulse', 'Calories']
for col in numeric_cols:
    coerced = pd.to_numeric(df[col], errors='coerce')
    median_val = coerced.median()
    df[col] = coerced.fillna(median_val)

## 2. Fix Inconsistent Date Formats
# Work on a cleaned text version of the column: remove stray quotes and
# surrounding whitespace. (astype(str) turns empty cells into the text "nan",
# which the datetime conversion below coerces to NaT.)
date_text = df['Date'].astype(str).str.replace("'", "", regex=False).str.strip()

# Fix the malformed compact date (20231018 -> 2023/10/18). The pattern is
# anchored so only values that are exactly eight digits are rewritten.
date_text = date_text.str.replace(r'^(\d{4})(\d{2})(\d{2})$', r'\1/\2/\3', regex=True)

# Convert to datetime; anything unparseable stays NaT until after deduplication
df['Date'] = pd.to_datetime(date_text, errors='coerce')

## 3. Remove Duplicate Rows
# Remove exact duplicate rows
df = df.drop_duplicates()

# Check for duplicate dates (keeping first occurrence). Only rows whose date
# was actually parsed take part: rows with a missing date must not be treated
# as duplicates of each other, or of a real row that happens to carry the
# placeholder date, otherwise they would be silently discarded.
has_real_date = df['Date'].notna()
repeated_date = has_real_date & df.duplicated(subset=['Date'], keep='first')
df = df.loc[~repeated_date].copy()  # copy() so later column assignments act on an independent frame

# Fill any remaining missing dates last, so the placeholder never drives row removal
df['Date'] = df['Date'].fillna(pd.to_datetime('2023-10-15'))  # Using mid-month as default

## 4. Fix Wrong Data
# Force each measurement into a plausible range; out-of-range values are
# moved to the nearest bound rather than dropped.

# Session length in minutes, assumed to be at most 3 hours (180 mins)
df['Duration'] = df['Duration'].clip(lower=0, upper=180)

# Reasonable heart rate range
bounded_pulse = df['Pulse'].clip(lower=40, upper=220)
df['Pulse'] = bounded_pulse

# Maxpulse may not be lower than the (already bounded) pulse of the same row
df['Maxpulse'] = df['Maxpulse'].clip(lower=bounded_pulse, upper=220)

# Calories: 100-1000 assumed to be a reasonable range for these durations
df['Calories'] = df['Calories'].clip(lower=100, upper=1000)

## 5. Remove Unnecessary Columns
# (All columns appear relevant for fitness analysis - none removed)
# If needed: df = df.drop(['Unneeded_Column'], axis=1)

## Final Verification
print("\nCleaned Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()

print("\nMissing Values After Cleaning:")
print(df.isnull().sum())

print("\nDuplicate Rows After Cleaning:", df.duplicated().sum())

# Save cleaned dataset (index=False keeps the row index out of the file)
df.to_csv('cleaned_mine.csv', index=False)
print("\nCleaned dataset saved as 'cleaned_mine.csv'")

# Display sample of cleaned data
print("\nSample of Cleaned Data:")
print(df.head())


Cleaned Dataset Info:
<class 'pandas.core.frame.DataFrame'>
Index: 28 entries, 0 to 30
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   Duration  28 non-null     int64         
 1   Date      28 non-null     datetime64[ns]
 2   Pulse     28 non-null     float64       
 3   Maxpulse  28 non-null     float64       
 4   Calories  28 non-null     float64       
dtypes: datetime64[ns](1), float64(3), int64(1)
memory usage: 1.3 KB
None

Missing Values After Cleaning:
Duration    0
Date        0
Pulse       0
Maxpulse    0
Calories    0
dtype: int64

Duplicate Rows After Cleaning: 0

Cleaned dataset saved as 'cleaned_mine.csv'

Sample of Cleaned Data:
   Duration       Date  Pulse  Maxpulse  Calories
0        60 2023-10-01  110.0     130.0     409.1
1        60 2023-10-02  117.0     145.0     479.0
2        60 2023-10-03  103.0     135.0     340.3
3        45 2023-10-04  109.0     175.0     282.4
4        45 202