In [None]:
#1) Missing values / Empty cells 
import pandas as pd
import numpy as np

# Load the dataset (make sure the CSV is in your working directory or provide full path)
df = pd.read_csv('Mine.csv')

# Display original data for reference
print("Original Dataset:")
print(df.head())

# Step 1: Fix the Date column (remove stray apostrophes and fix inconsistent formats)
df['Date'] = df['Date'].astype(str).str.replace("'", "", regex=False)

# Fix any dates not in the correct format (e.g., 20231018)
df['Date'] = pd.to_datetime(df['Date'], errors='coerce', format='%Y/%m/%d')

# Step 2: Convert numeric columns to proper dtype (if not already)
numeric_cols = ['Duration', 'Pulse', 'Maxpulse', 'Calories']
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')

# Step 3: Handle missing values
# Option 1: Drop rows with any missing values
# df_cleaned = df.dropna()

# Option 2: Fill missing values
df['Calories'].fillna(df['Calories'].mean(), inplace=True)
df['Pulse'].fillna(df['Pulse'].median(), inplace=True)
df['Maxpulse'].fillna(df['Maxpulse'].median(), inplace=True)
df['Date'].fillna(method='ffill', inplace=True)

# Final cleaned dataset
print("\nCleaned Dataset:")
print(df)

# Save cleaned version (optional)
# df.to_csv('Mine_cleaned.csv', index=False)


Original Dataset:
   Duration         Date  Pulse  Maxpulse  Calories
0        60  2023/10/01'  110.0     130.0     409.1
1        60  2023/10/02'  117.0     145.0     479.0
2        60  2023/10/03'  103.0     135.0     340.3
3        45  2023/10/04'  109.0     175.0     282.4
4        45  2023/10/05'  117.0     150.0     405.1

Cleaned Dataset:
    Duration       Date  Pulse  Maxpulse    Calories
0         60 2023-10-01  110.0     130.0  409.100000
1         60 2023-10-02  117.0     145.0  479.000000
2         60 2023-10-03  103.0     135.0  340.300000
3         45 2023-10-04  109.0     175.0  282.400000
4         45 2023-10-05  117.0     150.0  405.100000
5         60 2023-10-06  103.0     125.0  300.000000
6         60 2023-10-07  110.0     135.0  374.000000
7        400 2023-10-08  114.0     133.0  302.859259
8         60 2023-10-09  112.0     126.0  193.800000
9         30 2023-10-10  102.0     147.0  234.800000
10        60 2023-10-11  100.0     129.0  375.300000
11        60 202

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Calories'].fillna(df['Calories'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Pulse'].fillna(df['Pulse'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which w

In [None]:
#2) Inconsistent date formats   
import pandas as pd
import numpy as np

# Step 0: Simulate raw dataset (you can replace this with df = pd.read_csv("Mine.csv"))
df = pd.read_csv("Mine.csv")

# Display dataset with inconsistent date formats
print("🔍 Dataset with Inconsistent Dates:\n")
print(df[['Date']].head(15))  # Show part of the date column before cleaning

# Step 1: Fix inconsistent dates
df['Date'] = df['Date'].astype(str).str.replace("'", "", regex=False)
df['Date'] = pd.to_datetime(df['Date'], errors='coerce', format='%Y/%m/%d')
df['Date'].fillna(method='ffill', inplace=True)
df['Date'] = df['Date'].dt.strftime('%Y-%m-%d')  # format cleanly as string

# Show dataset after date cleaning only
print("\n✅ Dataset After Cleaning Dates Only:\n")
print(df[['Date']].head(15))

# Step 2: Fully clean the dataset (handle NaNs, correct types)
numeric_cols = ['Duration', 'Pulse', 'Maxpulse', 'Calories']
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')

# Fill missing numeric values with mean or median
df['Calories'].fillna(df['Calories'].mean(), inplace=True)
df['Pulse'].fillna(df['Pulse'].median(), inplace=True)
df['Maxpulse'].fillna(df['Maxpulse'].median(), inplace=True)

# Final cleaned dataset
print("\n🧼 Final Cleaned Dataset:\n")
print(df.head)  # Show first 10 rows of fully cleaned dataset


🔍 Dataset with Inconsistent Dates:

           Date
0   2023/10/01'
1   2023/10/02'
2   2023/10/03'
3   2023/10/04'
4   2023/10/05'
5   2023/10/06'
6   2023/10/07'
7   2023/10/08'
8   2023/10/09'
9   2023/10/10'
10  2023/10/11'
11  2023/10/12'
12  2023/10/13'
13  2023/10/15'
14  2023/10/15'

✅ Dataset After Cleaning Dates Only:

          Date
0   2023-10-01
1   2023-10-02
2   2023-10-03
3   2023-10-04
4   2023-10-05
5   2023-10-06
6   2023-10-07
7   2023-10-08
8   2023-10-09
9   2023-10-10
10  2023-10-11
11  2023-10-12
12  2023-10-13
13  2023-10-15
14  2023-10-15

🧼 Final Cleaned Dataset:

<bound method NDFrame.head of     Duration        Date  Pulse  Maxpulse    Calories
0         60  2023-10-01  110.0     130.0  409.100000
1         60  2023-10-02  117.0     145.0  479.000000
2         60  2023-10-03  103.0     135.0  340.300000
3         45  2023-10-04  109.0     175.0  282.400000
4         45  2023-10-05  117.0     150.0  405.100000
5         60  2023-10-06  103.0     125.0  300.0

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Date'].fillna(method='ffill', inplace=True)
  df['Date'].fillna(method='ffill', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Calories'].fillna(df['Calories'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work

In [None]:
#3) Duplicate rows. 
import pandas as pd
import numpy as np

# Simulating the dataset (replace with df = pd.read_csv("Mine.csv"))
df = pd.read_csv("Mine.csv")

# Step 0: Show original dataset (may contain duplicates)
print("📋 Original Dataset:\n")
print(df)

# Step 1: Identify duplicate rows
duplicates = df[df.duplicated()]
print("\n🔍 Duplicate Rows Detected:\n")
print(duplicates)

# Step 2: Remove duplicates
df = df.drop_duplicates()

# Step 3: Fix inconsistent date formats
df['Date'] = df['Date'].astype(str).str.replace("'", "", regex=False)
df['Date'] = pd.to_datetime(df['Date'], errors='coerce', format='%Y/%m/%d')
df['Date'].fillna(method='ffill', inplace=True)
df['Date'] = df['Date'].dt.strftime('%Y-%m-%d')

# Step 4: Handle numeric conversions and fill missing values
numeric_cols = ['Duration', 'Pulse', 'Maxpulse', 'Calories']
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')

df['Calories'].fillna(df['Calories'].mean(), inplace=True)
df['Pulse'].fillna(df['Pulse'].median(), inplace=True)
df['Maxpulse'].fillna(df['Maxpulse'].median(), inplace=True)

# Step 5: Show final cleaned dataset
pd.set_option('display.max_rows', None)
print("\n✅ Final Cleaned Dataset (All Duplicates Removed and Data Cleaned):\n")
print(df)


📋 Original Dataset:

    Duration         Date  Pulse  Maxpulse  Calories
0         60  2023/10/01'  110.0     130.0     409.1
1         60  2023/10/02'  117.0     145.0     479.0
2         60  2023/10/03'  103.0     135.0     340.3
3         45  2023/10/04'  109.0     175.0     282.4
4         45  2023/10/05'  117.0     150.0     405.1
5         60  2023/10/06'  103.0     125.0     300.0
6         60  2023/10/07'  110.0     135.0     374.0
7        400  2023/10/08'  114.0     133.0       NaN
8         60  2023/10/09'  112.0     126.0     193.8
9         30  2023/10/10'  102.0     147.0     234.8
10        60  2023/10/11'  100.0     129.0     375.3
11        60  2023/10/12'  109.0     131.0     345.6
12        60  2023/10/13'  103.0     136.0     239.2
13        60  2023/10/15'  120.0       NaN     240.8
14        60  2023/10/15'  120.0     100.0     240.8
15        60  2023/10/16'    NaN     101.0     243.8
16        60  2023/10/17'  127.0     102.0     380.2
17        45     20231018

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Date'].fillna(method='ffill', inplace=True)
  df['Date'].fillna(method='ffill', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Calories'].fillna(df['Calories'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work

In [None]:
#4) Wrong data  
import pandas as pd
import numpy as np

# Step 0: Read the dataset
df = pd.read_csv("Mine.csv")

# Step 1: Fix date format
df['Date'] = df['Date'].astype(str).str.replace("'", "", regex=False)
df['Date'] = pd.to_datetime(df['Date'], errors='coerce', format='%Y/%m/%d')
df['Date'].fillna(method='ffill', inplace=True)
df['Date'] = df['Date'].dt.strftime('%Y-%m-%d')

# Step 2: Convert numeric columns
numeric_cols = ['Duration', 'Pulse', 'Maxpulse', 'Calories']
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')

# Step 3: Identify wrong data

wrong_duration = df[df['Duration'] > 180]
wrong_pulse = df[(df['Pulse'] < 60) | (df['Pulse'] > 200)]
wrong_maxpulse = df[df['Maxpulse'] < df['Pulse']]
wrong_calories = df[df['Calories'] <= 0]

# Combine all wrong data
wrong_data = pd.concat([wrong_duration, wrong_pulse, wrong_maxpulse, wrong_calories]).drop_duplicates()

# Step 4: Display wrong data
pd.set_option('display.max_rows', None)
print("\n⚠️ Rows with Wrong Data:\n")
print(wrong_data)

# Step 5: Remove wrong data
df_cleaned = df[~df.index.isin(wrong_data.index)]

# Step 6: Fill missing values
df_cleaned['Calories'].fillna(df_cleaned['Calories'].mean(), inplace=True)
df_cleaned['Pulse'].fillna(df_cleaned['Pulse'].median(), inplace=True)
df_cleaned['Maxpulse'].fillna(df_cleaned['Maxpulse'].median(), inplace=True)

# Step 7: Display cleaned dataset
print("\n✅ Final Cleaned Dataset:\n")
print(df_cleaned)




⚠️ Rows with Wrong Data:

    Duration        Date  Pulse  Maxpulse  Calories
7        400  2023-10-08  114.0     133.0       NaN
14        60  2023-10-15  120.0     100.0     240.8
16        60  2023-10-17  127.0     102.0     380.2
17        45  2023-10-17  142.0     103.0     241.4
18        60  2023-10-19  151.0     104.0       NaN
19        60  2023-10-20  162.0     105.0     300.9
22        60  2023-10-22  130.0     108.0     230.8
24        60  2023-10-25  132.0     110.0     236.9
25        60  2023-10-26  135.0     118.0     278.8
26        60  2023-10-27  137.0     119.0     212.9
27        60  2023-10-28  138.0     121.0     345.9
28        60  2023-10-29  139.0     122.0     345.2

✅ Final Cleaned Dataset:

    Duration        Date  Pulse  Maxpulse    Calories
0         60  2023-10-01  110.0     130.0  409.100000
1         60  2023-10-02  117.0     145.0  479.000000
2         60  2023-10-03  103.0     135.0  340.300000
3         45  2023-10-04  109.0     175.0  282.400000


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Date'].fillna(method='ffill', inplace=True)
  df['Date'].fillna(method='ffill', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned['Calories'].fillna(df_cleaned['Calories'].mean(), inplace=True)
A value is trying to be set on a copy of a slice from a Dat

In [None]:
#5) Unnecessary columns that are not relevant to the analysis. 
import pandas as pd
import numpy as np

# Step 0: Load dataset
df = pd.read_csv("Mine.csv")

# Step 1: Display original columns
print("\n📋 Original Columns in Dataset:\n")
print(df.columns)

# Step 2: Fix date format
df['Date'] = df['Date'].astype(str).str.replace("'", "", regex=False)
df['Date'] = pd.to_datetime(df['Date'], errors='coerce', format='%Y/%m/%d')
df['Date'].fillna(method='ffill', inplace=True)
df['Date'] = df['Date'].dt.strftime('%Y-%m-%d')

# Step 3: Convert numeric columns
numeric_cols = ['Duration', 'Pulse', 'Maxpulse', 'Calories']
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')

# Step 4: Identify unnecessary columns (anything not in our target list)
relevant_cols = ['Duration', 'Date', 'Pulse', 'Maxpulse', 'Calories']
unnecessary_cols = [col for col in df.columns if col not in relevant_cols]

# Step 5: Show unnecessary columns (if any)
print("\n🗑️ Unnecessary Columns to be Dropped:\n")
print(unnecessary_cols if unnecessary_cols else "No unnecessary columns found.")

# Step 6: Drop unnecessary columns
df_cleaned = df[relevant_cols]

# Step 7: Fill missing values
df_cleaned['Calories'].fillna(df_cleaned['Calories'].mean(), inplace=True)
df_cleaned['Pulse'].fillna(df_cleaned['Pulse'].median(), inplace=True)
df_cleaned['Maxpulse'].fillna(df_cleaned['Maxpulse'].median(), inplace=True)

# Step 8: Display cleaned dataset
pd.set_option('display.max_rows', None)
print("\n✅ Final Cleaned Dataset Without Unnecessary Columns:\n")
print(df_cleaned)



📋 Original Columns in Dataset:

Index(['Duration', 'Date', 'Pulse', 'Maxpulse', 'Calories'], dtype='object')

🗑️ Unnecessary Columns to be Dropped:

No unnecessary columns found.

✅ Final Cleaned Dataset Without Unnecessary Columns:

    Duration        Date  Pulse  Maxpulse    Calories
0         60  2023-10-01  110.0     130.0  409.100000
1         60  2023-10-02  117.0     145.0  479.000000
2         60  2023-10-03  103.0     135.0  340.300000
3         45  2023-10-04  109.0     175.0  282.400000
4         45  2023-10-05  117.0     150.0  405.100000
5         60  2023-10-06  103.0     125.0  300.000000
6         60  2023-10-07  110.0     135.0  374.000000
7        400  2023-10-08  114.0     133.0  302.859259
8         60  2023-10-09  112.0     126.0  193.800000
9         30  2023-10-10  102.0     147.0  234.800000
10        60  2023-10-11  100.0     129.0  375.300000
11        60  2023-10-12  109.0     131.0  345.600000
12        60  2023-10-13  103.0     136.0  239.200000
13       

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Date'].fillna(method='ffill', inplace=True)
  df['Date'].fillna(method='ffill', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned['Calories'].fillna(df_cleaned['Calories'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method