<center>
    <h1>Data Cleaning Process</h1>
    <a href="https://www.kaggle.com/datasets/shaikhjaveriya/dirtydata-csv">Dataset link</a>
</center>


In [122]:
import pandas as pd

### Load the dataset

In [123]:
file_path = "C:\\Users\\METHUNKUMAR\\Downloads\\dirtydata.csv"
df = pd.read_csv(file_path)

In [124]:
print("Original Dataframe")
df

Original Dataframe


Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
0,60,'2020/12/01',110.0,130,409.1
1,60,'2020/12/02',117.0,145,479.0
2,60,'2020/12/03',103.0,135,340.0
3,45,'2020/12/04',109.0,175,282.4
4,45,'2020/12/05',117.0,148,406.0
5,60,'2020/12/06',102.0,127,300.0
6,60,'2020/12/07',44.22,136,374.0
7,450,'2020/12/08',104.0,134,253.3
8,30,'2020/12/09',109.0,133,195.1
9,60,'2020/12/10',98.0,124,269.0


### Step 1: Display the summary of the dataset

In [125]:
print("Initial Dataset Summary:")
df.info()

Initial Dataset Summary:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32 entries, 0 to 31
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Duration  32 non-null     int64  
 1   Date      30 non-null     object 
 2   Pulse     32 non-null     float64
 3   Maxpulse  32 non-null     int64  
 4   Calories  30 non-null     float64
dtypes: float64(2), int64(2), object(1)
memory usage: 1.4+ KB


### Step 2: Check for missing values

In [126]:
missing_values = df.isnull().sum()
print("\nMissing Values")
print(missing_values)


Missing Values
Duration    0
Date        2
Pulse       0
Maxpulse    0
Calories    2
dtype: int64


### Step 3: Handling missing values

#### Fill numeric missing values with mean

In [127]:
for column in df.select_dtypes(include=['float64', 'int64']).columns:
    df.fillna({column: df[column].mean()}, inplace=True)

#### Fill missing values in categorical columns with the mode

In [128]:
for column in df.select_dtypes(include=['object']).columns:
    df.fillna({column: df[column].mode()[0]}, inplace=True)

### Step 4: Removing duplicates

In [129]:
df.drop_duplicates(inplace=True)

### Step 5: Fixing data types

#### Convert 'date' column to datetime if present

In [130]:
if 'date' in df.columns:
    df['date'] = pd.to_datetime(df['date'], errors='coerce')

### Step 6: Standardizing text data

In [131]:
df = df.map(lambda x: x.strip().lower() if isinstance(x, str) else x)

### Step 7: Remove invalid values or outliers

In [132]:
df = df[(df['Duration'] >= 0) & (df['Duration'] <= 60)]

### Step 8: Check for dataset after cleaning

In [133]:
df.info()
df

<class 'pandas.core.frame.DataFrame'>
Index: 30 entries, 0 to 31
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Duration  30 non-null     int64  
 1   Date      30 non-null     object 
 2   Pulse     30 non-null     float64
 3   Maxpulse  30 non-null     int64  
 4   Calories  30 non-null     float64
dtypes: float64(2), int64(2), object(1)
memory usage: 1.4+ KB


Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
0,60,'2020/12/01',110.0,130,409.1
1,60,'2020/12/02',117.0,145,479.0
2,60,'2020/12/03',103.0,135,340.0
3,45,'2020/12/04',109.0,175,282.4
4,45,'2020/12/05',117.0,148,406.0
5,60,'2020/12/06',102.0,127,300.0
6,60,'2020/12/07',44.22,136,374.0
8,30,'2020/12/09',109.0,133,195.1
9,60,'2020/12/10',98.0,124,269.0
10,60,'2020/12/11',103.0,147,329.3


### Step 9: Save the cleaned data to a new CSV file

In [134]:
cleaned_file_path = "C:\\Users\\METHUNKUMAR\\Downloads\\cleaned_data.csv"
df.to_csv(cleaned_file_path, index=False, header=True)

### Step 10: Check cleaned_data.csv file

In [135]:
cleaned_df = pd.read_csv("C:\\Users\\METHUNKUMAR\\Downloads\\cleaned_data.csv")
cleaned_df.info()
cleaned_df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Duration  30 non-null     int64  
 1   Date      30 non-null     object 
 2   Pulse     30 non-null     float64
 3   Maxpulse  30 non-null     int64  
 4   Calories  30 non-null     float64
dtypes: float64(2), int64(2), object(1)
memory usage: 1.3+ KB


Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
0,60,'2020/12/01',110.0,130,409.1
1,60,'2020/12/02',117.0,145,479.0
2,60,'2020/12/03',103.0,135,340.0
3,45,'2020/12/04',109.0,175,282.4
4,45,'2020/12/05',117.0,148,406.0
5,60,'2020/12/06',102.0,127,300.0
6,60,'2020/12/07',44.22,136,374.0
7,30,'2020/12/09',109.0,133,195.1
8,60,'2020/12/10',98.0,124,269.0
9,60,'2020/12/11',103.0,147,329.3
