In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Sample dataset with issues: missing values, duplicates, inconsistent data, outliers
data = {
    'ID': [1, 2, 3, 4, 5, 5, 6, 7, 8, 9, 10],
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva', 'Eva', 'Frank', 'Grace', 'Hannah', 'Ian', 'Jack'],
    'Age': [25, np.nan, 30, 22, 29, 29, 120, 27, np.nan, 31, 28],
    'Gender': ['F', 'M', 'male', 'Male', 'female', 'Female', 'M', 'F', 'F', 'Male', 'm'],
    'Join_Date': ['2022-01-10', '2021-05-12', '2022-03-15', '2022-02-20', np.nan, '2020-07-07', '2019-10-10', '2021-12-01', '2022-04-04', '2022-01-01', '2022-06-06']
}

df = pd.DataFrame(data)
df


Unnamed: 0,ID,Name,Age,Gender,Join_Date
0,1,Alice,25.0,F,2022-01-10
1,2,Bob,,M,2021-05-12
2,3,Charlie,30.0,male,2022-03-15
3,4,David,22.0,Male,2022-02-20
4,5,Eva,29.0,female,
5,5,Eva,29.0,Female,2020-07-07
6,6,Frank,120.0,M,2019-10-10
7,7,Grace,27.0,F,2021-12-01
8,8,Hannah,,F,2022-04-04
9,9,Ian,31.0,Male,2022-01-01


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11 entries, 0 to 10
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   ID         11 non-null     int64  
 1   Name       11 non-null     object 
 2   Age        9 non-null      float64
 3   Gender     11 non-null     object 
 4   Join_Date  10 non-null     object 
dtypes: float64(1), int64(1), object(3)
memory usage: 572.0+ bytes


In [4]:
df['Age'].fillna(df['Age'].mean(), inplace=True)
df

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].mean(), inplace=True)


Unnamed: 0,ID,Name,Age,Gender,Join_Date
0,1,Alice,25.0,F,2022-01-10
1,2,Bob,37.888889,M,2021-05-12
2,3,Charlie,30.0,male,2022-03-15
3,4,David,22.0,Male,2022-02-20
4,5,Eva,29.0,female,
5,5,Eva,29.0,Female,2020-07-07
6,6,Frank,120.0,M,2019-10-10
7,7,Grace,27.0,F,2021-12-01
8,8,Hannah,37.888889,F,2022-04-04
9,9,Ian,31.0,Male,2022-01-01


In [6]:
df['Join_Date'].fillna(df['Join_Date'].mode()[0], inplace=True)
df

Unnamed: 0,ID,Name,Age,Gender,Join_Date
0,1,Alice,25.0,F,2022-01-10
1,2,Bob,37.888889,M,2021-05-12
2,3,Charlie,30.0,male,2022-03-15
3,4,David,22.0,Male,2022-02-20
4,5,Eva,29.0,female,2019-10-10
5,5,Eva,29.0,Female,2020-07-07
6,6,Frank,120.0,M,2019-10-10
7,7,Grace,27.0,F,2021-12-01
8,8,Hannah,37.888889,F,2022-04-04
9,9,Ian,31.0,Male,2022-01-01


In [7]:
df.duplicated()

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
dtype: bool

In [9]:
print(df.duplicated().sum())

0


In [10]:
print(df[df.duplicated()])

Empty DataFrame
Columns: [ID, Name, Age, Gender, Join_Date]
Index: []


In [11]:
df.dtypes

ID             int64
Name          object
Age          float64
Gender        object
Join_Date     object
dtype: object

In [12]:
df['Age'] = df['Age'].astype(int)

In [13]:
df.dtypes

ID            int64
Name         object
Age           int64
Gender       object
Join_Date    object
dtype: object

In [16]:
df['Join_Date'] = pd.to_datetime(df['Join_Date'])
df.dtypes

ID                    int64
Name                 object
Age                   int64
Gender               object
Join_Date    datetime64[ns]
dtype: object

In [18]:
df['Gender']=df['Gender'].str.lower().replace({'m':'male', 'f':'female'})
df

Unnamed: 0,ID,Name,Age,Gender,Join_Date
0,1,Alice,25,female,2022-01-10
1,2,Bob,37,male,2021-05-12
2,3,Charlie,30,male,2022-03-15
3,4,David,22,male,2022-02-20
4,5,Eva,29,female,2019-10-10
5,5,Eva,29,female,2020-07-07
6,6,Frank,120,male,2019-10-10
7,7,Grace,27,female,2021-12-01
8,8,Hannah,37,female,2022-04-04
9,9,Ian,31,male,2022-01-01


In [19]:
df.describe()

Unnamed: 0,ID,Age,Join_Date
count,11.0,11.0,11
mean,5.454545,37.727273,2021-06-28 04:21:49.090909184
min,1.0,22.0,2019-10-10 00:00:00
25%,3.5,27.5,2020-12-08 12:00:00
50%,5.0,29.0,2022-01-01 00:00:00
75%,7.5,34.0,2022-03-03 12:00:00
max,10.0,120.0,2022-06-06 00:00:00
std,2.876235,27.65173,


In [21]:
df = df[df['Age'] <= 100]
df

Unnamed: 0,ID,Name,Age,Gender,Join_Date
0,1,Alice,25,female,2022-01-10
1,2,Bob,37,male,2021-05-12
2,3,Charlie,30,male,2022-03-15
3,4,David,22,male,2022-02-20
4,5,Eva,29,female,2019-10-10
5,5,Eva,29,female,2020-07-07
7,7,Grace,27,female,2021-12-01
8,8,Hannah,37,female,2022-04-04
9,9,Ian,31,male,2022-01-01
10,10,Jack,28,male,2022-06-06


In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10 entries, 0 to 10
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   ID         10 non-null     int64         
 1   Name       10 non-null     object        
 2   Age        10 non-null     int64         
 3   Gender     10 non-null     object        
 4   Join_Date  10 non-null     datetime64[ns]
dtypes: datetime64[ns](1), int64(2), object(2)
memory usage: 480.0+ bytes


In [25]:
file_path = r"C:\Users\PC\Documents\Projects\ML\Deta set clean"
df.to_csv(file_path, index=False)

PermissionError: [Errno 13] Permission denied: 'C:\\Users\\PC\\Documents\\Projects\\ML\\Deta set clean'

In [26]:
print(f"CSV file saved at: {file_path}")

CSV file saved at: C:\Users\PC\Documents\Projects\ML\Deta set clean
