# Import necessary libraries

In [None]:
import numpy as np
import pandas as pd

# 1. Load the Dataset

# Read the raw file into the working DataFrame `data`; fields in
# 'messy_data.csv' are separated by ';' rather than the default ','.
data = pd.read_csv('messy_data.csv', sep=';')

In [None]:
# (number of rows, number of columns) of the freshly loaded frame.
data.shape

Display the first few rows to understand the data

# Preview the first 10 rows of the raw data.
data.head(10)

#  2. Handle Missing Values

2.1 Identify missing values

In [None]:
# Number of missing entries in each column.
data.isnull().sum()

2.2 Impute missing values (replace with mean for numerical features)

# Fill missing ages with the mean of the Age column.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the selected Series: the chained in-place form
# raises a FutureWarning on pandas 2.x and does not update `data` at all once
# Copy-on-Write is enabled.
data['Age'] = data['Age'].fillna(data['Age'].mean())

In [None]:
# Re-count missing entries per column after the Age imputation.
print(data.isnull().sum()) 

In [None]:
# Verification step: per-column NaN counts after imputing Age.
# isna() is the same check as isnull(), so this repeats the count above.
print(data.isna().sum())

2.3 Drop rows with missing values (if appropriate)

# Drop every row that has a missing value in City or Text.
# inplace=True modifies `data` directly instead of returning a new frame.
data.dropna(subset=['City', 'Text'], inplace=True)

In [None]:
# Verification step: per-column NaN counts after dropping rows with a
# missing City or Text.
print(data.isna().sum())

# 3. Correct Data Types

In [None]:
# 3.1 Convert a column to datetime format.
# No explicit format is passed, so pandas infers it; with the default
# errors='raise', an unparseable entry raises instead of becoming NaT.
data['Date'] = pd.to_datetime(data['Date'])

In [None]:
# Preview the first 10 rows after the datetime conversion.
data.head(10)

In [None]:
# Print column dtypes and non-null counts to confirm Date's new dtype.
data.info()

3.2 Convert a column to numeric

# Convert Income to a numeric dtype; the IQR filter and scaling steps later
# in the notebook need it to be numeric.
# The original cell applied pd.to_numeric to Date, which overwrote the
# datetime values produced in step 3.1 with raw integers.
# With the default errors='raise', a value that cannot be parsed raises
# instead of silently turning into NaN.
data['Income'] = pd.to_numeric(data['Income'])

In [None]:
# Print column dtypes and non-null counts after the numeric conversion.
data.info()

In [None]:
# Preview the first rows (head() defaults to 5) after the type conversions.
data.head()

Use the `duplicated()` method to detect duplicates

# Boolean Series: True for each row that repeats an earlier row
# (with the default keep='first', the first occurrence is marked False).
duplicates = data.duplicated()
# Bare expression so the notebook displays the mask.
duplicates

In [None]:
# Pull out the rows flagged as repeats of an earlier row so they can be
# inspected before removal.
duplicate_rows = data.loc[data.duplicated()]
duplicate_rows

#  4. Remove Duplicates 

In [None]:
# Remove fully duplicated rows, keeping the first occurrence of each
# (the default keep='first'); inplace=True modifies `data` directly.
data.drop_duplicates(inplace=True)

In [None]:
# Preview the first 10 rows after removing duplicates.
data.head(10)

#  5. Clean and Transform Text Data

## 5.1 Convert text to lowercase

# Normalise case: lower-case every string in the Text column.
data['Text'] = data['Text'].str.lower()

In [None]:
# Preview the first 10 rows after lower-casing Text.
data.head(10)

In [None]:
# 5.2 Remove leading/trailing whitespace from every Text value
# (whitespace inside the string is left untouched).
data['Text'] = data['Text'].str.strip()

In [None]:
# Preview the first 10 rows after stripping whitespace from Text.
data.head(10)

In [None]:
# 5.3 Replace specific characters or strings.
# regex=False forces a literal substring replacement. The default value of
# `regex` differs between pandas versions, so it is stated explicitly to keep
# the behaviour the same everywhere.
data['Text'] = data['Text'].str.replace('this is fine', 'this is good', regex=False)

In [None]:
# Preview the first 10 rows after the text replacement.
data.head(10)

#  6. Outlier Detection and Handling 

In [None]:
# 6.1 Draw a box plot of Income to spot outliers visually
# (plotting requires matplotlib).
import matplotlib.pyplot as plt
plt.gca().boxplot(data['Income'])
plt.show()

In [None]:
# 6.2 Remove outliers with the 1.5 * IQR rule: keep only rows whose Income
# lies within [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR].
# Both quartiles are computed in a single quantile() call.
Q1, Q3 = data['Income'].quantile([0.25, 0.75])
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
# between() is inclusive on both ends, i.e. lower_bound <= Income <= upper_bound.
# .copy() makes the filtered frame independent of the original, so assigning
# to its columns afterwards does not raise SettingWithCopyWarning.
data = data[data['Income'].between(lower_bound, upper_bound)].copy()

In [None]:
# Redraw the Income box plot to check the effect of the outlier filter.
plt.gca().boxplot(data['Income'])
plt.show()

#  7. Data Transformation 


In [None]:
# 7.1 Scaling numerical features (using MinMaxScaler from scikit-learn)
from sklearn.preprocessing import MinMaxScaler
# With default settings MinMaxScaler maps the column minimum to 0 and the
# column maximum to 1.
scaler = MinMaxScaler()
# fit_transform expects 2-D input, hence the double brackets (a one-column
# DataFrame); the scaler is fitted on the same data it transforms.
data['Income'] = scaler.fit_transform(data[['Income']])

In [None]:
# Box plot of Income once more, now on the rescaled values.
plt.gca().boxplot(data['Income'])
plt.show()

# 8. Save the Cleaned Dataset

# Write the cleaned frame to 'cleaned_dataset.csv' without the index column.
# NOTE: the output uses to_csv's default ',' separator, unlike the
# ';'-separated input file.
data.to_csv('cleaned_dataset.csv', index=False)