# 🧹 Data Cleaning Notebook
This notebook is designed to clean and prepare the Survey dataset for analysis.

## 1. Import Libraries

In [None]:
import numpy as np
import pandas as pd
from scipy.stats import zscore

## 2. Load Dataset

In [None]:
# Show every column when a frame is displayed (None lifts the column limit).
pd.set_option('display.max_columns', None)

# Read the raw survey file. Replace with your actual file path.
df = pd.read_csv('dirtydata.csv')

# Preview the first five rows.
df.head(5)

## 3. Inspect Dataset

In [None]:
# Column dtypes and non-null counts. info() prints its report directly.
df.info()

# Summary statistics. A cell renders only its last expression automatically,
# so this intermediate result has to be displayed explicitly or it is lost.
display(df.describe())

# Number of missing values per column (rendered as the cell's output).
df.isnull().sum()

## 4. Handle Missing Values

In [None]:
# Example: fill missing ages with the column median.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on df['age']: that in-place call operates on an
# intermediate Series, which pandas flags as chained assignment and which
# leaves df untouched when Copy-on-Write is enabled.
age_median = df['age'].median()
df['age'] = df['age'].fillna(age_median)

## 5. Fix Data Types

In [None]:
# Example: parse the join_date column into datetimes.
# errors='coerce' turns values that cannot be parsed into NaT instead of raising.
join_dates_parsed = pd.to_datetime(df['join_date'], errors='coerce')
df['join_date'] = join_dates_parsed

## 6. Standardize Text & Categories

In [None]:
# Example: normalise the gender labels.
# Step 1: trim surrounding whitespace and lower-case every entry.
gender_normalised = df['gender'].str.strip().str.lower()
# Step 2: expand the single-letter codes; all other values pass through unchanged.
df['gender'] = gender_normalised.replace({'f': 'female', 'm': 'male'})

## 7. Remove Duplicates and Outliers

In [None]:
# Remove exact duplicate rows (reassigned instead of mutated in place).
df = df.drop_duplicates()

# Remove outliers: keep only rows whose numeric values all lie within
# Z_THRESHOLD standard deviations of their column mean.
Z_THRESHOLD = 3
numeric_values = df.select_dtypes(include=[np.number]).to_numpy(dtype=float, na_value=np.nan)

# nan_policy='omit' skips missing values when computing each column's mean
# and standard deviation. With the default ('propagate') a single NaN makes
# the whole column's z-scores NaN, and the filter below would then drop
# every row of the dataset.
with np.errstate(divide='ignore', invalid='ignore'):
    # A constant column has zero spread, so its z-scores are 0/0 = NaN;
    # silence the resulting floating-point warning and handle the NaN below.
    z_scores = np.abs(zscore(numeric_values, nan_policy='omit'))

# A NaN z-score (missing value or zero-spread column) is not evidence of an
# outlier, so it must not disqualify the row.
within_limit = (z_scores < Z_THRESHOLD) | np.isnan(z_scores)

# .copy() gives later cells an independent frame rather than a slice of the old one.
df = df[within_limit.all(axis=1)].copy()

## 8. Final Check & Export Cleaned Data

In [None]:
# Final look at dtypes and non-null counts before exporting.
df.info()

# Write the frame to CSV; index=False leaves the row index out of the file.
df.to_csv(path_or_buf='cleaned_survey_data.csv', index=False)