# Day 5: Data Cleaning

This notebook covers outlier detection and handling, feature scaling techniques, and log transformation of skewed data.

## 1. Setting up the Environment

In [1]:
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler    
import matplotlib.pyplot as plt
import seaborn as sns

# Seed NumPy's global random generator so the np.random.* calls in the cells
# below produce the same values on every fresh top-to-bottom run.
np.random.seed(42)

## 2. Creating Sample Dataset with Outliers

In [None]:
# Synthetic sample: 1000 draws from N(100, 15) plus 50 uniform draws in
# [200, 300) appended at the end to act as outliers.
normal_data = np.random.normal(loc=100, scale=15, size=1000)
outliers = np.random.uniform(low=200, high=300, size=50)
data = np.concatenate((normal_data, outliers))

# Single-column frame used by the detection and handling cells.
df = pd.DataFrame({'values': data})

# Box plot of the raw column, drawn on an explicit Axes object.
fig, ax = plt.subplots(figsize=(10, 5))
ax.boxplot(df['values'])
ax.set_title('Distribution of Values with Outliers')
plt.show()

## 3. Outlier Detection Methods

In [None]:
def detect_outliers_iqr(data, multiplier=1.5):
    """Return the values that fall outside the IQR fences.

    The fences are placed ``multiplier`` interquartile ranges below the first
    quartile and above the third quartile.

    Parameters
    ----------
    data : pandas.Series
        Numeric values to inspect (anything exposing ``.quantile`` and
        boolean-mask indexing, as a Series does).
    multiplier : float, default 1.5
        Width of the fences in units of the IQR. The default reproduces the
        conventional 1.5 * IQR rule; larger values flag fewer points.

    Returns
    -------
    tuple
        ``(outliers, lower_bound, upper_bound)`` where ``outliers`` is the
        subset of ``data`` strictly below ``lower_bound`` or strictly above
        ``upper_bound``.
    """
    q1 = data.quantile(0.25)
    q3 = data.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - multiplier * iqr
    upper_bound = q3 + multiplier * iqr
    # Strict comparisons: values exactly on a fence are not outliers.
    is_outlier = (data < lower_bound) | (data > upper_bound)
    return data[is_outlier], lower_bound, upper_bound

def detect_outliers_zscore(data, threshold=3):
    """Return the values whose absolute z-score exceeds ``threshold``.

    Parameters
    ----------
    data : pandas.Series
        Numeric values to inspect.
    threshold : float, default 3
        Absolute z-score above which a value is reported as an outlier.

    Returns
    -------
    pandas.Series
        The subset of ``data`` with ``|z| > threshold``. Missing values are
        never reported as outliers.
    """
    # With the default nan_policy a single NaN makes every z-score NaN, so the
    # function would silently report zero outliers. 'omit' computes the mean
    # and standard deviation from the non-missing values only.
    z_scores = np.abs(stats.zscore(data, nan_policy='omit'))
    # NaN z-scores compare False here, so missing entries are not flagged.
    return data[z_scores > threshold]

# IQR Method: collect the report lines first, then emit them in one call.
outliers_iqr, lb, ub = detect_outliers_iqr(df['values'])
iqr_report = [
    "IQR Method:",
    f"Number of outliers: {len(outliers_iqr)}",
    f"Lower bound: {lb:.2f}",
    f"Upper bound: {ub:.2f}",
]
print("\n".join(iqr_report))

# Z-score Method: the leading newline separates it from the IQR report.
outliers_zscore = detect_outliers_zscore(df['values'])
print(
    "\nZ-score Method:",
    f"Number of outliers: {len(outliers_zscore)}",
    sep="\n",
)

## 4. Handling Outliers

In [None]:
def handle_outliers(data, method='clip', iqr_multiplier=1.5, z_threshold=3):
    """Reduce the influence of outliers by clipping or by removing them.

    Parameters
    ----------
    data : pandas.Series
        Numeric values to process.
    method : {'clip', 'remove'}, default 'clip'
        ``'clip'`` limits values to the IQR fences
        (``Q1 - iqr_multiplier * IQR`` and ``Q3 + iqr_multiplier * IQR``) and
        returns a Series of the same length.
        ``'remove'`` drops values whose absolute z-score exceeds
        ``z_threshold`` and returns a shorter Series that keeps the original
        index labels.
    iqr_multiplier : float, default 1.5
        Fence width, in IQR units, used by ``'clip'``.
    z_threshold : float, default 3
        Absolute z-score cut-off used by ``'remove'``.

    Returns
    -------
    pandas.Series
        The clipped or filtered values.

    Raises
    ------
    ValueError
        If ``method`` is neither ``'clip'`` nor ``'remove'``.
    """
    if method == 'clip':
        # Winsorization against the IQR fences.
        q1 = data.quantile(0.25)
        q3 = data.quantile(0.75)
        iqr = q3 - q1
        return data.clip(
            lower=q1 - iqr_multiplier * iqr,
            upper=q3 + iqr_multiplier * iqr,
        )

    if method == 'remove':
        # 'omit' keeps one NaN from turning every z-score into NaN.
        z_scores = np.abs(stats.zscore(data, nan_policy='omit'))
        # Drop only rows that are provably beyond the threshold. Writing this
        # as ~(z > t) instead of (z <= t) keeps rows whose z-score is
        # undefined (NaN); with (z <= t) those rows, or the whole series when
        # every z-score is NaN, would be discarded.
        return data[~(z_scores > z_threshold)]

    raise ValueError("Method must be 'clip' or 'remove'")

# Apply both methods to the same source column.
df['values_clipped'] = handle_outliers(df['values'], method='clip')
# 'remove' returns fewer rows than df has; the assignment aligns on the index,
# so the rows that were dropped hold NaN in this column.
df['values_removed'] = handle_outliers(df['values'], method='remove')

# Plot results side by side. The removed column is passed through dropna():
# a box plot computed on data containing NaN has NaN statistics, which would
# leave the third panel empty.
panels = [
    ('Original Data', df['values']),
    ('After Winsorization', df['values_clipped']),
    ('After Removing Outliers', df['values_removed'].dropna()),
]

fig, axes = plt.subplots(1, 3, figsize=(15, 5))
for ax, (title, series) in zip(axes, panels):
    ax.boxplot(series)
    ax.set_title(title)
plt.tight_layout()
plt.show()

## 5. Data Scaling Methods

In [None]:
# Three features on different scales and with different shapes:
# a normal, an exponential and a uniform column.
n_samples = 1000
data = dict(
    feature1=np.random.normal(loc=100, scale=15, size=n_samples),
    feature2=np.random.exponential(scale=50, size=n_samples),
    feature3=np.random.uniform(low=0, high=1000, size=n_samples),
)
df_scaling = pd.DataFrame(data)

# One scaler instance per method, keyed by a short label.
scalers = dict(
    standard=StandardScaler(),
    minmax=MinMaxScaler(),
    robust=RobustScaler(),
)

# Fit each scaler on the full frame and wrap the resulting array back into a
# DataFrame that carries the original column names.
scaled_dfs = {
    name: pd.DataFrame(
        scaler.fit_transform(df_scaling),
        columns=df_scaling.columns,
    )
    for name, scaler in scalers.items()
}

# 2x2 grid: the unscaled frame in the top-left panel, then one panel per
# scaler filled in row-major order.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle('Distribution of Features with Different Scaling Methods')

panel_axes = axes.ravel()

# Original data
original_ax = panel_axes[0]
sns.boxplot(data=df_scaling, ax=original_ax)
original_ax.set_title('Original Data')

# Scaled data: the remaining three panels, matched to the scalers in order.
for ax, (name, scaled_df) in zip(panel_axes[1:], scaled_dfs.items()):
    sns.boxplot(data=scaled_df, ax=ax)
    ax.set_title(f'{name.capitalize()} Scaling')

plt.tight_layout()
plt.show()

# Descriptive statistics for each scaled frame, rounded to two decimals.
print("\nSummary Statistics:")
for name, scaled_df in scaled_dfs.items():
    summary = scaled_df.describe().round(2)
    print(f"\n{name.capitalize()} Scaling:", summary, sep="\n")

## 6. Log Transformation

In [None]:
# Right-skewed sample: 1000 draws from a lognormal distribution whose
# underlying normal has mean 0 and sigma 1.
skewed_data = np.random.lognormal(mean=0, sigma=1, size=1000)

# np.log1p evaluates log(1 + x) element-wise.
log_transformed = np.log1p(skewed_data)

# Side-by-side histograms: raw sample on the left, log-transformed on the right.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

histogram_panels = (
    (skewed_data, 'Original Skewed Data'),
    (log_transformed, 'Log-transformed Data'),
)
for ax, (values, title) in zip(axes, histogram_panels):
    sns.histplot(values, ax=ax)
    ax.set_title(title)

plt.tight_layout()
plt.show()

# Descriptive statistics before and after the transformation, two decimals.
print("\nSummary Statistics:")
for heading, values in (
    ("\nOriginal Data:", skewed_data),
    ("\nLog-transformed Data:", log_transformed),
):
    print(heading)
    print(pd.Series(values).describe().round(2))