## 4. POOR QUALITY DATA

### Definition
**Garbage In, Garbage Out (GIGO)**: a model trained on bad input data produces bad predictions.

### Why Data Quality Matters:
```
MIT Research:
- 82% of ML projects STALL due to DATA QUALITY issues
- Not because of bad algorithms, but bad data!

Alation Report:
- 87% of data quality errors impact business outcomes
```

### Types of Data Quality Issues:

#### 4.1 Missing Values


In [None]:
import pandas as pd
import numpy as np

# Create a small dataset in which every column has missing entries
data = pd.DataFrame({
    'age': [25, None, 35, 40, None, 28],
    'income': [50000, 60000, None, 80000, 70000, None],
    'credit_score': [720, 750, 680, None, 700, 760]
})

# Count the missing entries per column
print("Missing values:")
print(data.isnull().sum())

# Problems:
# - Missing values reduce training data
# - Some algorithms can't handle missing values
# - Impacts model accuracy

# Solutions:

# 1. Drop rows with missing values (simple, loses data)
#    Only rows with no missing entry in any column survive.
data_dropped = data.dropna()
print(f"\nAfter dropping: {len(data)} → {len(data_dropped)} rows")

# 2. Fill with the column mean (preserves rows, introduces bias)
#    data.mean() is computed per column and skips the missing entries.
data_filled_mean = data.fillna(data.mean())

# 3. Forward fill (for time series): copy the last seen value downwards.
#    DataFrame.ffill() replaces the deprecated fillna(method='ffill').
#    A missing value in the very first row would stay missing, because
#    there is nothing above it to copy.
data_filled_forward = data.ffill()

# 4. Predictive imputation: estimate each missing entry from the rows
#    that are closest on the other features (3 nearest neighbours here)
from sklearn.impute import KNNImputer

imputer = KNNImputer(n_neighbors=3)
imputed_values = imputer.fit_transform(data)

# Wrap the imputer output in a DataFrame again, reusing the column labels
data_imputed = pd.DataFrame(imputed_values, columns=data.columns)

# 5. Use algorithms that handle missing values
from xgboost import XGBClassifier

# Illustrative binary target with one label per row of `data`.
# The values are made up for this example; a supervised model cannot be
# fitted without a target, and none was defined before.
y = pd.Series([0, 1, 0, 1, 1, 0], name='approved')

model = XGBClassifier()  # Handles missing values natively
model.fit(data, y)       # `data` still contains its NaN entries here


#### 4.2 Outliers and Anomalies


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import IsolationForest

# Sample: 1000 draws from a normal distribution (mean 100, std 15)
# followed by three injected extreme values
normal_values = np.random.normal(100, 15, 1000)
data = np.append(normal_values, [500, 600, -100])

# The mean is pulled by the extremes far more than the median is
print(f"Mean: {data.mean():.1f}")
print(f"Median: {np.median(data):.1f}")

# Detect outliers
# Method 1: Z-score, flag points more than 3 standard deviations from the mean
from scipy import stats
z_scores = np.abs(stats.zscore(data))
beyond_three_sigma = z_scores > 3
outliers_zscore = data[beyond_three_sigma]

# Method 2: IQR (Interquartile Range), flag points outside the 1.5*IQR fences
Q1, Q3 = np.percentile(data, [25, 75])
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
outliers_iqr = data[(data < lower_fence) | (data > upper_fence)]

# Method 3: Isolation Forest
# `contamination` is the share of points the model will label as outliers,
# so 0.05 flags about 5% of the samples even though only 3 extreme values
# were injected above. Lower it when you expect fewer outliers.
# `random_state` makes the flagged set reproducible between runs.
iso_forest = IsolationForest(contamination=0.05, random_state=42)
outlier_predictions = iso_forest.fit_predict(data.reshape(-1, 1))  # -1 = outlier, 1 = inlier

# Values the forest labelled as outliers
outliers_iforest = data[outlier_predictions == -1]

# Handle outliers
# Option 1: Remove, keeping only points within 3 standard deviations
within_three_sigma = np.abs(stats.zscore(data)) < 3
data_no_outliers = data[within_three_sigma]

# Option 2: Cap, clamping everything to the 1st..99th percentile range
p01, p99 = np.percentile(data, [1, 99])
data_capped = np.clip(data, p01, p99)

# Option 3: Transform (log, sqrt)
# The log is only taken of strictly positive values; the rest are left out
positive_values = data[data > 0]
data_transformed = np.log(positive_values)


#### 4.3 Inconsistent Data


In [None]:
import pandas as pd

# Inconsistencies
data = pd.DataFrame({
    'name': ['John', 'JOHN', 'john ', 'Jane', 'jane'],  # Inconsistent case/spacing
    'gender': ['M', 'Male', 'male', 'F', 'female'],  # Different formats
    'age': [25, 25.0, '25', 26, '26']  # Different types
})

print("Before cleaning:")
print(data)

# Use a mapping dictionary keyed on the *normalized* spelling (stripped and
# lower-cased), so one entry covers 'M', 'm', ' m ' and so on.
# Series.map() turns any value without a key into NaN, so the mapping is
# applied exactly once, straight after normalization. Mapping a second time
# over the already-cleaned 'male'/'female' values with a dict that lacks
# those keys would wipe the whole column.
gender_mapping = {
    'm': 'male', 'male': 'male',
    'f': 'female', 'female': 'female',
}

# Clean inconsistencies
data['name'] = data['name'].str.strip().str.lower()
data['gender'] = data['gender'].str.strip().str.lower().map(gender_mapping)
data['age'] = pd.to_numeric(data['age'])  # mixed int/float/str -> numeric column

print("\nAfter cleaning:")
print(data)


#### 4.4 Duplicate Records


In [None]:
import pandas as pd

# Customers 1 and 2 each appear twice in this sample
data = pd.DataFrame({
    'customer_id': [1, 2, 3, 2, 1, 4],
    'name': ['Alice', 'Bob', 'Charlie', 'Bob', 'Alice', 'David'],
})

# duplicated() marks every repeat of a row that was already seen
is_repeat = data.duplicated()

print(f"Original size: {data.shape[0]}")
print("\nDuplicate rows:")
print(data.loc[is_repeat])

# Remove fully identical rows, keeping the first occurrence of each
data_unique = data.loc[~is_repeat]
print(f"\nAfter removing duplicates: {data_unique.shape[0]}")

# De-duplicate on the id column only, keeping the first or last occurrence
repeat_after_first = data.duplicated(subset=['customer_id'], keep='first')
repeat_before_last = data.duplicated(subset=['customer_id'], keep='last')
data_first = data.loc[~repeat_after_first]
data_last = data.loc[~repeat_before_last]

# Collapse duplicates into one row per customer instead of dropping them;
# the result is indexed by customer_id
data_aggregated = data.groupby('customer_id').agg(name=('name', 'first'))


#### 4.5 Data Validation


In [None]:
def validate_data(data, z_threshold=3):
    """Print a data quality report for a pandas DataFrame.

    The report covers missing values, fully duplicated rows, column dtypes,
    z-score outliers in numeric columns and a statistical summary.

    Args:
        data: The DataFrame to inspect. It is not modified.
        z_threshold: A numeric value is counted as an outlier when its
            absolute z-score exceeds this number. Defaults to 3.

    Returns:
        None. The report is written to standard output.
    """
    # Imported locally so the function works regardless of which notebook
    # cells were run before it.
    import numpy as np
    from scipy import stats

    print("=== DATA QUALITY REPORT ===\n")

    # Check 1: Missing values (only columns with at least one are listed)
    missing = data.isnull().sum()
    if missing.any():
        print("❌ Missing values:")
        print(missing[missing > 0])
        print()

    # Check 2: Duplicates (rows identical to an earlier row)
    duplicates = data.duplicated().sum()
    if duplicates > 0:
        print(f"❌ Duplicate rows: {duplicates}")
        print()

    # Check 3: Data types
    print("Data types:")
    print(data.dtypes)
    print()

    # Check 4: Outliers
    print(f"Outlier detection (Z-score > {z_threshold}):")
    for col in data.select_dtypes(include=[np.number]).columns:
        values = data[col].dropna()
        # A z-score needs a non-zero spread: with fewer than two values, or
        # with a constant column, it is undefined and cannot flag anything.
        if len(values) < 2 or values.nunique() < 2:
            continue
        outliers = (np.abs(stats.zscore(values)) > z_threshold).sum()
        if outliers > 0:
            print(f"  {col}: {outliers} outliers")

    # Check 5: Statistical summary
    print("\nStatistical summary:")
    print(data.describe())

# Usage
# Small example frame: one missing value in each column, and a last row
# that repeats the first one, so the missing-value and duplicate checks
# both have something to report.
import pandas as pd

dataset = pd.DataFrame({
    'age': [25, None, 35, 40, 25],
    'income': [50000, 60000, None, 80000, 50000],
})
validate_data(dataset)


### Impact of Poor Data Quality:


In [None]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Ground truth: y = 2*x plus Gaussian noise (std 1), seeded for reproducibility
np.random.seed(42)
X = np.linspace(0, 10, 100)[:, np.newaxis]  # one feature, 100 samples
noise = np.random.normal(0, 1, 100)
y_clean = 2 * X[:, 0] + noise

# Corrupt a copy of the target to simulate data quality issues
y_dirty = y_clean.copy()
y_dirty[[0, 50]] = [1000, -1000]  # two extreme outliers
y_dirty[10:15] = np.nan           # five missing target values

# True where the corrupted target is present, used to drop the NaN rows
mask = np.logical_not(np.isnan(y_dirty))

# Train on clean data
model_clean = LinearRegression()
model_clean.fit(X, y_clean)
pred_clean = model_clean.predict(X)
mse_clean = mean_squared_error(y_clean, pred_clean)

# Train on dirty data (rows whose target is NaN are dropped via `mask`)
model_dirty = LinearRegression()
model_dirty.fit(X[mask], y_dirty[mask])

# Score the dirty model on every point against the clean target, exactly
# as the clean model is scored, so both MSE values cover the same 100
# samples. Predicting on X[mask] and then indexing the result with `mask`
# again fails: the prediction has one entry per kept row, while `mask`
# has one entry per original row.
pred_dirty = model_dirty.predict(X)
mse_dirty = mean_squared_error(y_clean, pred_dirty)

print(f"Clean data MSE: {mse_clean:.2f}")
print(f"Dirty data MSE: {mse_dirty:.2f}")
print(f"Performance degradation: {(mse_dirty/mse_clean - 1)*100:.0f}%")
# Dirty data causes massive performance loss!


---
