In [10]:
# Step 1: Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Step 2: Simulated dataset with quality issues
data = {
    'Age': [25, np.nan, 45, 35, 29, None, 40, 22],
    'Income': [50000, 60000, None, 45000, 52000, 58000, None, 48000],
    'Purchased': [1, 0, 1, 0, 1, 0, 1, 0]
}
df_raw = pd.DataFrame(data)

# Step 3: Check for missing values
print("Initial Missing Values:\n", df_raw.isnull().sum())

# Step 4: Split into features and target
X_raw = df_raw.drop('Purchased', axis=1)
y = df_raw['Purchased']

# Step 5: Split raw data into train and test
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X_raw, y, test_size=0.3, random_state=42)

# Step 6: Train baseline model on raw data (with dropped missing values)
X_train_raw_clean = X_train_raw.dropna()
y_train_clean = y_train[X_train_raw_clean.index]
X_test_raw_clean = X_test_raw.dropna()
y_test_clean = y_test[X_test_raw_clean.index]

model_raw = LogisticRegression()
if len(X_train_raw_clean) > 0 and len(X_test_raw_clean) > 0:
    model_raw.fit(X_train_raw_clean, y_train_clean)
    y_pred_raw = model_raw.predict(X_test_raw_clean)
    print("\nBaseline Model Accuracy (with raw/incomplete data):", accuracy_score(y_test_clean, y_pred_raw))
else:
    print("\nNot enough data to train baseline model due to missing values.")

# Step 7: Data Cleaning – Imputation & Scaling
imputer = SimpleImputer(strategy='mean')
scaler = StandardScaler()

X_imputed = imputer.fit_transform(X_raw)
X_scaled = scaler.fit_transform(X_imputed)

# Step 8: Train/test split on cleaned data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

# Step 9: Train model on clean data
model_clean = LogisticRegression()
model_clean.fit(X_train, y_train)
y_pred_clean = model_clean.predict(X_test)

# Step 10: Evaluate model
accuracy_clean = accuracy_score(y_test, y_pred_clean)
print("\nCleaned Data Model Accuracy:", accuracy_clean)
print("\nClassification Report:\n", classification_report(y_test, y_pred_clean))

# Step 11: Final Summary
print("\nSummary:")
print(f"- Baseline Model Accuracy: {round(accuracy_score(y_test_clean, y_pred_raw)*100, 2) if len(X_test_raw_clean) > 0 else 'N/A'}%")
print(f"- Cleaned Data Model Accuracy: {round(accuracy_clean*100, 2)}%")

Initial Missing Values:
 Age          2
Income       2
Purchased    0
dtype: int64

Baseline Model Accuracy (with raw/incomplete data): 1.0

Cleaned Data Model Accuracy: 0.0

Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00       2.0
           1       0.00      0.00      0.00       1.0

    accuracy                           0.00       3.0
   macro avg       0.00      0.00      0.00       3.0
weighted avg       0.00      0.00      0.00       3.0


Summary:
- Baseline Model Accuracy: 100.0%
- Cleaned Data Model Accuracy: 0.0%
