### AI/ML – Improving Model Performance with Clean Data

**Task 1**: Data Preprocessing for Models

**Objective**: Enhance data quality for better AI/ML outcomes.

**Steps**:
1. Choose a dataset for training an AI/ML model.
2. Identify common data issues like null values, redundant features, or noisydata.
3. Apply preprocessing methods such as imputation, normalization, or feature engineering.

In [1]:
# Write your code from here
import pandas as pd
import seaborn as sns

# Load Titanic dataset
df = sns.load_dataset('titanic')
print(df.head())
# 1. Null values
print("🔍 Missing values:\n", df.isnull().sum())

# 2. Data types and unique values
print("\n📊 Feature types and uniqueness:\n", df.dtypes)

# 3. Redundant columns (e.g., names, ticket numbers)
print("\n🧹 Redundant features likely to drop: ['deck', 'embark_town', 'alive', 'who', 'adult_male', 'class', 'embarked']")
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

# Target and features
X = df.drop(columns=['survived', 'deck', 'embark_town', 'alive', 'who', 'adult_male', 'class', 'embarked', 'ticket', 'cabin', 'name'])
y = df['survived']

# Split train-test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Numeric & Categorical features
numeric_features = ['age', 'fare', 'sibsp', 'parch']
categorical_features = ['sex', 'pclass']

# Pipelines for preprocessing
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop='first'))
])

# Combine transformers
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_pipeline, numeric_features),
    ('cat', categorical_pipeline, categorical_features)
])

# Model pipeline
clf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Fit model
clf_pipeline.fit(X_train, y_train)

# Score model
print("✅ Model accuracy:", clf_pipeline.score(X_test, y_test))


   survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0         0       3    male  22.0      1      0   7.2500        S  Third   
1         1       1  female  38.0      1      0  71.2833        C  First   
2         1       3  female  26.0      0      0   7.9250        S  Third   
3         1       1  female  35.0      1      0  53.1000        S  First   
4         0       3    male  35.0      0      0   8.0500        S  Third   

     who  adult_male deck  embark_town alive  alone  
0    man        True  NaN  Southampton    no  False  
1  woman       False    C    Cherbourg   yes  False  
2  woman       False  NaN  Southampton   yes   True  
3  woman       False    C  Southampton   yes  False  
4    man        True  NaN  Southampton    no   True  
🔍 Missing values:
 survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male

KeyError: "['ticket', 'cabin', 'name'] not found in axis"

**Task 2**: Evaluate Model Performance

**Objective**: Assess the impact of data quality improvements on model performance.

**Steps**:
1. Train a simple ML model with and without preprocessing.
2. Analyze and compare model performance metrics to evaluate the impact of data quality strategies.

In [2]:
# Write your code from here
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load dataset
df = sns.load_dataset('titanic')

# Target and features
X = df.drop(columns=['survived', 'deck', 'embark_town', 'alive', 'who', 'adult_male', 'class', 'embarked', 'ticket', 'cabin', 'name'])
y = df['survived']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ----------- Model 1: Without Preprocessing -------------

# Drop rows with any missing values to handle nulls (simple but lossy)
X_train_drop = X_train.dropna()
y_train_drop = y_train.loc[X_train_drop.index]

X_test_drop = X_test.dropna()
y_test_drop = y_test.loc[X_test_drop.index]

# Encode categoricals manually (simple mapping)
X_train_simple = X_train_drop.copy()
X_test_simple = X_test_drop.copy()

# Map 'sex' to 0/1
X_train_simple['sex'] = X_train_simple['sex'].map({'male': 0, 'female': 1})
X_test_simple['sex'] = X_test_simple['sex'].map({'male': 0, 'female': 1})

# Fill missing pclass if any (should be none here)
X_train_simple['pclass'] = X_train_simple['pclass'].fillna(3)
X_test_simple['pclass'] = X_test_simple['pclass'].fillna(3)

# Train model
model_simple = RandomForestClassifier(random_state=42)
model_simple.fit(X_train_simple, y_train_drop)

# Predict & Evaluate
y_pred_simple = model_simple.predict(X_test_simple)

# Metrics function
def print_metrics(y_true, y_pred, label):
    print(f"\n--- {label} ---")
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("Precision:", precision_score(y_true, y_pred))
    print("Recall:", recall_score(y_true, y_pred))
    print("F1 Score:", f1_score(y_true, y_pred))

print_metrics(y_test_drop, y_pred_simple, "Without Preprocessing")

# ----------- Model 2: With Preprocessing Pipeline -------------

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

numeric_features = ['age', 'fare', 'sibsp', 'parch']
categorical_features = ['sex', 'pclass']

numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(drop='first'))
])

preprocessor = ColumnTransformer([
    ('num', numeric_pipeline, numeric_features),
    ('cat', categorical_pipeline, categorical_features)
])

model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

model_pipeline.fit(X_train, y_train)
y_pred_pipeline = model_pipeline.predict(X_test)

print_metrics(y_test, y_pred_pipeline, "With Preprocessing Pipeline")



KeyError: "['ticket', 'cabin', 'name'] not found in axis"