In [2]:
!pip install category_encoders
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from category_encoders import TargetEncoder

# Load Titanic dataset
# `url` stays a notebook-level name because a later section reads the same
# CSV again.
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"

# Keep only the columns used by the exercises below, then discard every row
# that is missing a value in any of them.
relevant_columns = ['Sex', 'Embarked', 'Pclass', 'Fare', 'Age', 'Survived']
df = (
    pd.read_csv(url)
    .loc[:, relevant_columns]
    .dropna()
)

# Question 5: Label vs One-Hot Encoding for 'Sex'
# Label encoding: fit the encoder on 'Sex', then store the integer code for
# each row in a new 'Sex_label' column next to the original text column.
le = LabelEncoder()
le.fit(df['Sex'])
df['Sex_label'] = le.transform(df['Sex'])

# One-hot encoding: one 'Sex_<value>' indicator column per distinct value.
ohe = pd.get_dummies(df['Sex'], prefix='Sex')

# Copy of `df` with the indicator columns appended on the right; `ohe` was
# derived from `df`, so both share the same row index.
df_ohe = df.join(ohe)

# Question 6: Combine Min-Max and Standard Scaling
# Two chained stages over 'Fare' and 'Age':
#   1. MinMaxScaler on the raw columns,
#   2. StandardScaler on the output of stage 1.
# NOTE(review): both scalers are fitted on every row of `df`, i.e. before any
# train/test split happens further down.
numeric_features = df[['Fare', 'Age']]

scaler_minmax = MinMaxScaler()
minmax_scaled = scaler_minmax.fit(numeric_features).transform(numeric_features)

scaler_std = StandardScaler()
std_scaled = scaler_std.fit(minmax_scaled).transform(minmax_scaled)

# Column 0 of the result corresponds to 'Fare', column 1 to 'Age' (same order
# as the selection above). The raw columns are left in place.
df['Fare_scaled'] = std_scaled[:, 0]
df['Age_scaled'] = std_scaled[:, 1]

# Question 7: One-Hot Encoding for multiple categorical features
# 'Sex' and 'Embarked' are replaced by indicator columns; every other column
# of `df` is carried over unchanged into `df_encoded`.
categorical_columns = ['Sex', 'Embarked']
df_encoded = pd.get_dummies(df, columns=categorical_columns)

# Question 8: Ordinal encode 'Pclass'
# The encoder takes a 2-D input and returns one output column per input
# column, so the single result column is flattened before being stored.
ord_encoder = OrdinalEncoder()
pclass_codes = ord_encoder.fit_transform(df[['Pclass']])
df_encoded['Pclass_encoded'] = np.ravel(pclass_codes)

# Question 9: Impact of Scaling on Decision Tree vs SVM
# Feature matrix for the comparison.
#
# The raw 'Fare' and 'Age' columns are kept and their pre-scaled copies
# ('Fare_scaled', 'Age_scaled') are dropped. Those copies were produced by
# scalers fitted on the whole dataset, so using them would (a) make the
# "unscaled" decision-tree baseline actually run on standardized inputs and
# (b) let statistics computed from the future test rows reach the models.
#
# Also dropped: the target itself, 'Sex_label' (the same information is already
# present as one-hot 'Sex_*' columns) and 'Pclass' (represented by
# 'Pclass_encoded').
X = df_encoded.drop(columns=['Survived', 'Sex_label', 'Pclass', 'Fare_scaled', 'Age_scaled'])
y = df_encoded['Survived']

# Hold out 30% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Decision Tree (insensitive to scaling)
# The tree is trained directly on the split made above, with no extra scaling
# step. `random_state` is pinned because scikit-learn shuffles the candidate
# features at every split; without a seed, ties between equally good splits
# can be broken differently on each run and the reported accuracy drifts.
tree = DecisionTreeClassifier(random_state=0)
tree.fit(X_train, y_train)

# Accuracy on the held-out rows.
tree_predictions = tree.predict(X_test)
acc_tree = accuracy_score(y_test, tree_predictions)

# SVM (sensitive to scaling)
svm = SVC()
scaler = StandardScaler()

# Split BEFORE scaling. Same test_size and seed as the earlier split of
# (X, y), so the SVM is evaluated on the same held-out rows as the tree.
X_train_unscaled, X_test_unscaled, y_train_svm, y_test_svm = train_test_split(
    X, y, test_size=0.3, random_state=0
)

# Fit the scaler on the training rows only, then apply those training
# statistics to the test rows. Fitting on the full matrix would leak the test
# set's mean/variance into training and bias the accuracy estimate.
X_train_svm = scaler.fit_transform(X_train_unscaled)
X_test_svm = scaler.transform(X_test_unscaled)

# Whole matrix in scaled form, kept so the name `X_scaled` remains available;
# it is produced with the training-only statistics learned above.
X_scaled = scaler.transform(X)

svm.fit(X_train_svm, y_train_svm)
acc_svm = accuracy_score(y_test_svm, svm.predict(X_test_svm))

# Question 10: Custom transformation for high-cardinality categorical feature
# Simulate high cardinality with 'Cabin' feature
# The CSV is read again from `url` because 'Cabin' is not among the columns
# kept in `df`; rows lacking either 'Cabin' or 'Survived' are discarded.
cabin_columns = ['Cabin', 'Survived']
df_highcard = pd.read_csv(url).loc[:, cabin_columns].dropna()

# Encode 'Cabin' against the 'Survived' target with category_encoders'
# TargetEncoder and store the result in a new column.
# NOTE(review): the encoder is fitted, with the labels, on the same rows it
# encodes. That is fine for showing what the encoding looks like, but these
# values should not be fed to a model that is evaluated on the same rows.
encoder = TargetEncoder()
cabin_encoded = encoder.fit_transform(df_highcard['Cabin'], df_highcard['Survived'])
df_highcard['Cabin_encoded'] = cabin_encoded

# Output results
# Encoding demos: each heading is followed by its table on the next line.
sex_label_pairs = df[['Sex', 'Sex_label']].drop_duplicates()
print("Label Encoding for 'Sex':", sex_label_pairs, sep="\n")
print("\nOne-Hot Encoding for 'Sex':", ohe.head(), sep="\n")

# Model accuracies from Question 9, one per line.
accuracy_lines = (
    ("\nAccuracy - Decision Tree (unscaled):", acc_tree),
    ("Accuracy - SVM (scaled):", acc_svm),
)
for heading, accuracy in accuracy_lines:
    print(heading, accuracy)

# First rows of the encoded 'Cabin' column from Question 10.
cabin_preview = df_highcard[['Cabin', 'Cabin_encoded']].head()
print("\nSample of custom encoded high-cardinality feature:", cabin_preview, sep="\n")










Defaulting to user installation because normal site-packages is not writeable
Collecting category_encoders
  Downloading category_encoders-2.8.1-py3-none-any.whl (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.7/85.7 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting statsmodels>=0.9.0
  Downloading statsmodels-0.14.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m25.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting patsy>=0.5.1
  Downloading patsy-1.0.1-py2.py3-none-any.whl (232 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.9/232.9 kB[0m [31m34.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: patsy, statsmodels, category_encoders
Successfully installed category_encoders-2.8.1 patsy-1.0.1 statsmodels-0.14.4

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip 