In [1]:
# Question 5: Label Encoding vs One-Hot Encoding
# Task: Show the difference between Label Encoding and One-Hot Encoding on the Titanic dataset for the 'Sex' feature.


import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Load Titanic dataset
df = pd.read_csv("https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv")

# Label Encoding
label_encoder = LabelEncoder()
df['Sex_label_encoded'] = label_encoder.fit_transform(df['Sex'])

# One-Hot Encoding
onehot_encoded = pd.get_dummies(df['Sex'], prefix='Sex')

# Combine for comparison
df_encoded = pd.concat([df[['Sex']], df['Sex_label_encoded'], onehot_encoded], axis=1)
print(df_encoded.head())


# Question 6: Combining Feature Scaling Techniques
# Task: Demonstrate combining Min-Max Scaling and Standardization for the same datasetand explain the results.

from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Select numeric columns
numeric_features = df.select_dtypes(include=['float64', 'int64']).drop(columns=['PassengerId']).dropna()

# Apply Min-Max Scaling
minmax_scaled = MinMaxScaler().fit_transform(numeric_features)

# Apply Standardization on the Min-Max scaled data
standardized = StandardScaler().fit_transform(minmax_scaled)

# Convert to DataFrame
combined_scaled_df = pd.DataFrame(standardized, columns=numeric_features.columns)
print(combined_scaled_df.head())




# Question 7: Handling Multiple Categorical Features
# Task: Handle multiple categorical features ('Sex', 'Embarked') from the Titanic dataset using One-Hot Encoding.
df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)

# One-Hot Encode 'Sex' and 'Embarked'
df_onehot = pd.get_dummies(df[['Sex', 'Embarked']], drop_first=True)
print(df_onehot.head())




# Question 8: Ordinal Encoding for Ranked Categories
# Task: Ordinal encode 'Pclass' (Passenger class) from the Titanic dataset considering passenger class as a ranked feature.
from sklearn.preprocessing import OrdinalEncoder

# Assume: 3rd class < 2nd class < 1st class
ordinal_encoder = OrdinalEncoder(categories=[[3, 2, 1]])
df['Pclass_encoded'] = ordinal_encoder.fit_transform(df[['Pclass']])

print(df[['Pclass', 'Pclass_encoded']].head())





# Question 9: Impact of Scaling on Different Algorithms
# Task: Investigate the impact of different scaling techniques on a decision tree model and compare it with a SVM.
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Clean dataset for modeling
df_model = df[['Sex', 'Pclass', 'Age', 'Fare', 'Survived']].dropna()
df_model['Sex'] = LabelEncoder().fit_transform(df_model['Sex'])

X = df_model[['Sex', 'Pclass', 'Age', 'Fare']]
y = df_model['Survived']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Scaling
X_train_std = StandardScaler().fit_transform(X_train)
X_test_std = StandardScaler().transform(X_test)

# Train models
tree = DecisionTreeClassifier().fit(X_train, y_train)
svm = SVC().fit(X_train_std, y_train)

# Evaluate
print("Decision Tree Accuracy (Unscaled):", accuracy_score(y_test, tree.predict(X_test)))
print("SVM Accuracy (Standardized):", accuracy_score(y_test, svm.predict(X_test_std)))



# Question 10: Custom Transformations for Categorical Features
# Task: Implement a custom transformation function for encoding high cardinality categorical features efficiently.
def frequency_encode(column):
    freq = column.value_counts(normalize=True)
    return column.map(freq)

# Simulate a high-cardinality feature
df['Cabin'] = df['Cabin'].fillna('Unknown')  # Cabin has many unique values
df['Cabin_encoded'] = frequency_encode(df['Cabin'])

print(df[['Cabin', 'Cabin_encoded']].head())






      Sex  Sex_label_encoded  Sex_female  Sex_male
0    male                  1       False      True
1  female                  0        True     False
2  female                  0        True     False
3  female                  0        True     False
4    male                  1       False      True
   Survived    Pclass       Age     SibSp     Parch      Fare  \
0  -0.82702  0.911232 -0.530377  0.524570 -0.505895 -0.518978   
1   1.20916 -1.476364  0.571831  0.524570 -0.505895  0.691897   
2   1.20916  0.911232 -0.254825 -0.551703 -0.505895 -0.506214   
3   1.20916 -1.476364  0.365167  0.524570 -0.505895  0.348049   
4  -0.82702  0.911232  0.365167 -0.551703 -0.505895 -0.503850   

   Sex_label_encoded  
0           0.759051  
1          -1.317434  
2          -1.317434  
3          -1.317434  
4           0.759051  
   Sex_male  Embarked_Q  Embarked_S
0      True       False        True
1     False       False       False
2     False       False        True
3     False       Fal

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)


ValueError: Unsorted categories are not supported for numerical categories