In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

%matplotlib inline
# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8-*"
# and later releases dropped the old aliases, so use whichever name this
# installation actually ships instead of hard-coding one that may raise.
_whitegrid_style = next(
    (name for name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
     if name in plt.style.available),
    None,
)
if _whitegrid_style is not None:
    plt.style.use(_whitegrid_style)
# seaborn's own whitegrid theme is applied in every case
sns.set_style("whitegrid")

print("Libraries imported successfully!")


In [None]:
# Load the Wine dataset
from sklearn.datasets import load_wine

# Load data
wine = load_wine()
X, y = wine.data, wine.target
feature_names, target_names = wine.feature_names, wine.target_names

# Wrap the raw arrays in a labelled DataFrame and attach both the numeric
# class id and its human-readable name
class_lookup = dict(enumerate(target_names))
df = pd.DataFrame(X, columns=feature_names).assign(target=y)
df['target_name'] = df['target'].map(class_lookup)

# Display basic information
print("Wine dataset loaded successfully!")
print(f"Dataset shape: {df.shape}")
print(f"Features: {', '.join(feature_names)}")
print(f"Target classes: {', '.join(target_names)}")

# Preview the first rows, then summarise the numeric columns
for heading, table in (
    ("\nFirst 5 rows:", df.head()),
    ("\nBasic statistics:", df.describe()),
):
    print(heading)
    print(table)


In [None]:
# Visualize the feature distributions before scaling
plt.figure(figsize=(15, 10))
for position, feature in enumerate(feature_names[:6], start=1):  # Plot first 6 features
    plt.subplot(2, 3, position)
    sns.histplot(df[feature], kde=True)
    plt.title(f'Distribution of {feature}')
# The layout is computed once, after all panels exist, instead of once per panel
plt.suptitle('Original Feature Distributions', y=1.02, fontsize=16)
plt.tight_layout()
plt.show()

# Boxplot to show scale differences
df_features = df.drop(['target', 'target_name'], axis=1)
plt.figure(figsize=(14, 6))
plt.boxplot(df_features.values, vert=False)
# Name the boxes through the tick API rather than boxplot's `labels=` keyword,
# which newer Matplotlib releases deprecate in favour of `tick_labels=`;
# boxes are drawn at positions 1..n_features.
plt.yticks(range(1, df_features.shape[1] + 1), df_features.columns)
plt.title('Original Feature Scales')
plt.xlabel('Value')
plt.ylabel('Feature')
plt.grid(True, linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

# Apply different scaling methods
X_features = df.drop(['target', 'target_name'], axis=1)

# 1. StandardScaler (Z-score normalization)
scaler_standard = StandardScaler()
X_standard = scaler_standard.fit_transform(X_features)
df_standard = pd.DataFrame(X_standard, columns=X_features.columns)

# 2. MinMaxScaler (Min-Max scaling)
scaler_minmax = MinMaxScaler()
X_minmax = scaler_minmax.fit_transform(X_features)
df_minmax = pd.DataFrame(X_minmax, columns=X_features.columns)

# 3. RobustScaler (Robust to outliers)
scaler_robust = RobustScaler()
X_robust = scaler_robust.fit_transform(X_features)
df_robust = pd.DataFrame(X_robust, columns=X_features.columns)

# Compare the scaling methods with boxplots: one panel per (title, frame)
# pair, drawn by a single loop so all four panels are formatted identically
scaling_panels = [
    ('Original Data', X_features),
    ('StandardScaler (Z-score)', df_standard),
    ('MinMaxScaler (0-1 range)', df_minmax),
    ('RobustScaler (based on quantiles)', df_robust),
]

plt.figure(figsize=(15, 10))
for position, (panel_title, frame) in enumerate(scaling_panels, start=1):
    plt.subplot(2, 2, position)
    plt.boxplot(frame.values, vert=False)
    # Tick API instead of boxplot's `labels=` keyword, which newer Matplotlib
    # releases deprecate; boxes sit at positions 1..n_features.
    plt.yticks(range(1, frame.shape[1] + 1), frame.columns)
    plt.title(panel_title)
    plt.grid(True, linestyle='--', alpha=0.7)

plt.suptitle('Comparison of Scaling Methods', y=0.98, fontsize=16)
plt.tight_layout()
plt.show()


In [None]:
# Build a reproducible synthetic dataset mixing categorical and numeric columns
np.random.seed(42)
n_samples = 100

# Allowed levels for each categorical column
colors = ['red', 'blue', 'green', 'yellow']
sizes = ['small', 'medium', 'large']
countries = ['USA', 'Canada', 'UK', 'France', 'Germany']
ratings = ['low', 'medium', 'high']

# Draw the columns in a fixed order (four categorical draws, then price, then
# weight) so the seeded random stream is consumed the same way on every run
category_levels = {'color': colors, 'size': sizes, 'country': countries, 'rating': ratings}
sampled_columns = {
    column: np.random.choice(levels, n_samples)
    for column, levels in category_levels.items()
}
sampled_columns['price'] = np.random.normal(loc=50, scale=15, size=n_samples)
sampled_columns['weight'] = np.random.uniform(low=0.1, high=10.0, size=n_samples)
cat_data = pd.DataFrame(sampled_columns)

# Display the data
print("Sample categorical dataset:")
print(cat_data.head())

# Distribution of categorical variables: one horizontal count plot per column
count_panels = [
    ('color', 'Distribution of Colors'),
    ('size', 'Distribution of Sizes'),
    ('country', 'Distribution of Countries'),
    ('rating', 'Distribution of Ratings'),
]

plt.figure(figsize=(15, 10))
for slot, (column, panel_title) in enumerate(count_panels, start=1):
    plt.subplot(2, 2, slot)
    sns.countplot(y=cat_data[column])
    plt.title(panel_title)
    plt.grid(True, linestyle='--', alpha=0.7)

plt.tight_layout()
plt.show()


In [None]:
# 1. Label Encoding
# Fit a separate encoder for every categorical column and keep each one, so
# the fitted label set of a column stays available afterwards (for example to
# decode values) instead of being lost when one shared encoder is refit.
cat_data_label = cat_data.copy()
label_encoders = {}
for col in ['color', 'size', 'country', 'rating']:
    label_encoder = LabelEncoder()
    cat_data_label[f'{col}_encoded'] = label_encoder.fit_transform(cat_data_label[col])
    label_encoders[col] = label_encoder

print("Label Encoding Results:")
print(cat_data_label.head())

# Show mapping for each variable
# The mapping is read from the encoder that produced the column: the position
# of a label in `classes_` is the integer code assigned to it, so the lines
# are printed in code order (0, 1, 2, ...).
for col, fitted_encoder in label_encoders.items():
    print(f"\nLabel Encoding mapping for {col}:")
    for encoded, original in enumerate(fitted_encoder.classes_):
        print(f"  {original} -> {encoded}")

# 2. One-Hot Encoding
# Apply one-hot encoding
one_hot_encoder = OneHotEncoder(sparse_output=False, drop='first')  # Drop first to avoid multicollinearity

# Select categorical columns
cat_cols = ['color', 'size', 'country', 'rating']
cat_data_encoded = cat_data.copy()

# Apply one-hot encoding
encoded_data = one_hot_encoder.fit_transform(cat_data_encoded[cat_cols])

# Ask the fitted encoder for its output column names ("<column>_<category>")
# instead of rebuilding them by hand, so they always match the columns it
# really produced whatever the `drop` setting is. They go into their own
# variable so the wine `feature_names` list defined when the dataset was
# loaded is not overwritten.
onehot_feature_names = one_hot_encoder.get_feature_names_out(cat_cols)

# Create DataFrame with encoded features, sharing the source index so the
# column-wise concat below lines rows up one-to-one
encoded_df = pd.DataFrame(encoded_data, columns=onehot_feature_names, index=cat_data_encoded.index)

# Combine with numerical features
cat_data_onehot = pd.concat([cat_data_encoded[['price', 'weight']], encoded_df], axis=1)

print("\nOne-Hot Encoding Results (first 5 rows, first 10 columns):")
print(cat_data_onehot.iloc[:5, :10])
print(f"\nShape after one-hot encoding: {cat_data_onehot.shape}")

# 3. Ordinal Encoding (for ordered categories)
# Each list spells out the ranking of its levels, lowest first
size_order = ['small', 'medium', 'large']
rating_order = ['low', 'medium', 'high']

# One encoder handles both columns; `categories` follows the column order
# used in the fit_transform call below
ordinal_encoder = OrdinalEncoder(categories=[size_order, rating_order])

# Apply ordinal encoding to size and rating
cat_data_ordinal = cat_data.copy()
ordinal_codes = ordinal_encoder.fit_transform(cat_data_ordinal[['size', 'rating']])
cat_data_ordinal[['size_ordinal', 'rating_ordinal']] = ordinal_codes

print("\nOrdinal Encoding Results:")
print(cat_data_ordinal[['size', 'size_ordinal', 'rating', 'rating_ordinal']].head())

# Show the ordinal mapping: a level's position in its order list is its code
for heading, ordered_levels in (
    ("\nOrdinal Encoding mapping for size:", size_order),
    ("\nOrdinal Encoding mapping for rating:", rating_order),
):
    print(heading)
    for code, level in enumerate(ordered_levels):
        print(f"  {level} -> {code}")


In [None]:
# Back to the wine dataset
# 1. Creating polynomial features
# A three-column subset keeps the expanded feature set small for the demo
demo_columns = ['alcohol', 'malic_acid', 'ash']
X_subset = df[demo_columns].values

# Degree-2 expansion without the constant bias column
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X_subset)

# Name the generated columns after the input columns they come from
poly_features = poly.get_feature_names_out(demo_columns)
df_poly = pd.DataFrame(X_poly, columns=poly_features)

print("Original features:")
print(df[demo_columns].head())

print("\nPolynomial features (degree=2):")
print(df_poly.head())
print(f"\nFeature names: {', '.join(poly_features)}")

# 2. Creating interaction features manually
# `assign` returns a new frame, so the original wine DataFrame is untouched;
# the new columns are appended in the order listed here.
df_interact = df.assign(
    # Products of feature pairs
    alcohol_ash=df['alcohol'] * df['ash'],
    flavanoids_color=df['flavanoids'] * df['color_intensity'],
    proline_phenols=df['proline'] * df['total_phenols'],
    # Ratios of feature pairs
    alcohol_to_malic=df['alcohol'] / df['malic_acid'],
    hue_to_color=df['hue'] / df['color_intensity'],
    # Sum of two features
    phenols_flavanoids=df['total_phenols'] + df['flavanoids'],
)

interaction_columns = [
    'alcohol_ash', 'flavanoids_color', 'proline_phenols',
    'alcohol_to_malic', 'hue_to_color', 'phenols_flavanoids',
]
print("\nCreated interaction features:")
print(df_interact[interaction_columns].head())

# 3. Creating binned features
# The same four labels are used for both binning schemes
bin_labels = ['low', 'medium', 'high', 'very_high']

df_bins = df.copy()
# pd.cut with an integer splits the value range into that many bins
df_bins['alcohol_bin'] = pd.cut(df_bins['alcohol'], bins=4, labels=bin_labels)
# pd.qcut with q=4 splits at the sample quartiles
df_bins['proline_bin'] = pd.qcut(df_bins['proline'], q=4, labels=bin_labels)

print("\nBinned features:")
print(df_bins[['alcohol', 'alcohol_bin', 'proline', 'proline_bin']].head(10))

# Visualize the binned features side by side
bin_panels = [
    ('alcohol_bin', 'Distribution of Alcohol Bins'),
    ('proline_bin', 'Distribution of Proline Bins'),
]

plt.figure(figsize=(12, 5))
for slot, (column, panel_title) in enumerate(bin_panels, start=1):
    plt.subplot(1, 2, slot)
    sns.countplot(y=df_bins[column])
    plt.title(panel_title)
    plt.grid(True, linestyle='--', alpha=0.7)

plt.tight_layout()
plt.show()


In [None]:
# 4. Creating domain-specific features
# For wine data, we can create some domain-specific features

df_domain = df.copy()

# Alcohol-to-acid ratio (alcohol / malic acid).
# The scikit-learn Wine data has no residual-sugar column, so the former
# alcohol / residual_sugar feature raised a KeyError; malic acid is used as
# the denominator instead.
df_domain['alcohol_acid_ratio'] = df_domain['alcohol'] / df_domain['malic_acid']

# Combined malic acid and ash content. The column keeps its historical
# `total_acidity` name, but it is simply the sum of these two columns.
df_domain['total_acidity'] = df_domain['malic_acid'] + df_domain['ash']

# Phenolic compounds ratio
df_domain['phenolics_ratio'] = df_domain['total_phenols'] / df_domain['flavanoids']

# Color-to-hue ratio
df_domain['color_hue_ratio'] = df_domain['color_intensity'] / df_domain['hue']

domain_panels = [
    ('alcohol_acid_ratio', 'Alcohol-Acid Ratio by Wine Class'),
    ('total_acidity', 'Total Acidity by Wine Class'),
    ('phenolics_ratio', 'Phenolics Ratio by Wine Class'),
    ('color_hue_ratio', 'Color-Hue Ratio by Wine Class'),
]

print("Domain-specific features:")
print(df_domain[[column for column, _ in domain_panels]].head())

# Visualize relationships between new features and target:
# one boxplot per engineered feature, grouped by wine class
plt.figure(figsize=(15, 10))
for slot, (column, panel_title) in enumerate(domain_panels, start=1):
    plt.subplot(2, 2, slot)
    sns.boxplot(x='target_name', y=column, data=df_domain)
    plt.title(panel_title)
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.xticks(rotation=45)

plt.tight_layout()
plt.show()


In [None]:
# Feature selection works on the original wine features
X = df.drop(columns=['target', 'target_name'])
y = df['target']

# Hold out 30% of the rows for testing; the seed fixes the shuffle
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# 1. Univariate Feature Selection
# Keep the k=5 features with the highest ANOVA F-value on the training split
selector = SelectKBest(f_classif, k=5)
X_new = selector.fit_transform(X_train, y_train)

# Boolean mask over the columns -> names of the retained features
selected_features_mask = selector.get_support()
selected_features = X.columns[selected_features_mask]

print("Top 5 features selected by ANOVA F-value:")
for feature, score in zip(selected_features, selector.scores_[selected_features_mask]):
    print(f"  {feature}: F-score = {score:.2f}")

# Visualize feature scores, sorted from highest to lowest F-value
plt.figure(figsize=(12, 6))
scores = selector.scores_
feature_scores = sorted(zip(X.columns, scores), key=lambda pair: pair[1], reverse=True)
features, scores = zip(*feature_scores)

bar_positions = range(len(features))
plt.barh(bar_positions, scores, align='center')
plt.yticks(bar_positions, features)
plt.title('Feature Importance (ANOVA F-value)')
plt.xlabel('F-score')
plt.tight_layout()
plt.show()

# 2. Recursive Feature Elimination (RFE)
# A random forest scores the features; one feature is eliminated per step
# until five remain
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rfe = RFE(estimator=rf, n_features_to_select=5, step=1)
rfe.fit(X_train, y_train)

# Names of the retained features plus a name -> rank lookup
rfe_selected_features = X.columns[rfe.support_]
rank_by_feature = dict(zip(X.columns, rfe.ranking_))

print("\nTop 5 features selected by RFE with Random Forest:")
for feature in rfe_selected_features:
    print(f"  {feature}: Rank = {rank_by_feature[feature]}")

# Visualize feature ranking, best (lowest) rank first
plt.figure(figsize=(12, 6))
ranking = rfe.ranking_
feature_ranking = sorted(zip(X.columns, ranking), key=lambda pair: pair[1])
features, ranking = zip(*feature_ranking)

# Plot 1/rank so that better-ranked features get longer bars
inverse_ranks = [1/r for r in ranking]
bar_positions = range(len(features))
plt.barh(bar_positions, inverse_ranks, align='center')
plt.yticks(bar_positions, features)
plt.title('Feature Ranking (RFE with Random Forest)')
plt.xlabel('Inverse Ranking (higher is better)')
plt.tight_layout()
plt.show()

# 3. Feature Importance from Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Importances in column order, plus the column indices sorted by descending importance
importances = rf.feature_importances_
indices = np.argsort(importances)[::-1]
ranked_columns = [X.columns[idx] for idx in indices]
ranked_importances = importances[indices]

print("\nFeature importances from Random Forest:")
for column, importance in zip(ranked_columns, ranked_importances):
    print(f"  {column}: {importance:.4f}")

# Visualize feature importances
bar_positions = range(X.shape[1])
plt.figure(figsize=(12, 6))
plt.barh(bar_positions, ranked_importances, align='center')
plt.yticks(bar_positions, ranked_columns)
plt.title('Feature Importance from Random Forest')
plt.xlabel('Importance')
plt.tight_layout()
plt.show()


In [None]:
# Principal Component Analysis (PCA)
# Standardize the features first, then fit a PCA with default settings
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

pca = PCA()
X_pca = pca.fit_transform(X_scaled)

# Per-component share of the variance and its running total
explained_variance = pca.explained_variance_ratio_
cumulative_variance = np.cumsum(explained_variance)

print("Explained variance by each principal component:")
for pc_number, (var, running_total) in enumerate(zip(explained_variance, cumulative_variance), start=1):
    print(f"  PC{pc_number}: {var:.4f} ({running_total:.4f} cumulative)")

# Visualize explained variance; both panels share a 1-based component axis
component_numbers = range(1, len(explained_variance) + 1)
plt.figure(figsize=(12, 5))

# Left panel: variance explained by each individual component
plt.subplot(1, 2, 1)
plt.bar(component_numbers, explained_variance, alpha=0.7)
plt.plot(component_numbers, explained_variance, 'ro-')
plt.title('Explained Variance by Principal Component')
plt.xlabel('Principal Component')
plt.ylabel('Explained Variance Ratio')
plt.grid(True, linestyle='--', alpha=0.7)

# Right panel: cumulative variance with a reference line at 90%
plt.subplot(1, 2, 2)
plt.plot(range(1, len(cumulative_variance) + 1), cumulative_variance, 'bo-')
plt.axhline(y=0.9, color='r', linestyle='--', label='90% Variance')
plt.title('Cumulative Explained Variance')
plt.xlabel('Number of Principal Components')
plt.ylabel('Cumulative Explained Variance')
plt.grid(True, linestyle='--', alpha=0.7)
plt.legend()

plt.tight_layout()
plt.show()

# Determine number of components for 90% variance:
# argmax on the boolean mask gives the first index at or above the threshold,
# and +1 turns that index into a component count
reaches_target = cumulative_variance >= 0.9
n_components = np.argmax(reaches_target) + 1
print(f"\nNumber of components needed for 90% variance: {n_components}")

# Refit PCA keeping only that many components
pca = PCA(n_components=n_components)
X_pca_reduced = pca.fit_transform(X_scaled)

print(f"\nOriginal data shape: {X.shape}")
print(f"Reduced data shape: {X_pca_reduced.shape}")

# Visualize first two principal components (taken from the earlier X_pca
# projection), coloured by wine class
pc1_values, pc2_values = X_pca[:, 0], X_pca[:, 1]
plt.figure(figsize=(10, 8))
scatter = plt.scatter(pc1_values, pc2_values, c=y, cmap='viridis', s=100, alpha=0.8, edgecolors='k')
plt.title('PCA: First Two Principal Components')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.colorbar(scatter, label='Wine Class')
plt.grid(True, linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

# Feature loadings: each component vector scaled by the square root of that
# component's explained variance (one row per original feature)
component_scale = np.sqrt(pca.explained_variance_)
loadings = pca.components_.T * component_scale

# Visualize feature loadings for first two components
plt.figure(figsize=(12, 10))
plt.scatter(loadings[:, 0], loadings[:, 1], s=200)
for row, feature in enumerate(X.columns):
    point = (loadings[row, 0], loadings[row, 1])
    plt.annotate(feature, point, fontsize=12)
plt.title('Feature Loadings for First Two Principal Components')
plt.xlabel(f'Principal Component 1 ({explained_variance[0]:.2%} variance)')
plt.ylabel(f'Principal Component 2 ({explained_variance[1]:.2%} variance)')
# Dashed zero lines mark the origin of the loading plane
plt.axhline(y=0, color='k', linestyle='--', alpha=0.3)
plt.axvline(x=0, color='k', linestyle='--', alpha=0.3)
plt.grid(True, linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()


In [None]:
# Compare a Random Forest classifier trained on four feature sets:
# 1. Original features
# 2. Selected features from RFE
# 3. PCA-transformed features
# 4. Original + engineered features

# Original features: 70/30 split with a fixed seed
X_train_orig, X_test_orig, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Selected features from RFE: the selector is fitted on the training split
# only and then applied to both splits
rfe = RFE(estimator=RandomForestClassifier(random_state=42), n_features_to_select=5, step=1)
rfe.fit(X_train_orig, y_train)
X_train_rfe, X_test_rfe = rfe.transform(X_train_orig), rfe.transform(X_test_orig)

# PCA-transformed features: scaler and PCA are both fitted on the training
# split; the test split only goes through transform
pca = PCA(n_components=n_components)
X_train_scaled = scaler.fit_transform(X_train_orig)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(scaler.transform(X_test_orig))

# Original + engineered features
# Create engineered features
def create_engineered_features(X):
    """Return a copy of ``X`` with four derived ratio/sum columns appended.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain the columns ``alcohol``, ``malic_acid``, ``ash``,
        ``total_phenols``, ``flavanoids``, ``color_intensity`` and ``hue``.
        A ``residual_sugar`` column is optional.

    Returns
    -------
    pandas.DataFrame
        A new frame (``X`` itself is not modified) with these extra columns:

        * ``alcohol_sugar_ratio`` (alcohol / residual_sugar) when ``X`` has a
          ``residual_sugar`` column, otherwise ``alcohol_acid_ratio``
          (alcohol / malic_acid);
        * ``total_acidity`` (malic_acid + ash);
        * ``phenolics_ratio`` (total_phenols / flavanoids);
        * ``color_hue_ratio`` (color_intensity / hue).
    """
    X_eng = X.copy()
    # The scikit-learn Wine features include no residual-sugar measurement, so
    # requiring that column unconditionally raised a KeyError. It is still used
    # when a caller supplies it; otherwise malic acid is the denominator.
    if 'residual_sugar' in X_eng.columns:
        X_eng['alcohol_sugar_ratio'] = X_eng['alcohol'] / X_eng['residual_sugar']
    else:
        X_eng['alcohol_acid_ratio'] = X_eng['alcohol'] / X_eng['malic_acid']
    X_eng['total_acidity'] = X_eng['malic_acid'] + X_eng['ash']
    X_eng['phenolics_ratio'] = X_eng['total_phenols'] / X_eng['flavanoids']
    X_eng['color_hue_ratio'] = X_eng['color_intensity'] / X_eng['hue']
    return X_eng

# Add the engineered columns to the training and test splits independently
X_train_eng, X_test_eng = (
    create_engineered_features(split) for split in (X_train_orig, X_test_orig)
)

# Function to evaluate model performance
def evaluate_model(X_train, X_test, y_train, y_test, name):
    """Train a random forest on one feature set and report its test metrics.

    A 100-tree ``RandomForestClassifier`` (``random_state=42``) is fitted on
    the training split and used to predict the test split. The accuracy and a
    classification report are printed under a heading built from ``name``;
    the report's class labels come from the module-level ``target_names``.

    Returns
    -------
    tuple
        ``(accuracy, model)``: the test accuracy and the fitted classifier.
    """
    forest = RandomForestClassifier(n_estimators=100, random_state=42)
    forest.fit(X_train, y_train)

    predictions = forest.predict(X_test)
    test_accuracy = accuracy_score(y_test, predictions)

    print(f"\n{name} Results:")
    print(f"  Accuracy: {test_accuracy:.4f}")
    print("  Classification Report:")
    print(classification_report(y_test, predictions, target_names=target_names))

    return test_accuracy, forest

# Evaluate models
print("Evaluating the impact of feature engineering on model performance...")
acc_orig, model_orig = evaluate_model(X_train_orig, X_test_orig, y_train, y_test, "Original Features")
acc_rfe, model_rfe = evaluate_model(X_train_rfe, X_test_rfe, y_train, y_test, "RFE Selected Features")
acc_pca, model_pca = evaluate_model(X_train_pca, X_test_pca, y_train, y_test, "PCA Features")
acc_eng, model_eng = evaluate_model(X_train_eng, X_test_eng, y_train, y_test, "Original + Engineered Features")

# Compare accuracies
accuracies = {
    'Original Features': acc_orig,
    'RFE Selected Features': acc_rfe,
    'PCA Features': acc_pca,
    'Original + Engineered Features': acc_eng
}

# Visualize accuracies
# Plain lists (rather than dict views) are handed to Matplotlib
feature_set_labels = list(accuracies)
accuracy_values = list(accuracies.values())

plt.figure(figsize=(10, 6))
plt.bar(feature_set_labels, accuracy_values, color=['blue', 'green', 'orange', 'red'])
plt.title('Model Accuracy with Different Feature Engineering Approaches')
plt.xlabel('Feature Set')
plt.ylabel('Accuracy')
# Leave headroom above 1.0: the value labels sit 0.01 above each bar, so with
# an upper limit of exactly 1.0 the label of a near-perfect score would be
# drawn outside the axes.
plt.ylim(0.5, 1.05)
# The feature-set names are long; a slight tilt keeps neighbours from overlapping
plt.xticks(rotation=15, ha='right')
plt.grid(True, linestyle='--', alpha=0.7, axis='y')

# Add accuracy values on top of bars
for i, value in enumerate(accuracy_values):
    plt.text(i, value + 0.01, f'{value:.4f}', ha='center', fontweight='bold')

plt.tight_layout()
plt.show()
