# **Ensemble Techniques and Their Types - 4**

### Note: the sample dataset has only five rows (a single test sample), so the accuracy reported below is not meaningful

In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Toy five-row frame standing in for a real dataset. It has missing values in
# one numeric column and in both categorical columns.
data = pd.DataFrame(
    data={
        'numerical_1': [1, 2, np.nan, 4, 5],
        'numerical_2': [5, 6, 7, 8, 9],
        'categorical_1': ['a', 'b', 'a', np.nan, 'b'],
        'categorical_2': ['x', 'y', 'y', 'x', np.nan],
        'target': [0, 1, 0, 1, 0],
    }
)

# Separate the label column from the feature columns.
y = data['target']
X = data.drop(columns=['target'])

# Split into training and test sets (20% held out; the fixed seed keeps the
# split reproducible).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Identify numerical and categorical columns.
# 'number' matches every numeric width (int32, float32, ...), not only the
# 64-bit ones, and 'category' is accepted alongside 'object'. This matters
# because the ColumnTransformer built from these lists uses its default
# remainder handling, which drops any column that is not listed: a column
# missed here would silently vanish from the model's inputs.
numerical_cols = X.select_dtypes(include='number').columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns

# Numeric branch: mean-impute missing values, then standardize.
numerical_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ]
)

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode (handle_unknown='ignore' so unseen categories do not raise).
categorical_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

# Route each column group through its own branch.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_cols),
        ('cat', categorical_pipeline, categorical_cols),
    ]
)

# End-to-end model: preprocessing -> univariate feature scoring
# (k='all' keeps every feature) -> random forest classifier.
feature_selector = SelectKBest(score_func=f_classif, k='all')
forest = RandomForestClassifier(n_estimators=100, random_state=42)
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('feature_selection', feature_selector),
        ('classifier', forest),
    ]
)

# Fit the whole pipeline on the training rows.
pipeline.fit(X_train, y_train)

# Predict the held-out rows and score the predictions.
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Show the split and the predictions, each preceded by its caption.
for caption, payload in [
    ("Training data (X_train):", X_train),
    ("Test data (X_test):", X_test),
    ("Training labels (y_train):", y_train),
    ("Test labels (y_test):", y_test),
    ("Predicted labels (y_pred):", y_pred),
]:
    print(caption)
    print(payload)


Accuracy: 0.00
Training data (X_train):
   numerical_1  numerical_2 categorical_1 categorical_2
4          5.0            9             b           NaN
2          NaN            7             a             y
0          1.0            5             a             x
3          4.0            8           NaN             x
Test data (X_test):
   numerical_1  numerical_2 categorical_1 categorical_2
1          2.0            6             b             y
Training labels (y_train):
4    0
2    0
0    0
3    1
Name: target, dtype: int64
Test labels (y_test):
1    1
Name: target, dtype: int64
Predicted labels (y_pred):
[0]


In [2]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Load Iris: features go into a DataFrame labelled with the dataset's own
# feature names, labels stay as the array returned by the loader.
iris = load_iris()
y = iris.target
X = pd.DataFrame(data=iris.data, columns=iris.feature_names)

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Every Iris feature is numerical, so all columns go to the numeric branch.
numerical_cols = X.columns

# Numeric branch: mean-impute missing values, then standardize.
numerical_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ]
)

# Only a numeric branch is needed (Iris has no categorical columns).
preprocessor = ColumnTransformer(
    transformers=[('num', numerical_pipeline, numerical_cols)]
)

# Base learners combined by the ensemble.
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
lr_clf = LogisticRegression(max_iter=1000, random_state=42)
base_estimators = [
    ('rf', rf_clf),
    ('lr', lr_clf),
]

# Hard voting: the ensemble predicts from the base learners' class labels.
voting_clf = VotingClassifier(estimators=base_estimators, voting='hard')

# Full model: preprocessing followed by the voting ensemble.
voting_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', voting_clf),
    ]
)

# Fit the preprocessing steps and the voting ensemble on the training rows.
voting_pipeline.fit(X_train, y_train)

# Predict the held-out rows and report the fraction classified correctly.
y_pred_voting = voting_pipeline.predict(X_test)
accuracy_voting = accuracy_score(y_true=y_test, y_pred=y_pred_voting)
print(f'Voting Classifier Accuracy: {accuracy_voting:.2f}')


Voting Classifier Accuracy: 1.00
