<a href="https://colab.research.google.com/github/MdRiyadulHasan/Numpy-and-Pandas/blob/main/Feature_Selection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

# Sample dataset
data = {
    'Age': [25, 30, 35, 22, 45, 28, 50, 32, 38, 48],
    'Income': [50000, 60000, 75000, 40000, 90000, 55000, 110000, 70000, 85000, 120000],
    'Purchase': [0, 1, 1, 0, 1, 0, 1, 1, 1, 0]
}

# Create a DataFrame
df = pd.DataFrame(data)

# Separate features (Age and Income) and the target variable (Purchase)
features = df[['Age', 'Income']]
target = df['Purchase']

# Calculate the correlation between each feature and the target
correlation_scores = {}

for feature in features.columns:
    # Use Pearson correlation coefficient (default method)
    correlation = np.corrcoef(features[feature], target)[0, 1]
    correlation_scores[feature] = correlation

print(correlation_scores)
# Print the correlation scores
for feature, correlation in correlation_scores.items():
    print(f'Correlation between {feature} and Purchase: {correlation:.2f}')


{'Age': 0.4019864424943513, 'Income': 0.30648719994915724}
Correlation between Age and Purchase: 0.40
Correlation between Income and Purchase: 0.31


In [None]:
from sklearn.preprocessing import LabelEncoder

# Sample categorical data
data = ['cat', 'dog', 'fish', 'dog', 'cat']

# Initialize the label encoder
label_encoder = LabelEncoder()

# Fit and transform the data
encoded_data = label_encoder.fit_transform(data)

# Resulting encoded data
print(encoded_data)
# Output: [0 1 2 1 0]


[0 1 2 1 0]


In [None]:
import pandas as pd

# Sample data
data = {'Color': ['Red', 'Green', 'Blue', 'Red', 'Green']}

# Create a DataFrame
df = pd.DataFrame(data)
print(df)

# Perform one-hot encoding
one_hot_encoded = pd.get_dummies(df, columns=['Color'])

# Display the result
print(one_hot_encoded)


   Color
0    Red
1  Green
2   Blue
3    Red
4  Green
   Color_Blue  Color_Green  Color_Red
0           0            0          1
1           0            1          0
2           1            0          0
3           0            0          1
4           0            1          0


In [None]:
import pandas as pd

data = {'Education Level': ['High School', 'Bachelor', 'Master', 'PhD', 'Bachelor']}
df = pd.DataFrame(data)

from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
df['Encoded Education'] = label_encoder.fit_transform(df['Education Level'])
print(df)


  Education Level  Encoded Education
0     High School                  1
1        Bachelor                  0
2          Master                  2
3             PhD                  3
4        Bachelor                  0


In [4]:
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split

# Load the dataset
data = load_iris()
X, y = data.data, data.target

# Define the model
model = RandomForestClassifier()

# Define the hyperparameter grid to search
param_grid = {
    'n_estimators': [10, 50, 100, 200],
    'max_depth': [None, 10, 20, 30],
}

# Split the data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Perform hyperparameter tuning using Grid Search
grid_search = GridSearchCV(model, param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best model and evaluate it
best_model = grid_search.best_estimator_
best_score = grid_search.best_score_

val_score = best_model.score(X_val, y_val)


print(f"Best Model: {best_model}")
print(f"Best Training Score: {best_score}")
print(f"Validation Score: {val_score}")

Best Model: RandomForestClassifier(n_estimators=50)
Best Training Score: 0.9583333333333334
Validation Score: 1.0


In [5]:
y_pred = best_model.predict(X_val)
print(y_pred)
print(y_val)

[1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0]
[1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0]


In [6]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Define the hyperparameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create the model
rf = RandomForestClassifier()

# Grid search with cross-validation
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, scoring='accuracy', cv=5)
grid_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_


In [7]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Define the hyperparameter distribution
param_dist = {
    'n_estimators': randint(100, 500),
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 5)
}

# Create the model
rf = RandomForestClassifier()

# Randomized search with cross-validation
random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_dist, n_iter=100, scoring='accuracy', cv=5, n_jobs=-1)
random_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = random_search.best_params_
best_model = random_search.best_estimator_


In [8]:
from sklearn.model_selection import train_test_split

# Split the data into training, validation, and test sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Define a range of hyperparameters to tune
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}


In [None]:
# Create the base model
rf_model = RandomForestClassifier(random_state=42)

# Use GridSearchCV to perform hyperparameter tuning
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, scoring='accuracy', cv=5)
grid_search.fit(X_train, y_train)


In [None]:
best_params = grid_search.best_params_


In [None]:
best_rf_model = RandomForestClassifier(**best_params, random_state=42)
best_rf_model.fit(X_train, y_train)
val_accuracy = best_rf_model.score(X_val, y_val)


In [None]:
test_accuracy = best_rf_model.score(X_test, y_test)
