In [19]:
!pip install xgboost
!pip install lightgbm


import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset
file_path = r'C:\Hamoye\Data_for_UCI_named.csv'

# Load the dataset into a pandas DataFrame
data = pd.read_csv(file_path)

# Drop the 'stab' column
data.drop('stab', axis=1, inplace=True)

# Split the data into features (X) and target (y)
X = data.drop('stabf', axis=1)
y = data['stabf']

# Split the data into train and test sets with an 80-20 split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)




In [20]:
# Scale the features using StandardScaler
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)


In [21]:
# Train a Random Forest classifier
rf = RandomForestClassifier(random_state=1)
rf.fit(x_train_scaled, y_train)

# Train an Extra Trees classifier
et = ExtraTreesClassifier(random_state=1)
et.fit(x_train_scaled, y_train)


In [22]:
# Convert 'stabf' labels to numerical values
y_train = y_train.replace({'stable': 0, 'unstable': 1})
y_test = y_test.replace({'stable': 0, 'unstable': 1})

# Train an XGBoost classifier
xgb = XGBClassifier(random_state=1)
xgb.fit(x_train_scaled, y_train)

# Train a LightGBM classifier
lgbm = LGBMClassifier(random_state=1)
lgbm.fit(x_train_scaled, y_train)


In [31]:
# Split the data into an 80-20 train-test split with a random state of 1
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Apply standard scaling to the training and test sets
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the random forest classifier
rf_clf = RandomForestClassifier(random_state=1)
rf_clf.fit(X_train_scaled, y_train)

# Make predictions on the test set
rf_pred = rf_clf.predict(X_test_scaled)

# Calculate the accuracy on the test set using the random forest classifier
rf_accuracy = accuracy_score(y_test, rf_pred)

# Print the accuracy on the test set using the random forest classifier

print("Random Forest Accuracy: {:.4f}".format(rf_accuracy))

Random Forest Accuracy: 0.9290


In [36]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import xgboost as xgb
from sklearn.metrics import accuracy_score



# Encode the target variable into numeric values
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Split the data into an 80-20 train-test split with a random state of 1
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Apply standard scaling to the training and test sets
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the XGBoost classifier
xgb_clf = xgb.XGBClassifier(random_state=1)
xgb_clf.fit(X_train_scaled, y_train)

# Make predictions on the test set
xgb_pred = xgb_clf.predict(X_test_scaled)

# Calculate the accuracy on the test set using the XGBoost classifier
xgb_accuracy = accuracy_score(y_test, xgb_pred)

# Print the accuracy on the test set using the XGBoost classifier
print("XGBoost Accuracy: {:.4f}".format(xgb_accuracy))


XGBoost Accuracy: 0.9455


In [37]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import lightgbm as lgb
from sklearn.metrics import accuracy_score


# Encode the target variable into numeric values
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Split the data into an 80-20 train-test split with a random state of 1
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Apply standard scaling to the training and test sets
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the LGBM classifier
lgb_clf = lgb.LGBMClassifier(random_state=1)
lgb_clf.fit(X_train_scaled, y_train)

# Make predictions on the test set
lgb_pred = lgb_clf.predict(X_test_scaled)

# Calculate the accuracy on the test set using the LGBM classifier
lgb_accuracy = accuracy_score(y_test, lgb_pred)

# Print the accuracy on the test set using the LGBM classifier
print("LGBM Accuracy: {:.4f}".format(lgb_accuracy))


LGBM Accuracy: 0.9395


In [38]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import ExtraTreesClassifier


# Encode the target variable into numeric values
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Split the data into an 80-20 train-test split with a random state of 1
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Apply standard scaling to the training and test sets
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define the ExtraTreesClassifier estimator
et_clf = ExtraTreesClassifier(random_state=1)

# Define the hyperparameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2'],
    'bootstrap': [True, False]
}

# Perform RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=et_clf, param_distributions=param_grid, cv=5, n_iter=10,
                                   scoring='accuracy', n_jobs=-1, verbose=1, random_state=1)
random_search.fit(X_train_scaled, y_train)

# Get the best hyperparameters
best_params = random_search.best_params_

# Print the best hyperparameters
print("Best Hyperparameters:")
for param, value in best_params.items():
    print(f"{param}: {value}")


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best Hyperparameters:
n_estimators: 300
min_samples_split: 5
min_samples_leaf: 4
max_features: log2
max_depth: None
bootstrap: False


In [39]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score



# Encode the target variable into numeric values
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Split the data into an 80-20 train-test split with a random state of 1
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Apply standard scaling to the training and test sets
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize and train the initial ExtraTreesClassifier model
initial_et_clf = ExtraTreesClassifier(random_state=1)
initial_et_clf.fit(X_train_scaled, y_train)

# Make predictions on the test set using the initial model
initial_et_pred = initial_et_clf.predict(X_test_scaled)

# Calculate the accuracy of the initial model
initial_et_accuracy = accuracy_score(y_test, initial_et_pred)

# Get the best hyperparameters from the RandomizedSearchCV
best_params = {'n_estimators': 300, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'log2',
               'max_depth': 10, 'bootstrap': False}

# Initialize and train the new ExtraTreesClassifier model with the best hyperparameters
optimized_et_clf = ExtraTreesClassifier(random_state=1, **best_params)
optimized_et_clf.fit(X_train_scaled, y_train)

# Make predictions on the test set using the optimized model
optimized_et_pred = optimized_et_clf.predict(X_test_scaled)

# Calculate the accuracy of the optimized model
optimized_et_accuracy = accuracy_score(y_test, optimized_et_pred)

# Compare the accuracies of the initial and optimized models
if optimized_et_accuracy > initial_et_accuracy:
    print("The accuracy of the optimized ExtraTreesClassifier model is higher than the initial model.")
elif optimized_et_accuracy < initial_et_accuracy:
    print("The accuracy of the optimized ExtraTreesClassifier model is lower than the initial model.")
else:
    print("The accuracy of the optimized ExtraTreesClassifier model is the same as the initial model.")


The accuracy of the optimized ExtraTreesClassifier model is lower than the initial model.
