### 4. Three Supervised Learning Models

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier 
from sklearn.metrics import accuracy_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.svm import SVC



data = pd.read_csv('C:/Users/hp/OneDrive - Deakin University/Desktop/Machine learning/9D/Dataset4.csv')

if 'target' not in data.columns:
    print("Error: 'target' column not found in the dataset.")
else:
    # Separate numeric and non-numeric columns
    numeric_cols = data.select_dtypes(include=['float64', 'int64']).columns
    non_numeric_cols = data.select_dtypes(exclude=['float64', 'int64']).columns

    # Fill missing values for numeric columns with mean
    data[numeric_cols] = data[numeric_cols].fillna(data[numeric_cols].mean())

    # Fill missing values for non-numeric columns with mode (most frequent value)
    for col in non_numeric_cols:
        if col != 'target':  # Exclude the target column from one-hot encoding
            data[col] = data[col].fillna(data[col].mode()[0])

    # Convert categorical variables to numerical (One-Hot Encoding) but exclude the target column
    data = pd.get_dummies(data, columns=non_numeric_cols.drop('target'), drop_first=True)

    # Split the data into features (X) and target (y)
    X = data.drop('target', axis=1)
    y = data['target']

    # Train-test split (80% training, 20% testing)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    y_train = y_train.fillna(y_train.mode()[0])
    # Handle missing values in X_train (impute missing values with the mean for numeric columns)
    imputer = SimpleImputer(strategy='mean')  # You can change the strategy to 'median' or 'most_frequent'
    X_train = imputer.fit_transform(X_train)
    X_test = imputer.transform(X_test)

    # Standardize features (necessary for models like SVM)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)


### METHOD 1 : LOGISTIC REGRESSION 

In [2]:
# Fill missing values in y_train with the mode (most frequent value)
y_train_filled = y_train.fillna(y_train.mode()[0])

# Logistic Regression Model
log_model = LogisticRegression()
log_model.fit(X_train, y_train_filled)

# Predictions and Performance
y_pred_log = log_model.predict(X_test)
log_accuracy = accuracy_score(y_test, y_pred_log)
log_f1 = f1_score(y_test, y_pred_log, average='weighted')

print(f"Logistic Regression Accuracy: {log_accuracy}")
print(f"Logistic Regression F1 Score: {log_f1}")



Logistic Regression Accuracy: 0.9899285250162443
Logistic Regression F1 Score: 0.9898070087556379


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### METHOD 2 : SUPPORT VECTOR MACHINE (SVM)

In [3]:
# Use SGDClassifier as an approximation to a linear SVM
sgd_model = SGDClassifier(loss='hinge', max_iter=1000, tol=1e-3)
sgd_model.fit(X_train, y_train_filled)

# Predictions and Performance
y_pred_sgd = sgd_model.predict(X_test)
sgd_accuracy = accuracy_score(y_test, y_pred_sgd)
sgd_f1 = f1_score(y_test, y_pred_sgd, average='weighted')

print(f"SGD Classifier Accuracy: {sgd_accuracy}")
print(f"SGD Classifier F1 Score: {sgd_f1}")


SGD Classifier Accuracy: 0.9882634827810266
SGD Classifier F1 Score: 0.9879612259346995


### METHOD 3 : K-NEAREST NEIGHBORS (KNN)

In [4]:
# Fill missing values in y_train with the most frequent value (mode)
y_train_filled = y_train.fillna(y_train.mode()[0])

# K-Nearest Neighbors Model
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train_filled)

# Predictions and Performance
y_pred_knn = knn_model.predict(X_test)
knn_accuracy = accuracy_score(y_test, y_pred_knn)
knn_f1 = f1_score(y_test, y_pred_knn, average='weighted')

print(f"KNN Accuracy: {knn_accuracy}")
print(f"KNN F1 Score: {knn_f1}")


KNN Accuracy: 0.99545159194282
KNN F1 Score: 0.9954229433237135


### 5. Three Ensemble Models for Predicting “Target”

### ENSEMBLE MODEL 1 : RANDOM FOREST 

In [5]:


# Impute missing values in X_train (use mean strategy for numeric features)
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)  # Impute the test set similarly

# If y_train has NaN values, fill them with mode (most frequent value)
y_train_imputed = y_train.fillna(y_train.mode()[0])

# Now fit the model with imputed values
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_imputed, y_train_imputed)

# Predictions and Performance
y_pred_rf = rf_model.predict(X_test_imputed)
rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_f1 = f1_score(y_test, y_pred_rf, average='weighted')

print(f"Random Forest Accuracy: {rf_accuracy}")
print(f"Random Forest F1 Score: {rf_f1}")


Random Forest Accuracy: 0.9979694606887589
Random Forest F1 Score: 0.9979656365539705


### ENSEMBLE MODEL 2: GRDADIENT BOOSTING 

In [6]:

# Drop rows with missing values in y_train and corresponding rows in X_train
non_nan_indices = y_train.notna()  # Get indices where y_train is not NaN
X_train_clean = X_train[non_nan_indices]           # Filter X_train
y_train_clean = y_train[non_nan_indices]           # Filter y_train

# Fit the Random Forest model with optimizations
rf_model = RandomForestClassifier(
    n_estimators=100,          # Number of trees in the forest
    max_depth=10,              # Limit depth to reduce overfitting and improve speed
    random_state=42,
    n_jobs=-1                  # Use all available CPU cores
)
rf_model.fit(X_train_clean, y_train_clean)

# Predictions and Performance
y_pred_rf = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_f1 = f1_score(y_test, y_pred_rf, average='weighted')

print(f"Random Forest Accuracy: {rf_accuracy}")
print(f"Random Forest F1 Score: {rf_f1}")

Random Forest Accuracy: 0.9978882391163093
Random Forest F1 Score: 0.997886201748637


### ENSEMBLE MODEL 3: AdaBoost

In [7]:
# Fit the AdaBoost model using the SAMME algorithm to avoid future deprecation warnings
ada_model = AdaBoostClassifier(n_estimators=100, algorithm='SAMME', random_state=42)
ada_model.fit(X_train, y_train)

# Predictions and Performance
y_pred_ada = ada_model.predict(X_test)
ada_accuracy = accuracy_score(y_test, y_pred_ada)
ada_f1 = f1_score(y_test, y_pred_ada, average='weighted')

print(f"AdaBoost Accuracy: {ada_accuracy}")
print(f"AdaBoost F1 Score: {ada_f1}")

AdaBoost Accuracy: 0.9100877192982456
AdaBoost F1 Score: 0.9025792039918675


### 5.E. ENSEMBLE MODEL USING ML CLASSIFIER 

In [8]:
# Create a Bagging ensemble with SVM as the base estimator
bagging_svm = BaggingClassifier(estimator=SVC(), n_estimators=10, random_state=42)
bagging_svm.fit(X_train, y_train)

This approach demonstrates that ensemble methods can be applied to classifiers like SVM, KNN, or Logistic Regression. Bagging helps reduce the variance of these models and improves their generalization.