In [None]:
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score, recall_score, classification_report
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score, KFold
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.svm import SVC

In [None]:
# Column names shared by every cell below: the four target columns and the
# 768 feature columns. These are plain module-level names; a `global`
# statement is unnecessary (and has no effect) at notebook top level.
labels = [f"label_{i+1}" for i in range(4)]
features = [f'feature_{i+1}' for i in range(768)]

# Convenience aliases for the individual target columns.
label_1, label_2, label_3, label_4 = labels

In [None]:
# Read the train, validation and test CSV files, in that order.
train, valid, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/ml-project/train.csv",
        "/kaggle/input/ml-project/valid.csv",
        "/kaggle/input/ml-project/test.csv",
    )
)

In [None]:
def data_preprocess(train_df, valid_df, test_df, label):
    """Standardise the feature columns and extract one target column.

    The scaler is fitted on the training features only and then reused,
    unchanged, for the validation and test features.

    Relies on the notebook-level ``labels`` (target columns removed from the
    train/validation frames) and ``features`` (column names assigned to the
    scaled frames).

    Args:
        train_df: Training frame holding the feature and label columns.
        valid_df: Validation frame holding the feature and label columns.
        test_df: Test frame holding an ``ID`` column plus the features.
        label: Name of the target column to return for train/validation.

    Returns:
        Tuple ``(X_train, y_train, X_valid, y_valid, X_test)``. The ``X_*``
        items are scaled DataFrames that keep the row index of the frame
        they came from; ``y_train`` / ``y_valid`` are the ``label`` columns.
    """
    scaler = StandardScaler()

    def _to_frame(scaled_values, source_df):
        # Reuse the source index so X rows stay aligned with the matching y
        # Series even when the input frame was filtered or re-indexed.
        return pd.DataFrame(scaled_values, columns=features, index=source_df.index)

    # Fit the scaler on the training features only.
    train_features = train_df.drop(columns=labels)
    X_train = _to_frame(scaler.fit_transform(train_features), train_df)
    y_train = train_df[label]

    # Validation features reuse the training statistics.
    valid_features = valid_df.drop(columns=labels)
    X_valid = _to_frame(scaler.transform(valid_features), valid_df)
    y_valid = valid_df[label]

    # The test frame only needs its ID column removed before scaling.
    test_features = test_df.drop(columns="ID")
    X_test = _to_frame(scaler.transform(test_features), test_df)
    return X_train, y_train, X_valid, y_valid, X_test

In [None]:
def feature_engineering_with_pca(X_train, X_valid, X_test, n_components):
    """Reduce dimensionality with PCA fitted on the training features.

    Args:
        X_train: Training features; the PCA is fitted on these.
        X_valid: Validation features, projected with the fitted PCA.
        X_test: Test features, projected with the fitted PCA.
        n_components: Passed straight to ``PCA`` as its first argument.

    Returns:
        Tuple ``(X_train_df_pca, X_valid_df_pca, X_test_df_pca)`` of
        DataFrames holding the projected features.
    """
    pca = PCA(n_components)

    # Fit on train only, then apply the same projection to the other splits.
    X_train_df_pca = pd.DataFrame(pca.fit_transform(X_train))
    X_valid_df_pca = pd.DataFrame(pca.transform(X_valid))
    X_test_df_pca = pd.DataFrame(pca.transform(X_test))

    print("Shape for label3 train set:", X_train_df_pca.shape)
    print("Shape for label3 validation set:", X_valid_df_pca.shape)
    # This line reports the test split (it used to say "validation set").
    print("Shape for label3 test set:", X_test_df_pca.shape)

    return X_train_df_pca, X_valid_df_pca, X_test_df_pca

In [None]:
def feature_engineering_with_select_kbest(X_train, y_train, X_valid, X_test, n):
    """Keep the ``n`` best features according to the ``f_classif`` score.

    Args:
        X_train: Training features; the selector is fitted on these.
        y_train: Training targets used to score the features.
        X_valid: Validation features, reduced with the fitted selector.
        X_test: Test features, reduced with the fitted selector.
        n: Number of features to keep (``k`` of ``SelectKBest``).

    Returns:
        Tuple ``(X_train_df, X_valid_df, X_test_df)`` with whatever the
        selector's ``fit_transform`` / ``transform`` return for each split.
    """
    selector = SelectKBest(f_classif, k=n)

    # Fit on train only, then apply the same column mask to the other splits.
    X_train_df = selector.fit_transform(X_train, y_train)
    X_valid_df = selector.transform(X_valid)
    X_test_df = selector.transform(X_test)

    # Report the shapes of the frames built here; the earlier version printed
    # the PCA helper's variable names, which do not exist in this scope.
    print("Shape for label3 train set:", X_train_df.shape)
    print("Shape for label3 validation set:", X_valid_df.shape)
    print("Shape for label3 test set:", X_test_df.shape)
    return X_train_df, X_valid_df, X_test_df

In [None]:
def build_knn_model(X_train, y_train, n_neighbors):
    """Fit a k-nearest-neighbours classifier and return it.

    Args:
        X_train: Training features.
        y_train: Training targets.
        n_neighbors: Neighbour count passed to ``KNeighborsClassifier``;
            this is the hyperparameter to vary when tuning.

    Returns:
        The fitted ``KNeighborsClassifier``.
    """
    classifier = KNeighborsClassifier(n_neighbors)
    # ``fit`` returns the estimator itself, so it can be returned directly.
    return classifier.fit(X_train, y_train)

In [None]:
def build_svc_model(X_train, y_train):
    """Fit a linear-kernel support vector classifier and return it.

    Args:
        X_train: Training features.
        y_train: Training targets.

    Returns:
        The fitted ``SVC``.
    """
    svc_settings = {'kernel': 'linear', 'gamma': 'scale'}
    # ``fit`` returns the estimator itself, so it can be returned directly.
    return SVC(**svc_settings).fit(X_train, y_train)

In [None]:
def evaluate(y_valid, y_pred):
    """Print accuracy plus weighted F1, precision and recall.

    Args:
        y_valid: True target values.
        y_pred: Predicted target values.
    """
    # Each metric is computed and printed in turn, one line per metric.
    accuracy = accuracy_score(y_valid, y_pred)
    print(f"Accuracy: {accuracy}")

    f1 = f1_score(y_valid, y_pred, average='weighted')
    print(f"F1 Score: {f1}")

    precision = precision_score(y_valid, y_pred, average='weighted')
    print(f"Precision: {precision}")

    recall = recall_score(y_valid, y_pred, average='weighted')
    print(f"Recall: {recall}")

# Label 03

In [None]:
# Scale the features and pull out the label-3 targets for each split.
(
    X_train_3,
    y_train_3,
    X_valid_3,
    y_valid_3,
    X_test_3,
) = data_preprocess(train, valid, test, label=label_3)

# Print a short summary of the label-3 training targets.
y_train_3.info()

In [None]:
# Histogram of the label-3 training targets, drawn on the current axes.
ax = plt.gca()
ax.hist(y_train_3, bins=20, edgecolor='k')
ax.set_xlabel(label_3)
ax.set_ylabel('Count')
ax.set_title(f'Distribution of {label_3}')
plt.show()

# Before Model Building - Label 03

In [None]:
# Baseline: 5-nearest-neighbours on the scaled features, before any reduction.
model_3_before = build_knn_model(X_train=X_train_3, y_train=y_train_3, n_neighbors=5)

# Predict the validation split with the baseline model.
y_pred_3_before = model_3_before.predict(X_valid_3)

# Report the validation metrics.
evaluate(y_valid=y_valid_3, y_pred=y_pred_3_before)

# Feature reduction using PCA - Label 03

In [None]:
# Project the scaled label-3 features with PCA, using n_components=0.95.
X_train_3_pca, X_valid_3_pca, X_test_3_pca = feature_engineering_with_pca(
    X_train_3,
    X_valid_3,
    X_test_3,
    n_components=0.95,
)

In [None]:
# Fit 5-nearest-neighbours on the PCA-reduced training features.
model_3_knn = build_knn_model(X_train=X_train_3_pca, y_train=y_train_3, n_neighbors=5)

# Predict the PCA-reduced validation split.
y_pred_3_knn = model_3_knn.predict(X_valid_3_pca)

# Show the confusion matrix first, then the summary metrics.
print(confusion_matrix(y_valid_3, y_pred_3_knn))
evaluate(y_valid=y_valid_3, y_pred=y_pred_3_knn)

In [None]:
# Fit the linear SVC on the PCA-reduced training features.
model_3_pca_svc = build_svc_model(X_train=X_train_3_pca, y_train=y_train_3)

# Predict the PCA-reduced validation split.
y_pred_3_svc = model_3_pca_svc.predict(X_valid_3_pca)

# Show the confusion matrix first, then the summary metrics.
print(confusion_matrix(y_valid_3, y_pred_3_svc))
evaluate(y_valid=y_valid_3, y_pred=y_pred_3_svc)

# Hyperparameter tuning for label 3

In [None]:
# Candidate SVC settings explored by the grid search.
param_grid_3 = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf', 'poly']
}

model_3_svc = SVC()

# 5-fold cross-validated search over the grid, scored by accuracy, on all cores.
grid_search_3 = GridSearchCV(
    estimator=model_3_svc,
    param_grid=param_grid_3,
    cv=5,
    verbose=3,
    n_jobs=-1,
    scoring='accuracy',
)

grid_search_3.fit(X_train_3_pca, y_train_3)

best_params_3 = grid_search_3.best_params_
best_model_3 = grid_search_3.best_estimator_

# Score the best estimator on the validation split.
valid_accuracy_3 = best_model_3.score(X_valid_3_pca, y_valid_3)

# Print the best hyperparameters and the validation accuracy. The score
# comes from the validation split, so it is labelled as such, not as "test".
print("Best Hyperparameters:", best_params_3)
print("Validation Accuracy:", valid_accuracy_3)

# Predicting label 3

In [None]:
# Predict label 3 for the PCA-reduced test features with the tuned model.
y_pred_3 = best_model_3.predict(X_test_3_pca)

# Pair each prediction with the ID read from the test file instead of
# assuming the rows are numbered 1..N; predictions are in test-row order.
output_df_3 = pd.DataFrame({'ID': test["ID"].to_numpy(), 'label_3': y_pred_3})

output_file_path_3 = '/kaggle/working/label3_out.csv'
output_df_3.to_csv(output_file_path_3, index=False)
print(f'Predictions saved to {output_file_path_3}')