In [None]:
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score, recall_score, classification_report
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif
from imblearn.over_sampling import RandomOverSampler
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Column names used throughout the notebook:
#   labels   -> the four target columns, "label_1" .. "label_4"
#   features -> the 768 input columns, "feature_1" .. "feature_768"
labels = ["label_%d" % idx for idx in range(1, 5)]
features = ["feature_%d" % idx for idx in range(1, 769)]

# Convenience aliases, one per target column.
label_1, label_2, label_3, label_4 = labels

In [None]:
# Read the train / validation / test splits from the Kaggle input directory,
# in that order.
train, valid, test = map(
    pd.read_csv,
    (
        "/kaggle/input/ml-project/train.csv",
        "/kaggle/input/ml-project/valid.csv",
        "/kaggle/input/ml-project/test.csv",
    ),
)

In [None]:
def data_preprocess(train_df, valid_df, test_df, label):
    """Standardise the feature columns of each split and extract one target column.

    The scaler is fitted on the training features only and then reused, unchanged,
    for the validation and test features.

    Parameters
    ----------
    train_df, valid_df : pandas.DataFrame
        Frames containing the feature columns plus every column named in the
        module-level ``labels`` list (those columns are dropped before scaling).
    test_df : pandas.DataFrame
        Frame containing an ``"ID"`` column, which is dropped before scaling.
    label : str
        Name of the target column returned for the train and validation splits.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_valid, y_valid, X_test)``. The ``X_*`` frames use
        the module-level ``features`` list as column names and keep the row index
        of the frame they were built from, so ``X_train``/``y_train`` and
        ``X_valid``/``y_valid`` share the same index.
    """
    scaler = StandardScaler()

    def _scaled_frame(raw_features, fit=False):
        # Only the training split is allowed to fit the scaler.
        if fit:
            values = scaler.fit_transform(raw_features)
        else:
            values = scaler.transform(raw_features)
        # Carry the source index over: callers may pass row-filtered frames
        # (non-contiguous index), and the matching y Series keeps that index.
        return pd.DataFrame(values, columns=features, index=raw_features.index)

    # Training split: fit + transform.
    X_train = _scaled_frame(train_df.drop(labels, axis=1), fit=True)
    y_train = train_df[label]

    # Validation split: transform with the training statistics.
    X_valid = _scaled_frame(valid_df.drop(labels, axis=1))
    y_valid = valid_df[label]

    # Test split: only the "ID" column is removed; no targets are returned for it.
    X_test = _scaled_frame(test_df.drop("ID", axis=1))
    return X_train, y_train, X_valid, y_valid, X_test

In [None]:
def feature_engineering_with_pca(X_train, X_valid, n_components):
    """Reduce dimensionality with PCA fitted on the training features.

    Parameters
    ----------
    X_train, X_valid
        Feature matrices; PCA is fitted on ``X_train`` and applied to both.
    n_components
        Forwarded to ``PCA`` as its ``n_components`` argument.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, valid)`` PCA projections, each with default integer column
        names and a default index.

    Notes
    -----
    The printed messages always mention "speaker recognition", whichever
    label's data is passed in.
    """
    reducer = PCA(n_components=n_components)

    # Fit on the training split, then project the validation split with the same axes.
    train_reduced = pd.DataFrame(reducer.fit_transform(X_train))
    valid_reduced = pd.DataFrame(reducer.transform(X_valid))

    print("Shape for speaker recognition train set:", train_reduced.shape)
    print("Shape for speaker recognition validation set:", valid_reduced.shape)

    return train_reduced, valid_reduced

In [None]:
def feature_engineering_with_select_kbest(X_train, y_train, X_valid, n):
    """Keep the ``n`` highest-scoring features according to ``f_classif``.

    The selector is fitted on ``(X_train, y_train)`` and the same column subset
    is then applied to ``X_valid``. The shape of the reduced training matrix is
    printed.

    Returns
    -------
    tuple
        ``(train, valid)`` as returned by the selector's ``fit_transform`` /
        ``transform`` calls.
    """
    kbest = SelectKBest(score_func=f_classif, k=n)
    train_selected = kbest.fit_transform(X_train, y_train)
    valid_selected = kbest.transform(X_valid)
    print("Shape :", train_selected.shape)
    return train_selected, valid_selected

In [None]:
def build_knn_model(X_train, y_train, n_neighbors):
    """Fit and return a ``KNeighborsClassifier`` with the given ``n_neighbors``.

    ``n_neighbors`` is the only hyperparameter exposed; every other setting is
    left at the scikit-learn default.
    """
    classifier = KNeighborsClassifier(n_neighbors=n_neighbors)
    classifier.fit(X_train, y_train)
    return classifier

In [None]:
def build_svm_model(X_train, y_train):
    """Fit and return a linear-kernel ``svm.SVC`` on the given training data."""
    classifier = svm.SVC(kernel="linear")
    classifier.fit(X_train, y_train)
    return classifier

In [None]:
def evaluate(y_valid, y_pred):
    """Print accuracy plus weighted-average F1, precision and recall.

    Parameters
    ----------
    y_valid
        Ground-truth labels.
    y_pred
        Predicted labels, in the same order as ``y_valid``.

    Nothing is returned; each metric is printed as soon as it is computed.
    """
    accuracy = accuracy_score(y_valid, y_pred)
    print(f"Accuracy: {accuracy}")

    f1 = f1_score(y_valid, y_pred, average='weighted')
    print(f"F1 Score: {f1}")

    precision = precision_score(y_valid, y_pred, average='weighted')
    print(f"Precision: {precision}")

    recall = recall_score(y_valid, y_pred, average='weighted')
    print(f"Recall: {recall}")

# Label 01

In [None]:
# Scale the features of every split and pull out the label_1 targets.
X_train_1, y_train_1, X_valid_1, y_valid_1, X_test_1 = data_preprocess(
    train_df=train, valid_df=valid, test_df=test, label=label_1
)

# Quick summary of the training targets (dtype and non-null count).
y_train_1.info()

In [None]:
# Histogram of the label_1 training targets.
fig, ax = plt.subplots()
ax.hist(y_train_1, bins=20, edgecolor='k')
ax.set_xlabel(label_1)
ax.set_ylabel('Frequency')
ax.set_title(f'Distribution of {label_1}')
plt.show()

# Before Feature Engineering - Label 01

KNN

In [None]:
# Baseline for label_1 (speaker recognition): KNN with k=5 on all scaled features.
model_1_knn_before = build_knn_model(X_train_1, y_train_1, n_neighbors=5)

# Predict the validation split with the baseline model.
y_pred_1_knn_before = model_1_knn_before.predict(X_valid_1)

# Confusion matrix first, then accuracy and the weighted F1 / precision / recall.
print(confusion_matrix(y_valid_1, y_pred_1_knn_before))
evaluate(y_valid=y_valid_1, y_pred=y_pred_1_knn_before)

SVM

In [None]:
# Baseline for label_1: linear-kernel SVM on all scaled features.
model_1_svm_before = build_svm_model(X_train=X_train_1, y_train=y_train_1)

# Predict the validation split with the baseline model.
y_pred_1_svm_before = model_1_svm_before.predict(X_valid_1)

# Confusion matrix first, then accuracy and the weighted F1 / precision / recall.
print(confusion_matrix(y_valid_1, y_pred_1_svm_before))
evaluate(y_valid=y_valid_1, y_pred=y_pred_1_svm_before)

# Feature reduction using PCA

In [None]:
# PCA on the scaled label_1 features; n_components=0.95 is passed straight to PCA.
X_train_1_pca, X_valid_1_pca = feature_engineering_with_pca(
    X_train=X_train_1, X_valid=X_valid_1, n_components=0.95
)

In [None]:
# label_1 (speaker recognition): KNN with k=5 on the PCA-reduced features.
model_1_knn = build_knn_model(X_train_1_pca, y_train_1, n_neighbors=5)

# Predict the PCA-reduced validation split.
y_pred_1_knn = model_1_knn.predict(X_valid_1_pca)

# Confusion matrix first, then accuracy and the weighted F1 / precision / recall.
print(confusion_matrix(y_valid_1, y_pred_1_knn))
evaluate(y_valid=y_valid_1, y_pred=y_pred_1_knn)

# Feature reduction using SelectKBest

In [None]:
# Keep the 55 best-scoring label_1 features (selector fitted on the training split).
X_train_1_select_kbest, X_valid_1_select_kbest = feature_engineering_with_select_kbest(
    X_train=X_train_1, y_train=y_train_1, X_valid=X_valid_1, n=55
)

In [None]:
# label_1: KNN with k=5 on the SelectKBest-reduced features.
model_1_selectKBest = build_knn_model(X_train_1_select_kbest, y_train_1, n_neighbors=5)

# Predict the SelectKBest-reduced validation split.
y_pred_1_selectKBest = model_1_selectKBest.predict(X_valid_1_select_kbest)

# Confusion matrix first, then accuracy and the weighted F1 / precision / recall.
print(confusion_matrix(y_valid_1, y_pred_1_selectKBest))
evaluate(y_valid=y_valid_1, y_pred=y_pred_1_selectKBest)

# Label 02

In [None]:
# First pass for label_2 on the unfiltered frames; the summary below shows
# how many label_2 training targets are non-null.
X_train_2, y_train_2, X_valid_2, y_valid_2, X_test_2 = data_preprocess(
    train_df=train, valid_df=valid, test_df=test, label=label_2
)
y_train_2.info()

In [None]:
# Keep only the rows that actually have a label_2 value, for both labelled splits,
# and print a summary of each filtered frame.
train_df = train.loc[train[label_2].notna()]
train_df.info()

valid_df = valid.loc[valid[label_2].notna()]
valid_df.info()

In [None]:
# Redo the label_2 preprocessing on the frames with missing label_2 rows removed.
X_train_2, y_train_2, X_valid_2, y_valid_2, X_test_2 = data_preprocess(
    train_df=train_df, valid_df=valid_df, test_df=test, label=label_2
)
y_train_2.info()

In [None]:
# Histogram of the label_2 training targets (rows with a missing label already removed).
fig, ax = plt.subplots()
ax.hist(y_train_2, bins=20, edgecolor='k')
ax.set_xlabel(label_2)
ax.set_ylabel('Frequency')
ax.set_title(f'Distribution of {label_2}')
plt.show()

In [None]:
# Baseline for label_2: KNN with k=5 on all scaled features.
model_2_before = build_knn_model(X_train_2, y_train_2, n_neighbors=5)

# Predict the validation split with the baseline model.
y_pred_2_before = model_2_before.predict(X_valid_2)

# Confusion matrix first, then accuracy and the weighted F1 / precision / recall.
print(confusion_matrix(y_valid_2, y_pred_2_before))
evaluate(y_valid=y_valid_2, y_pred=y_pred_2_before)

In [None]:
# PCA on the scaled label_2 features; n_components=0.95 is passed straight to PCA.
X_train_2_pca, X_valid_2_pca = feature_engineering_with_pca(
    X_train=X_train_2, X_valid=X_valid_2, n_components=0.95
)

In [None]:
# label_2: KNN with k=5 on the PCA-reduced features.
model_2 = build_knn_model(X_train_2_pca, y_train_2, 5)

# Predict the PCA-reduced validation split.
y_pred_2 = model_2.predict(X_valid_2_pca)

# Report the confusion matrix and the summary metrics for the PCA model.
# Both calls must use y_pred_2: passing the baseline predictions
# (y_pred_2_before) to evaluate() would just repeat the pre-PCA scores.
print(confusion_matrix(y_valid_2, y_pred_2))
evaluate(y_valid_2, y_pred_2)

# Label 03

In [None]:
# Scale the features of every split and pull out the label_3 targets.
X_train_3, y_train_3, X_valid_3, y_valid_3, X_test_3 = data_preprocess(
    train_df=train, valid_df=valid, test_df=test, label=label_3
)
y_train_3.info()

In [None]:
# Histogram of the label_3 training targets.
fig, ax = plt.subplots()
ax.hist(y_train_3, bins=20, edgecolor='k')
ax.set_xlabel(label_3)
ax.set_ylabel('Count')
ax.set_title(f'Distribution of {label_3}')
plt.show()

In [None]:
# Baseline for label_3: KNN with k=5 on all scaled features.
model_3_before = build_knn_model(X_train_3, y_train_3, n_neighbors=5)

# Predict the validation split with the baseline model.
y_pred_3_before = model_3_before.predict(X_valid_3)

# Per-class report followed by the confusion matrix.
print(classification_report(y_valid_3, y_pred_3_before))
print(confusion_matrix(y_valid_3, y_pred_3_before))

In [None]:
# PCA on the scaled label_3 features; n_components=0.95 is passed straight to PCA.
X_train_3_pca, X_valid_3_pca = feature_engineering_with_pca(
    X_train=X_train_3, X_valid=X_valid_3, n_components=0.95
)

In [None]:
# label_3: KNN with k=5 on the PCA-reduced features.
model_3 = build_knn_model(X_train_3_pca, y_train_3, n_neighbors=5)

# Predict the PCA-reduced validation split.
y_pred_3 = model_3.predict(X_valid_3_pca)

# Per-class report for the PCA model.
print(classification_report(y_valid_3, y_pred_3))

# Confusion matrix for the PCA model.
print(confusion_matrix(y_valid_3, y_pred_3))

# Label 04

In [None]:
# Scale the features of every split and pull out the label_4 targets.
X_train_4, y_train_4, X_valid_4, y_valid_4, X_test_4 = data_preprocess(
    train_df=train, valid_df=valid, test_df=test, label=label_4
)
y_train_4.info()

In [None]:
# Histogram of the label_4 training targets before any resampling.
fig, ax = plt.subplots()
ax.hist(y_train_4, bins=20, edgecolor='k')
ax.set_xlabel(label_4)
ax.set_ylabel('Count')
ax.set_title(f'Distribution of {label_4}')
plt.show()

In [None]:
# Baseline for label_4: KNN with k=5 on all scaled features (no resampling).
model_4_before = build_knn_model(X_train_4, y_train_4, n_neighbors=5)

# Predict the validation split with the baseline model.
y_pred_4_before = model_4_before.predict(X_valid_4)

# Per-class report followed by the confusion matrix.
print(classification_report(y_valid_4, y_pred_4_before))
print(confusion_matrix(y_valid_4, y_pred_4_before))

In [None]:
# Randomly over-sample the label_4 training data to balance the classes.
# random_state is pinned so the duplicated rows, and therefore every
# downstream PCA / KNN result, are identical on each Restart & Run All.
resampler = RandomOverSampler(sampling_strategy='auto', random_state=42)
X_train_4_resampled, y_train_4_resampled = resampler.fit_resample(X_train_4, y_train_4)

In [None]:
# Histogram of the label_4 training targets after random over-sampling.
fig, ax = plt.subplots()
ax.hist(y_train_4_resampled, bins=20, edgecolor='k')
ax.set_xlabel(label_4)
ax.set_ylabel('Frequency')
ax.set_title(f'Distribution of {label_4}')
plt.show()

In [None]:
# PCA fitted on the over-sampled label_4 training features and applied to the
# (un-resampled) validation features; n_components=0.95 is passed straight to PCA.
X_train_4_pca, X_valid_4_pca = feature_engineering_with_pca(
    X_train=X_train_4_resampled, X_valid=X_valid_4, n_components=0.95
)

In [None]:
# label_4: KNN with k=5 on the PCA-reduced, over-sampled training data.
model_4 = build_knn_model(X_train_4_pca, y_train_4_resampled, n_neighbors=5)

# Predict the PCA-reduced validation split.
y_pred_4 = model_4.predict(X_valid_4_pca)

# Per-class report for the resampled + PCA model.
print(classification_report(y_valid_4, y_pred_4))

# Confusion matrix for the resampled + PCA model.
print(confusion_matrix(y_valid_4, y_pred_4))

In [None]:
# Select the 15 best-scoring label_3 features with f_classif.
# The label_3 training data lives in X_train_3 / y_train_3; no notebook-level
# `X_train` / `y_train` objects are defined in the visible cells, so indexing
# them by label raises NameError on a fresh kernel.
selector = SelectKBest(f_classif, k=15)
X_new = selector.fit_transform(X_train_3, y_train_3)
