In [16]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.under_sampling import RandomUnderSampler

# Load the feature table. It must contain a "label" column plus every
# feature column listed in `features` below.
df = pd.read_csv("data/features.csv")

# Collapse the four original classes into a binary gender target.
# Original labels: 0 -> M20, 1 -> F20, 2 -> M50, 3 -> F50
# Gender target:   0 = Male, 1 = Female
LABEL_TO_GENDER = {0: 0, 1: 1, 2: 0, 3: 1}
df["gender"] = df["label"].map(LABEL_TO_GENDER)

# Series.map() silently produces NaN for any label that is not a key of the
# mapping. Fail here with a clear message instead of deep inside the resampler.
unmapped_labels = df.loc[df["gender"].isna(), "label"].unique()
if len(unmapped_labels) > 0:
    raise ValueError(
        f"Unexpected values in 'label' column (expected 0-3): {list(unmapped_labels)!r}"
    )

# Selected feature columns (88 total). The order below is the column order of X.
features = [
    # Pitch (2)
    'pitch_mean', 'pitch_std',
    # MFCCs, coefficients 1-13 (26)
    *[f"mfcc_mean_{i}" for i in range(1, 14)],
    *[f"mfcc_std_{i}" for i in range(1, 14)],
    # Chroma, bins 1-12 (24)
    *[f"chroma_mean_{i}" for i in range(1, 13)],
    *[f"chroma_std_{i}" for i in range(1, 13)],
    # Spectral shape (6)
    'centroid_mean', 'centroid_std', 'bandwidth_mean', 'bandwidth_std',
    'rolloff_mean', 'rolloff_std',
    # Energy / voicing (4)
    'rms_mean', 'rms_std', 'zcr_mean', 'zcr_std',
    # Mel, only the first 13 bands (26)
    *[f"mel_mean_{i}" for i in range(1, 14)],
    *[f"mel_std_{i}" for i in range(1, 14)],
]
assert len(features) == 88, f"expected 88 selected features, got {len(features)}"

# Extract features and target
X = df[features]
y = df["gender"]

# Undersample the majority class so both genders have the same number of rows.
# NOTE(review): resampling happens before the train/test split, so the held-out
# set is class-balanced too; the reported metrics describe balanced data, not
# the original class distribution.
rus = RandomUnderSampler(random_state=42)
X_balanced, y_balanced = rus.fit_resample(X, y)

# Hold out 20% of the balanced data for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X_balanced, y_balanced, test_size=0.2, random_state=42
)

# Fit the scaler on the training split only and reuse its statistics for the
# test split, so no test-set information enters the transform.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)




In [20]:
# Fit a 5-nearest-neighbour classifier (Euclidean distance) on the scaled
# training features; fit() returns the estimator itself.
knn = KNeighborsClassifier(
    metric='euclidean',
    n_neighbors=5,
).fit(X_train_scaled, y_train)

# Predict the held-out split, then show the confusion matrix followed by the
# per-class precision / recall / F1 report.
y_pred = knn.predict(X_test_scaled)
for report in (
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
):
    print(report)

[[6404  583]
 [ 273 6807]]
              precision    recall  f1-score   support

           0       0.96      0.92      0.94      6987
           1       0.92      0.96      0.94      7080

    accuracy                           0.94     14067
   macro avg       0.94      0.94      0.94     14067
weighted avg       0.94      0.94      0.94     14067



In [21]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 100 trees and a fixed seed, fitted on the scaled
# training features (fit() returns the estimator itself).
rf = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
).fit(X_train_scaled, y_train)

# Mean accuracy on the held-out split
rf_score = rf.score(X_test_scaled, y_test)
print(f"Random Forest Accuracy: {rf_score * 100:.2f}%")


Random Forest Accuracy: 94.48%


In [22]:
from sklearn.linear_model import LogisticRegression

# L2-penalised logistic regression (liblinear solver, fixed seed), fitted on
# the scaled training features; fit() returns the estimator itself.
log_reg = LogisticRegression(
    solver='liblinear',
    penalty='l2',
    random_state=42,
).fit(X_train_scaled, y_train)

# Mean accuracy on the held-out split
log_reg_score = log_reg.score(X_test_scaled, y_test)
print(f"Logistic Regression Accuracy: {log_reg_score * 100:.2f}%")


Logistic Regression Accuracy: 92.29%


In [23]:
from sklearn.svm import SVC

# Baseline support-vector classifier: RBF kernel with gamma='scale', fitted on
# the scaled training features (fit() returns the estimator itself).
svm = SVC(
    gamma='scale',
    kernel='rbf',
    random_state=42,
).fit(X_train_scaled, y_train)

# Mean accuracy on the held-out split
svm_score = svm.score(X_test_scaled, y_test)
print(f"SVM Accuracy: {svm_score * 100:.2f}%")


SVM Accuracy: 95.98%


In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, loguniform
import matplotlib.pyplot as plt
import seaborn as sns

# Search space for the SVC random search.
#
# gamma candidates are the two built-in heuristics plus 10 fixed values drawn
# log-uniformly from [1e-4, 1e3]. The range spans seven orders of magnitude, so
# a linear draw (scipy's uniform(loc, scale) covers [loc, loc + scale]) would
# place almost every candidate in the hundreds. The draw is seeded so the
# candidate list is identical on every run.
param_dist = {
    'C': loguniform(1e-3, 1e3),        # Regularization parameter, log-uniform on [1e-3, 1e3]
    'gamma': ['scale', 'auto'] + list(loguniform(1e-4, 1e3).rvs(10, random_state=42)),  # Kernel coefficient
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],  # Type of kernel
    'degree': [2, 3, 4],  # Degree for polynomial kernel
}

# Randomized hyperparameter search for the SVC: 50 sampled configurations from
# `param_dist`, each scored with 5-fold cross-validation on the training split,
# using all available cores.
random_search = RandomizedSearchCV(
    SVC(),
    param_distributions=param_dist,
    n_iter=50,
    cv=5,
    random_state=42,
    verbose=2,
    n_jobs=-1,
)

# Fit the model with random search
random_search.fit(X_train_scaled, y_train)

# Best hyperparameters
print("Best Hyperparameters found: ", random_search.best_params_)

# Evaluate the best model. The held-out split is predicted once and every
# metric below is derived from that single set of predictions.
best_svm = random_search.best_estimator_
y_pred_best = best_svm.predict(X_test_scaled)
cm_best = confusion_matrix(y_test, y_pred_best)

# Accuracy = correctly classified samples (matrix diagonal) / all samples;
# this is the same value best_svm.score(X_test_scaled, y_test) would return.
best_svm_score = float(cm_best.trace() / cm_best.sum())
print(f"Accuracy with best hyperparameters: {best_svm_score * 100:.2f}%")

# Plot the confusion matrix (rows = actual, columns = predicted; class 0 is
# labelled "M" and class 1 "F")
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    cm_best,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=["M", "F"],
    yticklabels=["M", "F"],
    ax=ax,
)
ax.set_title("Tuned SVM Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()

# Print Classification Report
print("Classification Report (Tuned Model):")
print(classification_report(y_test, y_pred_best))


In [None]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from joblib import dump

# Hyperparameter search space for LightGBM (candidates sampled by
# RandomizedSearchCV below).
params = {
    'boosting_type': ['gbdt'],
    'num_leaves': [31, 50, 70, 100],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 300, 500],
    'max_depth': [5, 10, 15, -1],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'reg_alpha': [0, 0.1, 0.5],
    'reg_lambda': [0, 0.1, 0.5]
}

# Train on the GPU. subsample_freq=1 enables row bagging at every iteration:
# LightGBM ignores `subsample` while subsample_freq is 0 (its default), which
# would make the 'subsample' candidates above a no-op search dimension.
lgbm = LGBMClassifier(device='gpu', verbosity=-1, subsample_freq=1)

# 50 sampled configurations, 5-fold cross-validation, selected by the F1 score
# of the positive class (label 1).
# NOTE(review): n_jobs=-1 runs several fits concurrently, all targeting the
# same GPU device; confirm this does not oversubscribe the GPU (n_jobs=1 is the
# safe choice on a single-GPU machine).
random_search = RandomizedSearchCV(
    estimator=lgbm,
    param_distributions=params,
    n_iter=50,
    cv=5,
    scoring='f1',
    n_jobs=-1,
    verbose=1,
    random_state=42
)

# Tree ensembles are fitted on the unscaled training features
random_search.fit(X_train, y_train)

# Save best model (refitted on the full training split by the search)
dump(random_search.best_estimator_, 'best_lightgbm_gpu_model.joblib')

# Evaluation on the held-out split
y_pred = random_search.predict(X_test)
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
