In [1]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.svm import LinearSVC, SVC
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Load the dataset. Every column except the identifier ('name') and the
# label ('gender') is treated as a model input.
df = pd.read_csv('../standardized.csv')

feature_cols = df.columns[~df.columns.isin(['name', 'gender'])].tolist()
X = df.loc[:, feature_cols]
y = df.loc[:, 'gender']  # TODO: carry the name alongside the predictions

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable across runs.
# NOTE(review): the split is not stratified even though the classes are
# imbalanced (see the value counts displayed by this cell). Passing
# `stratify=y` would keep the class ratio equal in both partitions, but it
# changes every downstream metric -- confirm before switching.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize with a StandardScaler whose statistics are learned from the
# training rows only, then apply the same fitted transform to the test rows.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Class balance of the full dataset (rendered as this cell's output).
y.value_counts()

gender
0    425
1    215
Name: count, dtype: int64

In [3]:
# SVM with an RBF kernel and balanced class weights, trained on the
# standardized features and evaluated on the held-out split.
pd.set_option('display.max_rows', None)  # print every row of the results table below
svm_rbf = SVC(
    kernel="rbf",
    C=1.0,                    # regularization strength
    gamma="scale",            # library default heuristic for the kernel width
    class_weight="balanced",  # reweight classes to offset the class imbalance
    random_state=42
)
svm_rbf.fit(X_train_scaled, y_train)
y_pred_svm = svm_rbf.predict(X_test_scaled)

# Aggregate metrics on the test split.
print("-----------------------------------------------")
print("SVM (RBF, class-weighted) on Standarized set – Quantitative analysis")
print("-----------------------------------------------")
print(classification_report(y_test, y_pred_svm))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_svm))
print()

# Per-sample table: name, true label and predicted label for each test row.
# Built directly on the test index rather than by copying the whole feature
# frame and then dropping every feature column again.
svm_results = pd.DataFrame(
    {
        "name": df.loc[X_test.index, "name"],
        "actual_gender": y_test.values,
        "predicted_gender": y_pred_svm,
    },
    index=X_test.index,
)
print(svm_results)

-----------------------------------------------
SVM (RBF, class-weighted) on Standarized set – Quantitative analysis
-----------------------------------------------
              precision    recall  f1-score   support

           0       0.75      0.67      0.70        84
           1       0.47      0.57      0.52        44

    accuracy                           0.63       128
   macro avg       0.61      0.62      0.61       128
weighted avg       0.65      0.63      0.64       128

Confusion Matrix:
[[56 28]
 [19 25]]

                          name  actual_gender  predicted_gender
570                   Hun Batz              0                 1
265                       Sven              0                 0
291                       Akai              0                 0
597                        Nox              1                 1
174                        Axe              0                 0
247              Queen of Pain              1                 0
158                   