In [124]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV

In [125]:
# Load the dfset
df = pd.read_csv('mm_names.csv',index_col=0)


In [126]:
# Drop rows with missing 'Name' values
df = df.dropna(subset=['Name'])

In [127]:
# Encode the 'Gender' column
label_encoder = LabelEncoder()
df['Gender'] = label_encoder.fit_transform(df['Gender'])


In [128]:
# Convert the 'Name' column to numerical features using TF-IDF
tfidf_vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(1, 3))
X = tfidf_vectorizer.fit_transform(df['Name'])


In [129]:

# Labels
y = df['Gender']

In [130]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [131]:
# Define the SVM model
svm_model = SVC()

In [132]:
# Define the parameter grid
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['linear', 'rbf']
}

In [133]:
# Setup the grid search with cross-validation
grid_search = GridSearchCV(svm_model, param_grid, refit=True, verbose=2, cv=5)

In [134]:
# Perform the grid search on the training data
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 32 candidates, totalling 160 fits
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   1.0s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   1.0s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   1.0s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   1.0s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   1.0s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   1.2s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   1.3s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   1.2s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   1.2s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   1.1s
[CV] END ....................C=0.1, gamma=0.1, kernel=linear; total time=   1.0s
[CV] END ....................C=0.1, gamma=0.1, 

In [135]:
# Get the best parameters from the grid search
best_params = grid_search.best_params_
print(f"Best Parameters: {best_params}")

Best Parameters: {'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}


In [136]:
# Train the final model with the best parameters
best_svm_model = grid_search.best_estimator_

In [138]:

# Predict on the test set
y_pred = best_svm_model.predict(X_test)

In [139]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=label_encoder.classes_)

In [140]:
print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(report)

Accuracy: 0.8384976525821596
Classification Report:
              precision    recall  f1-score   support

      Female       0.83      0.80      0.82       471
        Male       0.85      0.87      0.86       594

    accuracy                           0.84      1065
   macro avg       0.84      0.83      0.84      1065
weighted avg       0.84      0.84      0.84      1065



In [141]:
# Predict on the test set
y_pred = best_svm_model.predict(X_test)

In [142]:
# Check class distribution in the training and testing sets
print("Class distribution in the training set:")
print(y_train.value_counts())
print("Class distribution in the testing set:")
print(y_test.value_counts())

Class distribution in the training set:
Gender
1    2426
0    1832
Name: count, dtype: int64
Class distribution in the testing set:
Gender
1    594
0    471
Name: count, dtype: int64


In [143]:
# Train an SVM classifier
svm_classifier = SVC(kernel='linear')
svm_classifier.fit(X_train, y_train)


In [None]:
# Make predictions
y_pred_train = svm_classifier.predict(X_train)
y_pred_test = svm_classifier.predict(X_test)


In [145]:
# Evaluate the classifier on the training set
print("<<<< this is train dataset >>>>")
print(confusion_matrix(y_train, y_pred_train))
print("Accuracy:", accuracy_score(y_train, y_pred_train))
print("\nClassification Report:\n", classification_report(y_train, y_pred_train))

<<<< this is train dataset >>>>
[[1554  278]
 [ 240 2186]]
Accuracy: 0.878346641615782

Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.85      0.86      1832
           1       0.89      0.90      0.89      2426

    accuracy                           0.88      4258
   macro avg       0.88      0.87      0.88      4258
weighted avg       0.88      0.88      0.88      4258



In [146]:
# Evaluate the classifier on the testing set
print("<<<<< this is test dataset >>>>>")
print(confusion_matrix(y_test, y_pred_test))
print("Accuracy:", accuracy_score(y_test, y_pred_test))
print("\nClassification Report:\n", classification_report(y_test, y_pred_test))

<<<<< this is test dataset >>>>>
[[391  80]
 [ 78 516]]
Accuracy: 0.8516431924882629

Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.83      0.83       471
           1       0.87      0.87      0.87       594

    accuracy                           0.85      1065
   macro avg       0.85      0.85      0.85      1065
weighted avg       0.85      0.85      0.85      1065



In [None]:
# Tune hyperparameters using GridSearchCV
# Define the parameter grid
param_grid = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
    'gamma': ['scale', 'auto']
}

In [None]:
# Create a GridSearchCV object
grid_search = GridSearchCV(SVC(), param_grid, cv=5, scoring='accuracy')

In [149]:
# Fit the model
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 32 candidates, totalling 160 fits
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   1.0s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   1.0s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   1.0s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   1.1s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   1.0s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   1.1s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   1.3s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   1.3s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   1.2s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   1.1s
[CV] END ....................C=0.1, gamma=0.1, kernel=linear; total time=   1.1s
[CV] END ....................C=0.1, gamma=0.1, 

KeyboardInterrupt: 

In [148]:
# Print the best parameters
print("Best parameters found: ", grid_search.best_params_)

Best parameters found:  {'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}


In [147]:
# Evaluate the best model on the test set
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test)
print("<<<< This is the best model on the testing dataset >>>>")
print(confusion_matrix(y_test, y_pred_best))
print("Accuracy:", accuracy_score(y_test, y_pred_best))
print("\nClassification Report:\n", classification_report(y_test, y_pred_best))

<<<< This is the best model on the testing dataset >>>>
[[379  92]
 [ 80 514]]
Accuracy: 0.8384976525821596

Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.80      0.82       471
           1       0.85      0.87      0.86       594

    accuracy                           0.84      1065
   macro avg       0.84      0.83      0.84      1065
weighted avg       0.84      0.84      0.84      1065

