In [46]:
#SVM have a lot of terminology associated with them.... brace yourself!!!
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV

In [47]:
# Load the dfset
df = pd.read_csv('mm_names.csv',index_col=0)
df.head()

Unnamed: 0,Name,Gender
0,aungkyi,Female
1,aungmay,Female
2,aye,Female
3,ayeaye,Female
4,ayeayeaung,Female


In [48]:
# Drop rows with missing 'Name' values
df = df.dropna(subset=['Name'])

In [49]:
# Encode the 'Gender' column
label_encoder = LabelEncoder()
df['Gender'] = label_encoder.fit_transform(df['Gender'])
print (df['Gender'])


0       0
1       0
2       0
3       0
4       0
       ..
5319    1
5320    1
5321    1
5322    1
5323    1
Name: Gender, Length: 5323, dtype: int32


In [50]:
# Convert the 'Name' column to numerical features using TF-IDF
tfidf_vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(1, 3))
X = tfidf_vectorizer.fit_transform(df['Name'])


In [51]:

# Labels
y = df['Gender']

In [52]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [53]:
# Define the SVM model
svm_model = SVC()

In [54]:
# Define the parameter grid
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['linear', 'rbf']
}

In [55]:
# Setup the grid search with cross-validation
grid_search = GridSearchCV(svm_model, param_grid, refit=True, verbose=2, cv=5)

In [56]:
# Perform the grid search on the training data
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 32 candidates, totalling 160 fits
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   1.0s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   0.9s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   0.9s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   1.0s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   1.2s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   1.4s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   1.3s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   1.3s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   1.2s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   1.3s
[CV] END ....................C=0.1, gamma=0.1, kernel=linear; total time=   1.0s
[CV] END ....................C=0.1, gamma=0.1, 

In [57]:
# Get the best parameters from the grid search
best_params = grid_search.best_params_
print(f"Best Parameters: {best_params}")

Best Parameters: {'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}


In [58]:
# Train the final model with the best parameters
best_svm_model = grid_search.best_estimator_

In [59]:
# Input name to classify
input_name = ["Su Myat Aye"]

In [60]:
# Preprocess the input name
input_name_transformed = tfidf_vectorizer.transform(input_name)

In [61]:
# Predict the gender
predicted_gender = best_svm_model.predict(input_name_transformed)

In [62]:
# Decode the predicted gender
predicted_gender_label = label_encoder.inverse_transform(predicted_gender)

In [63]:
print(f"The predicted gender for the name '{input_name[0]}' is: {predicted_gender_label[0]}")

The predicted gender for the name 'Su Myat Aye' is: Female


In [64]:

# Predict on the train set
y_pred_train = best_svm_model.predict(X_train)

In [65]:
# Evaluate the train model
accuracy = accuracy_score(y_train, y_pred_train)
report = classification_report(y_train, y_pred_train, target_names=label_encoder.classes_)

In [66]:
print(f"Accuracy: {accuracy}")
print("Classification Report for train data:")
print(report)

Accuracy: 0.8985439173320808
Classification Report for train data:
              precision    recall  f1-score   support

      Female       0.89      0.87      0.88      1832
        Male       0.90      0.92      0.91      2426

    accuracy                           0.90      4258
   macro avg       0.90      0.89      0.90      4258
weighted avg       0.90      0.90      0.90      4258



In [67]:
# Predict on the test set
y_pred_test = best_svm_model.predict(X_test)

In [68]:
# Evaluate the train model
accuracy = accuracy_score(y_test, y_pred_test)
report = classification_report(y_test, y_pred_test, target_names=label_encoder.classes_)

In [69]:
print(f"Accuracy: {accuracy}")
print("Classification Report for test data:")
print(report)

Accuracy: 0.8384976525821596
Classification Report for test data:
              precision    recall  f1-score   support

      Female       0.83      0.80      0.82       471
        Male       0.85      0.87      0.86       594

    accuracy                           0.84      1065
   macro avg       0.84      0.83      0.84      1065
weighted avg       0.84      0.84      0.84      1065



In [70]:
# Check class distribution in the training and testing sets
print("Class distribution in the training set:")
print(y_train.value_counts())
print("Class distribution in the testing set:")
print(y_test.value_counts())

Class distribution in the training set:
Gender
1    2426
0    1832
Name: count, dtype: int64
Class distribution in the testing set:
Gender
1    594
0    471
Name: count, dtype: int64


In [71]:
# Train an SVM classifier
svm_classifier = SVC(kernel='linear')
svm_classifier.fit(X_train, y_train)


In [72]:
# Make predictions
y_pred_train = svm_classifier.predict(X_train)
y_pred_test = svm_classifier.predict(X_test)


In [73]:
# Evaluate the classifier on the training set
print("<<<< this is train dataset >>>>")
print(confusion_matrix(y_train, y_pred_train))
print("Accuracy:", accuracy_score(y_train, y_pred_train))
print("\nClassification Report:\n", classification_report(y_train, y_pred_train))

<<<< this is train dataset >>>>
[[1554  278]
 [ 240 2186]]
Accuracy: 0.878346641615782

Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.85      0.86      1832
           1       0.89      0.90      0.89      2426

    accuracy                           0.88      4258
   macro avg       0.88      0.87      0.88      4258
weighted avg       0.88      0.88      0.88      4258



In [74]:
# Evaluate the classifier on the testing set
print("<<<<< this is test dataset >>>>>")
print(confusion_matrix(y_test, y_pred_test))
print("Accuracy:", accuracy_score(y_test, y_pred_test))
print("\nClassification Report:\n", classification_report(y_test, y_pred_test))

<<<<< this is test dataset >>>>>
[[391  80]
 [ 78 516]]
Accuracy: 0.8516431924882629

Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.83      0.83       471
           1       0.87      0.87      0.87       594

    accuracy                           0.85      1065
   macro avg       0.85      0.85      0.85      1065
weighted avg       0.85      0.85      0.85      1065

