In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report,accuracy_score
import re

In [3]:
# Load data
data = pd.read_csv('name_gender.csv')

# Drop undefined genders
data = data[data['gender'] != 'undefined']

In [4]:
# Feature Engineering: Extract name length and first letter
data['name_length'] = data['name'].apply(len)
data['first_letter'] = data['name'].apply(lambda x: x[0].lower())

# Encode gender labels
data['gender'] = data['gender'].map({'M': 0, 'F': 1})

In [5]:
# Split data into features (X) and target (y)
X = data[['name_length', 'first_letter']]
y = data['gender']

# Convert 'first_letter' to a numerical feature
X = pd.get_dummies(X, columns=['first_letter'], drop_first=True)

In [6]:
# Split into train/test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Train a logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Predict on test set
y_pred = model.predict(X_test)

# Evaluate the model
print("Logistic Regression")
print("Accuracy: ", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Logistic Regression
              precision    recall  f1-score   support

           0       0.55      0.12      0.20     10333
           1       0.65      0.94      0.77     18175

    accuracy                           0.65     28508
   macro avg       0.60      0.53      0.48     28508
weighted avg       0.62      0.65      0.56     28508



In [7]:
# Split into train/test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Train a logistic regression model
model = RandomForestClassifier()
model.fit(X_train, y_train)

# Predict on test set
y_pred = model.predict(X_test)

# Evaluate the model
print("RandomForestClassifier")
print("Accuracy: ", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

RandomForestClassifier
              precision    recall  f1-score   support

           0       0.54      0.24      0.33     10395
           1       0.67      0.88      0.76     18113

    accuracy                           0.65     28508
   macro avg       0.61      0.56      0.55     28508
weighted avg       0.62      0.65      0.61     28508



In [11]:
# Split into train/test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Train a logistic regression model
model = DecisionTreeClassifier()
model.fit(X_train, y_train)

# Predict on test set
y_pred = model.predict(X_test)

# Evaluate the model
print("DecisionTreeClassifier")
print("Accuracy: ", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

DecisionTreeClassifier
              precision    recall  f1-score   support

           0       0.54      0.24      0.33     10292
           1       0.67      0.88      0.76     18216

    accuracy                           0.65     28508
   macro avg       0.60      0.56      0.55     28508
weighted avg       0.62      0.65      0.61     28508

