In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report,accuracy_score
import re

In [13]:
# Load data
data = pd.read_csv('name_gender.csv')

# Drop undefined genders
data = data[data['gender'] != 'undefined']

In [14]:
# Feature Engineering: Extract name length and first letter
data['name_length'] = data['name'].apply(len)
data['first_letter'] = data['name'].apply(lambda x: x[0].lower())

# Encode gender labels
data['gender'] = data['gender'].map({'M': 0, 'F': 1})

In [15]:
# Split data into features (X) and target (y)
X = data[['name_length', 'first_letter']]
y = data['gender']

# Convert 'first_letter' to a numerical feature
X = pd.get_dummies(X, columns=['first_letter'], drop_first=True)

In [16]:
# Split into train/test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Train a logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Predict on test set
y_pred = model.predict(X_test)

# Evaluate the model
print("Logistic Regression")
print("Accuracy: ", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Logistic Regression
Accuracy:  0.6418899957906553
              precision    recall  f1-score   support

           0       0.55      0.12      0.20     10454
           1       0.65      0.94      0.77     18054

    accuracy                           0.64     28508
   macro avg       0.60      0.53      0.49     28508
weighted avg       0.61      0.64      0.56     28508



In [17]:
# Split into train/test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Train a logistic regression model
model = RandomForestClassifier()
model.fit(X_train, y_train)

# Predict on test set
y_pred = model.predict(X_test)

# Evaluate the model
print("RandomForestClassifier")
print("Accuracy: ", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

RandomForestClassifier
Accuracy:  0.6468710537393012
              precision    recall  f1-score   support

           0       0.53      0.26      0.35     10375
           1       0.67      0.87      0.76     18133

    accuracy                           0.65     28508
   macro avg       0.60      0.56      0.55     28508
weighted avg       0.62      0.65      0.61     28508



In [18]:
# Split into train/test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Train a logistic regression model
model = DecisionTreeClassifier()
model.fit(X_train, y_train)

# Predict on test set
y_pred = model.predict(X_test)

# Evaluate the model
print("DecisionTreeClassifier")
print("Accuracy: ", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

DecisionTreeClassifier
Accuracy:  0.6439595902904448
              precision    recall  f1-score   support

           0       0.54      0.23      0.32     10542
           1       0.66      0.89      0.76     17966

    accuracy                           0.64     28508
   macro avg       0.60      0.56      0.54     28508
weighted avg       0.62      0.64      0.60     28508

