In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report
import xgboost as xgb

In [11]:
# Load the labelled names; the code below reads the 'name' and 'gender' columns.
rawdata = pd.read_csv('name_gender.csv')

# read_csv already returns a DataFrame, so take an explicit copy instead of
# re-wrapping it: columns added to or overwritten on `df` below can then never
# leak back into `rawdata`.
df = rawdata.copy()

# A bare `df.head(5)` in the middle of a cell/script is evaluated and thrown
# away; print it so the preview is actually shown.
print(df.head(5))


# Feature Engineering
# Character-level features derived from each name.
# NOTE(review): assumes every value in df['name'] is a non-empty string — confirm
# against the CSV (missing names would make the counting helpers fail).
_VOWELS = frozenset('aeiou')


def _count_vowels(name):
    """Return the number of characters in *name* that are a/e/i/o/u, ignoring case."""
    return sum(1 for char in name if char.lower() in _VOWELS)


def _count_consonants(name):
    """Return the number of letters in *name* that are not a/e/i/o/u, ignoring case.

    Only alphabetic characters are considered, so spaces, hyphens, apostrophes
    and digits (e.g. in "Mary-Ann" or "O'Neil") are not counted as consonants.
    """
    return sum(
        1 for char in name
        if char.isalpha() and char.lower() not in _VOWELS
    )


df['name_length'] = df['name'].str.len()
df['initial'] = df['name'].str[0]
df['suffix'] = df['name'].str[-1]  # Last character
df['vowel_count'] = df['name'].apply(_count_vowels)
df['consonant_count'] = df['name'].apply(_count_consonants)
# A name without consonants would divide by zero; treat its denominator as 1,
# so the ratio for such a name is simply its vowel count.
df['vowel_consonant_ratio'] = df['vowel_count'] / df['consonant_count'].replace(0, 1)


# Encode gender as the binary target: female -> 1, male -> 0.
# Labels are normalised (whitespace stripped, lower-cased) and the single-letter
# forms are accepted too, so 'Female', ' male ' or 'F' are not silently lost.
gender_codes = {'female': 1, 'f': 1, 'male': 0, 'm': 0}
normalized_gender = df['gender'].astype(str).str.strip().str.lower()
encoded_gender = normalized_gender.map(gender_codes)

# Series.map turns every unknown label into NaN; fail here with a clear message
# instead of letting NaN targets reach the estimators.
unmapped = encoded_gender.isna()
if unmapped.any():
    unknown_labels = sorted(df.loc[unmapped, 'gender'].astype(str).unique())
    raise ValueError(f"Unrecognised gender label(s): {unknown_labels}")

df['gender'] = encoded_gender.astype('int64')

# Numeric features used by the models; 'initial' and 'suffix' are not included.
feature_columns = ['name_length', 'vowel_count', 'consonant_count', 'vowel_consonant_ratio']
X = df[feature_columns]
y = df['gender']

# Split the data
# 80/20 hold-out split; stratifying on y keeps the female/male proportion the
# same in the training and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Define models and hyperparameters
# Maps a display name to (estimator, parameter grid). An empty grid means the
# estimator is fitted as-is, without a grid search.
models = {
    "Logistic Regression": (LogisticRegression(max_iter=1000), {}),
    "Random Forest": (RandomForestClassifier(random_state=42), {
        'n_estimators': [50, 100],
        'max_depth': [None, 10, 20]
    }),
    # Seeded like the Random Forest so repeated runs give the same model.
    "Gradient Boosting": (GradientBoostingClassifier(random_state=42), {
        'n_estimators': [50, 100],
        'learning_rate': [0.01, 0.1]
    }),
    # NOTE(review): `use_label_encoder` appears to be deprecated in recent
    # xgboost releases — confirm against the installed version before removing.
    "XGBoost": (xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss'), {
        'n_estimators': [50, 100],
        'learning_rate': [0.01, 0.1]
    })
}

# Train and evaluate models
# For each candidate: tune (if a grid is given) and fit on the training split,
# report 5-fold cross-validated accuracy on that same split, then report the
# classification metrics on the held-out test split.
for model_name, (model, params) in models.items():
    if params:
        # Hyperparameters are selected with 5-fold CV on the training data only.
        grid_search = GridSearchCV(model, params, cv=5)
        grid_search.fit(X_train, y_train)
        best_model = grid_search.best_estimator_
    else:
        # Nothing to tune: fit the estimator directly.
        model.fit(X_train, y_train)
        best_model = model

    # Cross-validate the best model
    # Restricted to the training split so the held-out test rows never take
    # part in validation. cross_val_score fits clones of the estimator, so
    # `best_model` itself stays fitted on X_train for the test evaluation below.
    scores = cross_val_score(best_model, X_train, y_train, cv=5)
    print(f"{model_name} - Cross-validated accuracy: {scores.mean():.2f}")

    # Evaluate on the test set
    y_pred = best_model.predict(X_test)
    print(classification_report(y_test, y_pred))