In [5]:
import os
import pickle
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Constants
LEARNED_DATA_FILE = "learned_data.pkl"
CSV_FILE = "name_gender.csv"  # Default CSV file path


# Load learned data from file
def load_learned_data():
    if os.path.exists(LEARNED_DATA_FILE):
        with open(LEARNED_DATA_FILE, "rb") as file:
            return pickle.load(file)
    return pd.DataFrame(columns=["name", "gender"])


# Save learned data to file
def save_learned_data(df):
    with open(LEARNED_DATA_FILE, "wb") as file:
        pickle.dump(df, file)


# Preprocess names using character bigrams
def preprocess_names(names):
    return names.str.lower().str.replace("[^a-z]", "", regex=True)


# Train the model
def train_model(df):
    df["name"] = preprocess_names(df["name"])
    X = df["name"].apply(lambda x: [x[i: i + 2] for i in range(len(x) - 1)])  # Bigrams
    X = X.apply(lambda x: " ".join(x))  # Convert list to string
    y = df["gender"]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model = LogisticRegression()
    model.fit(X_train.values.reshape(-1, 1), y_train)  # Reshape for single feature
    predictions = model.predict(X_test.values.reshape(-1, 1))

    print(f"Accuracy: {accuracy_score(y_test, predictions):.2f}")
    return model


# Predict gender
def predict_gender(model, name, learned_data):
    name = preprocess_names(pd.Series([name]))[0]
    name_bigrams = " ".join([name[i: i + 2] for i in range(len(name) - 1)])  # Create bigrams

    # Check learned data
    if name in learned_data["name"].values:
        gender = learned_data.loc[learned_data["name"] == name, "gender"].values[0]
        print(f"Predicted gender using learned data: {gender}")
        return gender
    else:
        prediction = model.predict([name_bigrams])
        return prediction[0]


# Main function to handle user interaction
def main():
    action = input("Choose A) Predict B) Train: ").strip().lower()

    if action == "b":
        if os.path.exists(CSV_FILE):
            rawdata = pd.read_csv(CSV_FILE)
            model = train_model(rawdata)
            print("Model training complete.")
        else:
            print("CSV file not found for training.")
            return
    elif action == "a":
        learned_data = load_learned_data()
        # Initialize model
        model = LogisticRegression()  # Placeholder model, must be replaced with a trained one
        # Note: Ideally, you would load a previously trained model here
        if not os.path.exists(LEARNED_DATA_FILE):
            print("No learned data available. Please train the model first.")
            return

    # Continuous name prediction loop
    while True:
        name = input("Enter a name (or type 'exit' to quit): ")
        if name.lower() == "exit":
            break

        predicted_gender = predict_gender(model, name, learned_data)
        print(f"The predicted gender for '{name}' is: {predicted_gender}")

        correct = input("Was the prediction correct? (yes/no): ").strip().lower()
        if correct == "no":
            correct_gender = input("Please provide the correct gender (male/female): ").strip().lower()
            learned_data = learned_data.append({"name": name, "gender": correct_gender}, ignore_index=True)
            save_learned_data(learned_data)


if __name__ == "__main__":
    main()


Choose A) Predict B) Train:  B


ValueError: could not convert string to float: 'ka ar rv ve er'