# Few Imports

In [14]:
import pandas as pd
import numpy as np

# for repeatability 
np.random.seed(1) 

#models (algorithms)
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

# result validation
from sklearn.model_selection import train_test_split

#our success metric
from sklearn.metrics import accuracy_score  

# visualization
import matplotlib.pyplot as plt

# plots inside of notebook
%matplotlib inline 


# Load CSV: type 
df = pd.read_csv("./data/polish_names.csv")

# # Convert to gender (f,m) to numeric values (0,1)
df['target'] = df['gender'].map(lambda x: int(x == 'm'))

In [15]:
def train_and_predict_model(X_train, X_test, y_train, y_test, model, success_metric=accuracy_score):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return success_metric(y_test, y_pred)

# Feature Engineering
Features from `1_LogisticRegression_prediction` Notebook 

In [17]:
vowels = ['a', 'ą', 'e', 'ę', 'i', 'o', 'u', 'y']

def how_many_vowels(name):
    return sum( map(lambda x: int(x in vowels), name.lower()) )
    
def first_is_vowel(name):
    return name.lower()[0] in vowels

def last_is_vowel(name):
    return name.lower()[-1] in vowels

def get_all_vowels(name):
    all_vowels = [letter for letter in name.lower() if letter in vowels]
    
    return ''.join(all_vowels)

def get_all_consonants(name):
    all_consonants = [letter for letter in name.lower() if letter not in vowels]
    
    return ''.join(all_consonants)

def feature_engineering(df):
    df['len_name'] = df['name'].map(lambda x: len(x))
    
    df['first_is_vowel'] = df['name'].map(first_is_vowel)
    df['last_is_vowel'] = df['name'].map(last_is_vowel)
    
    df['first_letter'] = df['name'].map(lambda x: x.lower()[0])
    df['first_letter_cnt'] = pd.factorize(df['first_letter'])[0]
    
    df['last_letter'] = df['name'].map(lambda x: x.lower()[-1])
    df['last_letter_cnt'] = pd.factorize(df['last_letter'])[0]
    
    df['all_vowels'] = df['name'].map(get_all_vowels)
    df['all_vowels_cnt'] = pd.factorize(df['all_vowels'])[0]

    df['all_consonants'] = df['name'].map(get_all_consonants)
    df['all_consonants_cnt'] = pd.factorize(df['all_consonants'])[0]
    
    
    return df  

In [21]:
df_fe = feature_engineering(df)

features = ['len_name', 'first_is_vowel', 'last_is_vowel', 'first_letter_cnt', 'last_letter_cnt', 'all_vowels_cnt', 'all_consonants_cnt']
X = df_fe[ features ]
y = df_fe['target']

train_and_predict_model(X, X, y, y, LogisticRegression())

0.9524926686217009

# Split for TRAIN and TEST
Train: 70%
Test: 30% 


In [38]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

print( "X before: ", X.shape )
print ( "X Train: ", X_train.shape )
print ( "X Test: ", X_test.shape )


X before:  (1705, 7)
X Train:  (1193, 7)
X Test:  (512, 7)
