# Imports and data loading

In [1]:
import pandas as pd

from sklearn.dummy import DummyClassifier           # Baseline model - shouldn't be used for real problems
from sklearn.linear_model import LogisticRegression # Logistic regression (a linear classifier, not linear regression)
from sklearn.metrics import accuracy_score          # Scoring metric

# Load the CSV into a pandas DataFrame
df = pd.read_csv("./data/polish_names.csv")

# Info 

In [9]:
# Print a summary of the frame: number of rows, columns, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1705 entries, 0 to 1704
Data columns (total 3 columns):
name      1705 non-null object
gender    1705 non-null object
target    1705 non-null int64
dtypes: int64(1), object(2)
memory usage: 40.0+ KB


# Prepare Stuff

In [10]:
# Encode gender as a numeric target: 1 for 'm', 0 for any other value.
# Note: this overwrites the `target` column that is already present in the CSV.
df['target'] = df['gender'].map(lambda x: int(x == 'm'))

# First MODEL - a baseline, just to have something to compare the real models with.
# DummyClassifier ignores the feature values, so X is only a placeholder here;
# never pass the target column as a feature to a real model (label leakage).
X = df[["target"]].values    # MATRIX - features for our objects
y = df['target'].values      # VECTOR - target variable


# 1. Choose model - algorithm that will be used.
#    random_state pins the randomized strategies ("stratified" is the default in
#    older scikit-learn releases) so the baseline score is the same on every run.
model = DummyClassifier(random_state=0)
# 2. Train the chosen model
model.fit(X, y)
# 3. Response from MODEL
y_pred = model.predict(X)
# 4. Check accuracy of the MODEL (measured on the same rows it was fitted on)
print("DummyClassifier: (just for tests):", accuracy_score(y, y_pred))

# Wrap steps 1-4 above into a reusable function
def train_and_predict(X,y,model, success_metric=accuracy_score):
    """Fit ``model`` on ``(X, y)`` and score its predictions on the same data.

    The score is computed on the rows the model was trained on, so it is a
    training-set score rather than an estimate on unseen data.

    Parameters
    ----------
    X : feature matrix passed to ``model.fit`` and ``model.predict``.
    y : target values passed to ``model.fit`` and to the metric.
    model : object exposing ``fit(X, y)`` and ``predict(X)``.
    success_metric : callable ``(y_true, y_pred) -> score``;
        defaults to ``accuracy_score``.

    Returns
    -------
    Whatever ``success_metric(y, y_pred)`` returns.
    """
    model.fit(X, y)
    y_pred = model.predict(X)
    # Use the metric the caller asked for instead of always using accuracy_score.
    return success_metric(y, y_pred)

DummyClassifier: (just for tests): 0.526099706745


# Feature Engineering 

## Name Length

In [32]:
# Feature: number of characters in each name
df["name_length"] = df["name"].map(len)

# Target vector and a single-column feature matrix for the model
y = df["target"].values
X = df[["name_length"]].values

In [33]:
# Logistic regression on the current X; the score is computed on the same rows used for fitting
train_and_predict(X,y,LogisticRegression())

0.60586510263929616

## Vowels Counter

In [36]:
# Polish vowel letters, including the accented ones (ą, ę, ó)
vowels = ['a', 'ą', 'e', 'ę', 'i', 'o', 'ó', 'u', 'y']

def count_vowels(name):
    """Return how many letters of ``name`` are vowels, ignoring case.

    Membership is checked against the module-level ``vowels`` list after
    lower-casing ``name``. An empty string yields 0.
    """
    return sum(letter in vowels for letter in name.lower())

# Per-name vowel count stored as a new feature column
df["count_vowels"] = df["name"].map(count_vowels)

In [48]:
# Two features now: name length and vowel count (y is the target vector built in an earlier cell)
train_and_predict( df[ ["name_length", "count_vowels"] ], y, LogisticRegression() )

0.71436950146627565

## First is Vowel

In [55]:
# Polish vowel letters, including the accented ones (ą, ę, ó)
vowels = ['a', 'ą', 'e', 'ę', 'i', 'o', 'ó', 'u', 'y']

def isFirstVowel(name):
    """Return True when the first letter of ``name`` is a vowel, ignoring case.

    Membership is checked against the module-level ``vowels`` list.
    An empty string returns False instead of raising IndexError.
    """
    # Slicing with [:1] yields '' for an empty name, which is not in the list.
    return name.lower()[:1] in vowels

# Boolean feature: does the name start with a vowel?
df["vowel_first"] = df["name"].map(isFirstVowel)    

In [56]:
# Three features now: name length, vowel count and the first-letter-is-vowel flag
train_and_predict( df[ ["name_length", "count_vowels", "vowel_first"] ], y, LogisticRegression() )

0.72961876832844574

## First Letter

In [59]:
# How does factorize work? It encodes each distinct value as an integer code,
# numbering the distinct values in the order they first appear.
pd.factorize(["a","aa", "a", "b","c"]) # returns a TUPLE of two arrays: (codes, unique values)

(array([0, 1, 0, 2, 3]), array(['a', 'aa', 'b', 'c'], dtype=object))

In [None]:
# NOTE(review): this cell re-declares `vowels` and `isFirstVowel`, which the
# "First is Vowel" section already defines; once this cell runs, these later
# definitions are the ones in effect.
# Polish vowel letters, including the accented ones (ą, ę, ó)
vowels = ['a', 'ą', 'e', 'ę', 'i', 'o', 'ó', 'u', 'y']

def isFirstVowel(name):
    """Return True when the first letter of ``name`` is a vowel, ignoring case.

    Membership is checked against the module-level ``vowels`` list.
    An empty string returns False instead of raising IndexError.
    """
    # Slicing with [:1] yields '' for an empty name, which is not in the list.
    return name.lower()[:1] in vowels

# Recomputes the `vowel_first` column (same assignment as in the "First is Vowel" section).
# NOTE(review): the "First Letter" heading and the factorize demo suggest a
# factorize-based first-letter feature was intended here - confirm.
df["vowel_first"] = df["name"].map(isFirstVowel)    