In [None]:
import pandas as pd
from time import time
import enchant
from metaphone import doublemetaphone

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import CategoricalNB,BaseNB,BernoulliNB, MultinomialNB, GaussianNB
from sklearn.svm import LinearSVC,SVC
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Padding width constant: each name is wrapped in (MAX_CHAR - 1) underscores
# on both sides before being vectorized into character n-grams.
MAX_CHAR = 4

# Hyper-parameters searched for every penalty/loss/dual combination below.
_SHARED_GRID = {
    'model__tol': [0.00001, 0.0001, 0.001],
    'model__C': [0.001, 0.1, 0.5, 1.0, 3, 5, 10],
    'vectorizer__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4), (1, 5), (1, 6)],
}

# LinearSVC only supports some penalty/loss/dual combinations. A single
# cartesian grid over all three also generates unsupported ones (l1 + hinge,
# l1 + squared_hinge with dual=True, l2 + hinge with dual=False), whose fits
# fail. GridSearchCV accepts a list of grids, so only the supported
# combinations are enumerated here.
params = [
    {
        'model__penalty': ['l2'],
        'model__loss': ['squared_hinge'],
        'model__dual': [True, False],
        **_SHARED_GRID,
    },
    {
        'model__penalty': ['l2'],
        'model__loss': ['hinge'],
        'model__dual': [True],
        **_SHARED_GRID,
    },
    {
        'model__penalty': ['l1'],
        'model__loss': ['squared_hinge'],
        'model__dual': [False],
        **_SHARED_GRID,
    },
]

def text_bounds(text,ngram_range):
    """Wrap ``text`` in underscore padding on both sides.

    Args:
        text: The value to pad (concatenated with ``+``).
        ngram_range: Iterable of n-gram sizes; its maximum sets the padding
            width to ``max(ngram_range) - 1`` underscores per side.

    Returns:
        ``text`` with the padding prepended and appended.
    """
    padding = '_' * (max(ngram_range) - 1)
    return padding + text + padding

def add_dmetaphone_char(text,ngram_range):
    """Append an underscore-padded double-metaphone code to ``text``.

    Args:
        text: The string to augment.
        ngram_range: Iterable of n-gram sizes; its maximum sets the padding
            width to ``max(ngram_range) - 1`` underscores per side.

    Returns:
        ``text``, a single space, then the first code returned by
        ``doublemetaphone(text)`` wrapped in the underscore padding.
    """
    padding = '_' * (max(ngram_range) - 1)
    # The original built this string but never returned it, so callers
    # always received None.
    return text + ' ' + padding + doublemetaphone(text)[0] + padding
    
# Load the name table, drop rows without a name, and keep only rows whose
# SEX label is 'F' or 'M'.
model_df = pd.read_csv('nomes-censos-ibge-v2.csv', delimiter=';').dropna(subset=['Nome'])
model_df = model_df[model_df['SEX'].isin(['F', 'M'])]

# Pad each name with (MAX_CHAR - 1) underscores on both sides
# (original author's note: large boost to performance).
model_df['Nome'] = (MAX_CHAR-1)*'_' + model_df['Nome'] + '_'*(MAX_CHAR-1)
# Append the padded double-metaphone code; note it is computed from the
# already-padded name produced on the previous line.
model_df['Nome'] = model_df['Nome'] + ' ' + (MAX_CHAR-1)*'_' + model_df['Nome'].apply(lambda x: doublemetaphone(x)[0]) + '_'*(MAX_CHAR-1)


X = model_df['Nome']
y = model_df['SEX']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

# Character n-gram counts feeding a linear SVM; the grid search tunes both steps.
pipe = Pipeline([('vectorizer', CountVectorizer(analyzer='char')),
                 ('model', LinearSVC())])

grid_search = GridSearchCV(pipe, params, verbose = 1)
t0 = time()
grid_search.fit(X_train, y_train)
print("done in %0.3fs" % (time() - t0))

# Evaluate the refit best pipeline on the held-out split. The original
# referenced `y_pred` without defining it in this cell, which raises
# NameError on a fresh kernel (or silently scores stale predictions).
y_pred = grid_search.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# Tabulate every grid-search candidate, best rank first. dropna() removes rows
# containing any NaN -- presumably the candidates whose fits failed.
(
    pd.DataFrame(grid_search.cv_results_)
    .sort_values(by='rank_test_score')
    .dropna()
)

In [None]:
# Display the best pipeline (vectorizer + model) selected by the grid search.
grid_search.best_estimator_

In [None]:
# Padding width: names are wrapped in (MAX_CHAR - 1) underscores per side.
MAX_CHAR = 4

# Reload and filter the data so this cell starts from the raw CSV.
model_df = pd.read_csv('nomes-censos-ibge-v2.csv', delimiter=';').dropna(subset=['Nome'])
model_df = model_df[model_df['SEX'].isin(['F', 'M'])]

# Pad each name (original author's note: large boost to performance), then
# append the padded double-metaphone code computed from the padded name.
model_df['Nome'] = (MAX_CHAR-1)*'_' + model_df['Nome'] + '_'*(MAX_CHAR-1)
model_df['Nome'] = model_df['Nome'] + ' ' + (MAX_CHAR-1)*'_' + model_df['Nome'].apply(lambda x: doublemetaphone(x)[0]) + '_'*(MAX_CHAR-1)

X = model_df['Nome']
y = model_df['SEX']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# Reuse the tuned vectorizer and classifier found by the grid search and refit
# them on this split. These are the same objects held by
# grid_search.best_estimator_, so fitting here updates that pipeline in place.
# (The original first built a fresh CountVectorizer / LinearSVC and then
# immediately overwrote them; those dead assignments are removed.)
vect = grid_search.best_estimator_['vectorizer']
X_train_vect = vect.fit_transform(X_train)
X_test_vect = vect.transform(X_test)

model = grid_search.best_estimator_['model']

# LinearSVC accepts sparse input directly; converting the n-gram count matrix
# with .toarray() only inflates memory use.
model.fit(X_train_vect, y_train)
y_pred = model.predict(X_test_vect)

print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
test_text = 'THOMAS'

# Build the same representation the model was trained on: the name padded with
# (MAX_CHAR - 1) underscores per side, followed by a space and the padded
# double-metaphone code of the padded name. Vectorizing the raw name instead
# would give features that do not match the training data.
pad = '_' * (MAX_CHAR - 1)
padded_text = pad + test_text + pad
test_features = vect.transform(
    [padded_text + ' ' + pad + doublemetaphone(padded_text)[0] + pad]
)

# Not every estimator exposes predict_proba; fall back to the decision
# function when it is missing, instead of a bare `except` that would hide
# unrelated errors.
if hasattr(model, 'predict_proba'):
    print(model.predict_proba(test_features))
else:
    print(model.decision_function(test_features))

print(model.predict(test_features))