<a href="https://colab.research.google.com/github/MercadoMR/AITraining/blob/main/Model_Names.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%load_ext autoreload
%autoreload
import numpy as np
import pandas as pd
import json
import re
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split

nltk.download('stopwords')
nltk.download('punkt')

!pip install unidecode
!pip install faker

import unidecode

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
Collecting unidecode
  Downloading Unidecode-1.3.4-py3-none-any.whl (235 kB)
[K     |████████████████████████████████| 235 kB 4.8 MB/s 
[?25hInstalling collected packages: unidecode
Successfully installed unidecode-1.3.4
Collecting faker
  Downloading Faker-13.3.5-py3-none-any.whl (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 5.1 MB/s 
Installing collected packages: faker
Successfully installed faker-13.3.5


# Feature Extraction from Text

## Load a dataset

In [None]:
# Labelled examples: one free-text string per row ('Nombre') and its class
# label ('etiqueta').
desc_csv = 'names_bd_2 (1).csv'
# Read only the two columns the notebook uses; the file also carries a saved
# row index that would otherwise show up as a spurious "Unnamed: 0" column.
df = pd.read_csv(desc_csv, encoding='utf-8', usecols=['Nombre', 'etiqueta'])
df.head(10)

Unnamed: 0,Nombre,etiqueta
0,Eric Montoya,name
1,Francisca Magaña,name
2,Javier Cabán,name
3,Luz Ilse Arce Alba,name
4,Cristian Isabela Molina Estévez,name
5,Claudia Mercedes Mayorga,name
6,Nelly Mateo Castro Benavides,name
7,Sessa Medrano,name
8,Gabriel María Cristina Ballesteros Rascón,name
9,Patricio de la Crúz,name


In [None]:
# Features (raw text) and target labels.
# Take an explicit copy of the text column so that normalising X later on
# cannot write through to the original DataFrame.
X = df['Nombre'].copy()
y = df['etiqueta']
X.head(10)


0                                 Eric Montoya
1                             Francisca Magaña
2                                 Javier Cabán
3                           Luz Ilse Arce Alba
4              Cristian Isabela Molina Estévez
5                     Claudia Mercedes Mayorga
6                 Nelly Mateo Castro Benavides
7                                Sessa Medrano
8    Gabriel María Cristina Ballesteros Rascón
9                          Patricio de la Crúz
Name: Nombre, dtype: object

In [None]:
# NLTK's Spanish stop words, plus an accent-stripped variant of each one.
# normalize_text removes accents *before* filtering, so without the stripped
# variants an accented stop word (e.g. "también" -> "tambien") would never
# match and would survive the filter.
_spanish_stop_words = stopwords.words("spanish")
stop_words = set(_spanish_stop_words) | {
    unidecode.unidecode(word) for word in _spanish_stop_words
}


def normalize_text(text_str):
    """Return an upper-case, accent-free, stop-word-free version of ``text_str``.

    Steps, in order:
      1. transliterate to ASCII with ``unidecode``;
      2. replace newlines with commas;
      3. tokenize with NLTK's ``word_tokenize``;
      4. drop tokens whose lower-case form is in ``stop_words``;
      5. join the remaining tokens with single spaces, upper-case the result
         and delete every ".".

    Args:
        text_str: the text to normalise (a ``str``).

    Returns:
        The normalised string.
    """
    unaccented = unidecode.unidecode(text_str).replace("\n", ",")
    # Remove stop words (case-insensitively).
    kept_tokens = [
        token for token in word_tokenize(unaccented)
        if token.lower() not in stop_words
    ]
    return " ".join(kept_tokens).upper().replace(".", "")

# Normalise every entry of X. Series.map works element by element and keeps
# the existing index, so it does not depend on the index labels being 0..n-1
# and it builds a new Series instead of overwriting values one at a time.
# str() guards against non-string cells such as NaN; normalize_text already
# strips accents, so no separate unidecode pass is needed here.
X = X.map(lambda raw_value: normalize_text(str(raw_value)))


In [None]:
# Hold out 33% of the samples for evaluation; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

# Train using an SVC and GridSearch

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Fit an RBF-kernel SVM on TF-IDF features, tuning C and gamma by grid search.
print("Fitting the classifier to the training set")

# Candidate hyper-parameter values explored by the grid search.
param_grid = {
    'C': [5e3, 1e4, 1e5],
    'gamma': [0.01, 0.1],
}

# probability=True is what makes predict_proba available on the fitted SVC.
clf = GridSearchCV(
    SVC(kernel='rbf', class_weight='balanced', probability=True),
    param_grid,
)

# TF-IDF vectorisation followed by the grid-searched SVM.
text_clf_SVM = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', clf),
])

# Feed the training data through the pipeline
text_clf_SVM.fit(X_train, y_train)


Fitting the classifier to the training set


Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('clf',
                 GridSearchCV(estimator=SVC(class_weight='balanced',
                                            probability=True),
                              param_grid={'C': [5000.0, 10000.0, 100000.0],
                                          'gamma': [0.01, 0.1]}))])

## Evaluation

In [None]:
from sklearn import metrics

# Predict labels for the held-out samples.
predictions = text_clf_SVM.predict(X_test)

# Confusion matrix: rows are the true classes, columns the predicted ones.
confusion = metrics.confusion_matrix(y_test, predictions)
print(confusion)

[[366   1]
 [  2 470]]


In [None]:
# Score a sample person name.
desc_auto = 'Juan Aurelio Ortiz Morales'
# Apply the same normalisation the training texts went through; otherwise
# accents and stop words in the input would yield tokens that differ from
# what the vectorizer was fitted on.
res = text_clf_SVM.predict_proba([normalize_text(desc_auto)])
print(res)
# Per-class probabilities, rounded for readability.
print(round(res[0][0], 5), round(res[0][1], 5))

[[9.9999990e-01 1.0000001e-07]]
1.0 0.0


In [None]:
# Score a sample that is not a person name (a car model string).
desc_auto_2 = 'AUDI A3 SEDAN AMBIENTE'
# Apply the same normalisation the training texts went through; otherwise
# accents and stop words in the input would yield tokens that differ from
# what the vectorizer was fitted on.
res2 = text_clf_SVM.predict_proba([normalize_text(desc_auto_2)])
print(res2)
# Per-class probabilities, rounded for readability.
print(round(res2[0][0], 5), round(res2[0][1], 5))

[[3.15161064e-10 1.00000000e+00]]
0.0 1.0


## Save model

In [None]:
import pickle

# Serialise the whole fitted pipeline (vectorizer + grid-searched SVM) so it
# can be reloaded without retraining.
with open('name_classifier.pickle', mode='wb') as model_file:
    pickle.dump(text_clf_SVM, model_file)

In [None]:
# Load the pickled pipeline back from disk. Note that this rebinds `clf`,
# which earlier held the GridSearchCV object, to the full pipeline.
# Only unpickle files you trust: pickle.load can execute arbitrary code.
with open('name_classifier.pickle', mode="rb") as model_file:
    clf = pickle.load(model_file)

In [None]:
name = "Venustiano Carranza SN"
# Alternative input to try:
# name = "Financiamiento por pago fraccionado"

# Normalise the input like the training data, then get per-class probabilities.
text_line = [normalize_text(name.upper())]
prediction = clf.predict_proba(text_line)
first_class_prob, second_class_prob = prediction[0][0], prediction[0][1]

# Column 0 is presumably the 'name' class (check clf.classes_ to confirm);
# it is reported only when it exceeds 0.9.
if first_class_prob > 0.9:
    print(name, round(first_class_prob, 5))

# The second column's probability is always printed.
print(name, round(second_class_prob, 5))

Venustiano Carranza SN 0.99361
Venustiano Carranza SN 0.00639
