In [None]:
# Get data to work with

from observatorio_laboral.offer.offer_controller import OfferController
from observatorio_laboral.offer.date_range import DateRange

text_fields = []
oc = OfferController(text_fields = ["Job Title", "Description", "Qualifications", "Software"])
date_range = DateRange(1, 2013, 5, 2017)
source = "symplicity"

oc.load_offers(source, date_range)
oc.filter_offers_by_career("ECONOMÍA")
all_offers = oc.offers
oc.filter_offers_by_field("Areas")

offer_texts = oc.get_text()
offer_classes = oc.get_field_labels("Areas")

data = offer_texts
classes = offer_classes
print(len(data))
print(len(all_offers))

In [239]:
# Get data to work with

from observatorio_laboral.offer.offer_controller import OfferController
from observatorio_laboral.offer.date_range import DateRange

text_fields = []
oc = OfferController(text_fields = ["Job Title", "Description", "Qualifications"])
date_range = DateRange(1, 2013, 5, 2017)
source = "symplicity"

oc.load_offers(source, date_range)
oc.filter_offers_by_career("ECONOMÍA")
all_offers = oc.offers
oc.filter_offers_by_field("Areas")

offer_texts = oc.get_text()
offer_classes = oc.get_field_labels("Areas")

data = offer_texts
classes = offer_classes
print(len(data))
print(len(all_offers))

553
12653


In [240]:
# Check quantity per labels
offer_by_labels = {}
for offer in oc.offers:
    labels = " ".join(sorted(oc.get_field_labels("Areas", offer)))
    if labels not in offer_by_labels:
        offer_by_labels[labels] = []
        
    offer_by_labels[labels].append(offer)

In [241]:
qnt_list = [(label.split(), len(offers)) for label, offers in offer_by_labels.items()]
qnt_list.sort(key = lambda k: k[1], reverse=True)

In [242]:
qnt_list

[(['PP'], 141),
 (['FI', 'PP'], 125),
 (['FI'], 99),
 (['EM', 'FI'], 33),
 (['MC'], 32),
 (['EM'], 26),
 (['FI', 'MC'], 22),
 (['MC', 'PP'], 15),
 (['EM', 'MC'], 7),
 (['MC', 'TE'], 7),
 (['FI', 'MC', 'PP'], 7),
 (['OI'], 5),
 (['EM', 'FI', 'MC'], 4),
 (['FI', 'MC', 'OI', 'TE'], 3),
 (['PP', 'TE'], 3),
 (['MC', 'OI'], 3),
 (['FI', 'MC', 'TE'], 2),
 (['FI', 'OI', 'PP'], 2),
 (['EI', 'FI'], 2),
 (['MC', 'OI', 'TE'], 1),
 (['EI'], 1),
 (['FI', 'MC', 'OI'], 1),
 (['EI', 'OI'], 1),
 (['EI', 'MC', 'OI', 'TE'], 1),
 (['FI', 'TE'], 1),
 (['EM', 'TE'], 1),
 (['EM', 'PP'], 1),
 (['EM', 'FI', 'MC', 'TE'], 1),
 (['FI', 'OI'], 1),
 (['MC', 'PP', 'TE'], 1),
 (['EM', 'OI'], 1),
 (['OI', 'PP'], 1),
 (['EI', 'MC'], 1),
 (['EI', 'FI', 'PP'], 1)]

In [243]:
# Offers by label

qnt_by_lab = {}
for labels, qnt in qnt_list:
    for lab in labels:
        if lab not in qnt_by_lab: 
            qnt_by_lab[lab] = 0
            
        qnt_by_lab[lab] += qnt
        
import operator
print(sorted(qnt_by_lab.items(), key=operator.itemgetter(1), reverse=True))


[('FI', 304), ('PP', 297), ('MC', 108), ('EM', 74), ('TE', 21), ('OI', 20), ('EI', 7)]


In [244]:
# Over-sampling approach
X = data
y = classes

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=42)

mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(y_train)
y_test = mlb.transform(y_test)


In [245]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

lemmatizer = {}

class CustomTokenizer(object):
    def __init__(self, lemmatizer):
        self.wnl = lemmatizer
        self.tok = TfidfVectorizer().build_tokenizer()
        
    def __call__(self, doc):
        return [word for word in self.tok(doc)]
    
        tokens = []
        for t in self.tok(doc):
            if t in self.wnl:
                tokens.append(self.wnl[t])
            else:
                tokens.append(t)
                
        return tokens


tfidf_vect = TfidfVectorizer(ngram_range=(1,1), max_df=0.55, norm=None, use_idf=False,
                            tokenizer=CustomTokenizer(lemmatizer))



#tfidf_vect = TfidfVectorizer(ngram_range=(1,2), max_df=0.55)
tfidf_vect.fit(X)
len(tfidf_vect.vocabulary_)


6850

In [246]:
# Features by label
tfidf = tfidf_vect.transform(X)
X_inv = tfidf_vect.inverse_transform(tfidf)

features_by_label = {}
for x, labels in zip(X_inv, y):
    for label in labels:
        if label not in features_by_label:
            features_by_label[label] = {}
            
        for word in x:
            if word not in features_by_label[label]:
                features_by_label[label][word] = 0
                
            features_by_label[label][word] += 1   


In [247]:
import operator

for label in features_by_label:
    print("Label: ", label, "Words: ", len(features_by_label[label]))    
    for item in sorted(features_by_label[label].items(), key=operator.itemgetter(1), reverse=True):
        pass
        #print(item)
    #print()    

Label:  EI Words:  606
Label:  FI Words:  5081
Label:  EM Words:  2066
Label:  MC Words:  2768
Label:  TE Words:  745
Label:  OI Words:  1033
Label:  PP Words:  5327


In [248]:
# Exclusive Features by labels
exc = {}
for label, words in features_by_label.items():
    exc[label] = {}
    for word in words:
        found = False
        for l2, w2 in features_by_label.items():            
            if l2!= label:
                if word in w2:
                    found = True
        if not found:
            exc[label][word] = words[word]                

In [249]:
exc

{'EI': {'aduana': 1,
  'aduanera': 1,
  'aduanero': 1,
  'anticipado': 1,
  'copias': 1,
  'courier': 1,
  'exportaciones': 1,
  'factura': 1,
  'ilo': 1,
  'importada': 1,
  'marítimos': 1,
  'matarani': 1,
  'naviera': 1,
  'navieras': 1,
  'originales': 1,
  'paita': 1,
  'peligrosos': 1,
  'proforma': 1,
  'puertos': 1,
  'reventa': 1,
  'terrestre': 1},
 'EM': {'actúa': 1,
  'acumulado': 1,
  'adelantados': 1,
  'adjudicada': 1,
  'administrada': 1,
  'afiliaciones': 1,
  'agendar': 1,
  'agroquímicos': 1,
  'alineamiento': 1,
  'anomalías': 1,
  'antena': 1,
  'armando': 1,
  'asegurados': 1,
  'automotivación': 1,
  'av': 1,
  'bloqueados': 1,
  'buscan': 1,
  'cara': 1,
  'castillos': 1,
  'ccsd': 1,
  'cercanas': 1,
  'certificadas': 2,
  'chico': 1,
  'chorrillos': 1,
  'cobros': 1,
  'comparar': 1,
  'comparativas': 1,
  'concretadas': 1,
  'confesiones': 1,
  'confitería': 1,
  'consolida': 1,
  'consultiva': 1,
  'cruzar': 1,
  'crédito_': 1,
  'cumplido': 1,
  'decision':

In [135]:
result = {}

In [177]:
low = {}
for x, labels in zip(X_inv, y):
    labels.sort()
    #if labels in [["TE"], ["EI"], ["OI"], ['EI', 'MC'],['EI', 'MC', 'OI', 'TE'],
    #              ['MC', 'OI', 'TE'],['MC', 'TE']]:
    if labels == ['MC', 'OI', 'TE']: #['OI']: #['MC', 'TE']: #['EI', 'MC']: #['EI', 'MC', 'OI', 'TE']:#, ['MC', 'OI', "TE"]]:
        for word in x:
            if word not in low:
                low[word] = 0 
            low[word] += 1
        

In [178]:
print(len(low))
sorted(low.items(), key=operator.itemgetter(1), reverse=True)

61


[('responsable', 1),
 ('referidos', 1),
 ('office', 1),
 ('manejo', 1),
 ('intermedio', 1),
 ('estado', 1),
 ('voz', 1),
 ('técnico', 1),
 ('ingresos', 1),
 ('datos', 1),
 ('deseable', 1),
 ('análisis', 1),
 ('empresas', 1),
 ('regulatorias', 1),
 ('probabilísticos', 1),
 ('elaboración', 1),
 ('volúmenes', 1),
 ('profesional', 1),
 ('internos', 1),
 ('estadísticos', 1),
 ('consultas', 1),
 ('costos', 1),
 ('organismos', 1),
 ('impacto', 1),
 ('indicadores', 1),
 ('materia', 1),
 ('microsoft', 1),
 ('telecomunicaciones', 1),
 ('interconexión', 1),
 ('información', 1),
 ('elaborar', 1),
 ('regulación', 1),
 ('liquidación', 1),
 ('cálculo', 1),
 ('evaluación', 1),
 ('budget', 1),
 ('otras', 1),
 ('nivel', 1),
 ('contribuyan', 1),
 ('informes', 1),
 ('analista', 1),
 ('tráfico', 1),
 ('proyectos', 1),
 ('redacción', 1),
 ('avanzado', 1),
 ('por', 1),
 ('forecast', 1),
 ('políticas', 1),
 ('grandes', 1),
 ('económicos', 1),
 ('medidas', 1),
 ('excel', 1),
 ('monitoreo', 1),
 ('procesos', 1)

In [179]:
low_words = [word for word, cnt in low.items()]

In [184]:
filtered = {}
for offer in all_offers:
    text = oc.get_text(offer)
    tfidf = tfidf_vect.transform([text])
    words = tfidf_vect.inverse_transform(tfidf)
    
    cnt = 0
    for word in words[0]:        
        if word in low_words:
            cnt += 1            
    
    if cnt> 10 and (cnt/len(words[0])) > 0.35:
        print(len(words[0]))
        filtered[offer.id] = offer        
            
len(filtered)

32
46
61
47
26
34
35
29
31
31
32
32
33
35
29
30
35
30
30
27
36
45
40


22

In [185]:
for key in filtered:
    result[key] = filtered[key]

In [186]:
len(result)

75

In [190]:
import csv

with open("NewData.csv", "w") as csvfile:
    fieldnames = ["id", "year", "month", "source", "Job Title", "Description", "Qualifications", "Organization Name"]
    writer = csv.DictWriter(csvfile, fieldnames)
    writer.writeheader()    
    for id, offer in result.items():
        row = {}
        row['id'] = offer.id
        row['year'] = offer.year
        row['month'] = offer.month
        row['source'] = offer.source
        
        row['Job Title'] = offer.features["Job Title"]
        row['Description'] = offer.features["Description"]
        row['Qualifications'] = offer.features['Qualifications']
        row['Organization Name'] = offer.features['Organization Name']
        
        writer.writerow(row)


In [141]:
for key in result:
    print(key)
    break

practicante de estadística apoyar en la elaboración de informes de servicios de información estadística del sistema bancario elaborar reportes consolidar información estadística apoyar en el desarrollo de proyectos de investigación del área apoyar en la atención de requerimientos y consultas sobre información financiera estadística del sector manejo de base de datos a nivel avanzado


In [134]:
oc.get_text(filtered[0])

#"Economía Internacional": "EI",
#"Finanzas": "FI",
#"Teoría Económica" : "TE",
#"Organización Industrial" : "OI",
#"Métodos Cuantitativos/Investigación económica" : "MC",
#"Proyectos/Planeamiento estratégico" : "PP",
#"Estudios de mercado" : "EM",


KeyError: 0

In [None]:
filtered[0].features['Organization Name']

In [None]:
import csv

with open("NewData.csv", "w") as csvfile:
    writer = csv.DictWriter()
    
    

In [196]:
for label in exc:
    print("Label: ", label, "Words: ", len(exc[label]))
    for item in sorted(exc[label].items(), key=operator.itemgetter(1), reverse=True):
        print(item)
        
    print()

Label:  PP Words:  1102
('conducir', 6)
('compensaciones', 6)
('etapas', 5)
('saneamiento', 4)
('contrataciones', 4)
('valoraciones', 3)
('independientes', 3)
('preferible', 3)
('planteadas', 3)
('bpm', 3)
('régimen', 3)
('conducción', 3)
('sueldo', 3)
('gas', 3)
('lanzamientos', 3)
('mm', 3)
('intervención', 3)
('determinando', 3)
('prestando', 3)
('incrementos', 3)
('evidencias', 3)
('difundir', 3)
('acreditada', 3)
('logísticas', 3)
('20', 3)
('incidencia', 3)
('modernización', 3)
('ganar', 3)
('rh', 2)
('junta', 2)
('postpago', 2)
('leer', 2)
('impresión', 2)
('usuarias', 2)
('acopio', 2)
('comunitarias', 2)
('simplificación', 2)
('rendición', 2)
('expertise', 2)
('articulación', 2)
('inclusive', 2)
('ambas', 2)
('agosto', 2)
('comunidad', 2)
('adjunto', 2)
('éticos', 2)
('concluyente', 2)
('definiendo', 2)
('considera', 2)
('segmentar', 2)
('hospitales', 2)
('después', 2)
('intendencia', 2)
('minedu', 2)
('productiva', 2)
('responsibilities', 2)
('disponer', 2)
('cluster', 2)
('en

In [13]:
# Using a lemmatization dictionary
# Reading

# Post working review: 
# Lemmatization doesn't seems to work cause it reduce the number of "unique" words in low sample offers
lemmatizer = {}
with open("lemmatization-es.txt") as lemma_file:
    for line in lemma_file:
        line = line.split()                
        lemmatizer[line[0]] = line[1]


In [14]:
for label in exc:
    print("Label: ", label)
    for word in exc[label]:
        if word in lemmatizer:
            print(word, lemmatizer[word])    

Label:  MC
boletín boletines
cualquiera cualquier
redactar redactó
extracto extractos
aterrizaje aterrizajes
independiente independientes
mujer mujeres
factor factores
tutoría tutorías
entrevistar entrevistó
inmerso inmersos
francés franceses
útil útiles
sílabo sílabos
periodista periodistas
publicación publicaciones
enseñanza enseñanzas
sintetizar sintetizó
empleo empleos
domingo domingos
despegue despegues
esquematizar esquematizó
sociólogo sociólogos
disciplinario disciplinarios
actualidad actualidades
rampa rampas
guía guías
curiosidad curiosidades
método métodos
ocupación ocupaciones
esencia esencias
comentario comentarios
partida partidas
conexión conexiones
clase clases
estacionamiento estacionamientos
embajada embajadas
psicólogo psicólogos
consultar consultó
manga mangas
preferente preferentes
aerolínea aerolíneas
arma armas
migratorio migratorios
abstracción abstracciones
terminal terminales
conversar conversó
Label:  EM
noche noches
tope topes
ratio ratios
porcentual porcent

In [13]:
# Word tokenizer vs tfidf tokenizer
# Result1: Word tokenizer needs punctuation remove
from nltk import word_tokenize

punctuations = set("•!$%&*+,.//:=;ò?@]`¡¨°–“”−’´¿-()·")

def custom_tokenize(doc):
    translator = str.maketrans("".join(punctuations),' '*len(punctuations))
    doc = doc.translate(translator)    
    return [word.strip("-").replace("'", "") for word in word_tokenize(doc)]


vect1 = TfidfVectorizer(ngram_range=(1,2), max_df=0.55, norm=None, use_idf=False,
                            tokenizer=custom_tokenize)

vect2 = TfidfVectorizer(ngram_range=(1,2), max_df=0.55, norm=None, use_idf=False)

vect1.fit(X)
vect2.fit(X)

# "Words found only by word_tokenize"
only_wt = []
for word in vect1.vocabulary_:    
    if word not in vect2.vocabulary_:
        only_wt.append(word)

only_df = []
# "Words found only by default tokenize"
for word in vect2.vocabulary_:    
    if word not in vect1.vocabulary_:
         only_df.append(word)
            
print(only_wt)
        

['', 'involucrando a', 'y propuestas', 'y segmentos', 'o gestor', 'y generando', 'o órdenes', 'reuniones y', 'o parcial', 'cruce y', 'relación y', 'y con', 'o departamentos', 'métodos y', 'r experiencia', 'prácticos y', 'promover e', 'y compartir', 'interbancarias y', 'a gerencia', 'usos y', 'iniciativa y', 'tesorero o', 'cad y', 'roche y', 'vacaciones u', 'o fraudulenta', 'o negación', 'forward d', 'y dimensionando', 'intermedio \uf0a7', 'y conocimiento', 'operaciones y', 'o mba', 'destitución y', 'y consolidación', 'estabilidad y', 'y hablado', 'identificando y', 'a 05', 'verano y', 'y abastecimiento', 'habilidad y', 'y ecológicas', 'cargo 8', 'o negociar', 'incentivos y', 'y correcta', 'e inmobiliario', 'clientes d', '9 am', 'y liderazgo', 'sobreendeudamiento 4', 'crecimiento y', 'financiero y', 'de m', 'ppt y', 'o futuros', '5 haber', 'acciones a', 'o ejecutivo', 'programa y', 'o ecuador', 'hasta 3', 'gastos y', 'invertibles y', '3 participar', 'y notas', 'y pasajeros', 'mejor a', 

In [14]:
# Finding rare punctuation
common_chars = set("abcdefghijklmnopqrstuvwxyz-0123456789áéíóúñ")

rare_chars = set()
for word in only_wt:    
    for char in word:
        #if char in test_punctuations:
        if char not in common_chars:
            print(word)
            rare_chars.add(char)
            
rare_chars

involucrando a
y propuestas
y segmentos
o gestor
y generando
o órdenes
reuniones y
o parcial
cruce y
relación y
y con
o departamentos
métodos y
r experiencia
prácticos y
promover e
y compartir
interbancarias y
a gerencia
usos y
iniciativa y
tesorero o
cad y
roche y
vacaciones u
o fraudulenta
o negación
forward d
y dimensionando
intermedio 
intermedio 
y conocimiento
operaciones y
o mba
destitución y
y consolidación
estabilidad y
y hablado
identificando y
a 05
verano y
y abastecimiento
habilidad y
y ecológicas
cargo 8
o negociar
incentivos y
y correcta
e inmobiliario
clientes d
9 am
y liderazgo
sobreendeudamiento 4
crecimiento y
financiero y
de m
ppt y
o futuros
5 haber
acciones a
o ejecutivo
programa y
o ecuador
hasta 3
gastos y
invertibles y
3 participar
y notas
y pasajeros
mejor a
actividades 1
a movilizar
y producto
n 088
títulos y
y preparar
pedidos y
dirección 4
benchmarking o
aportando y
transferencia o
plazo e
grandes y
vincular a
mantener y
y herramientas
y forcasting
a clien

elabora y
o procedimientos
cliente 4
4 apoyar
tic s
4 haber
permitan a
eficiente y
confección y
a tráfico
4 gestión
a 60
y planes
usuario y
a entidades
detalles y
o flujo
3 auditar
documentación a
y computacional
y rotación
control e
a 5
a comercio
r y
pedido f
for i
fuentes y
y por
finanzas e
oficinas u
impuesto a
o industrial
o capital
hyperion o
etc 5
descuentos a
datos e
o función
d actualizar
o trabajos
relacionados y
mercadeo y
bloomberg a
seminarios y
3 en
acciones o
y perspectivas
y ppt
complementarias o
archivos e
puesto y
y contribuir
nivel 1
 realizar
 realizar
y apetito
otros a
servicio y
abastecimiento b
categorías a
stakeholders 2
recurrente o
capacitar a
presentaciones a
diaria y
6 participar
venta a
farmacéutico o
7 coordinar
cualitativos y
y optimizar
kpis a
agencias a
a un
y medio
centros y
compañía 2
promocional y
área 5
value y
entre s
crédito o
4 información
y seguros
llevado a
plantear e
gratuito a
7 500
macro y
nuevas y
o comentarios
varianza y
y eficientemente

{' ', '_', '`', 'º', '‐', '\uf02d', '\uf0a7', '\uf0b7', '\uf0fc', '\ufeff'}

In [198]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

lemmatizer = {}

class CustomTokenizer(object):
    def __init__(self, lemmatizer):
        self.wnl = lemmatizer
        self.tok = TfidfVectorizer().build_tokenizer()
        
    def __call__(self, doc):
        return [word for word in self.tok(doc)]
    
        tokens = []
        for t in self.tok(doc):
            if t in self.wnl:
                tokens.append(self.wnl[t])
            else:
                tokens.append(t)
                
        return tokens


tfidf_vect = TfidfVectorizer(ngram_range=(1,1), max_df=0.55,# norm=None, use_idf=False,
                            tokenizer=CustomTokenizer(lemmatizer))



#tfidf_vect = TfidfVectorizer(ngram_range=(1,2), max_df=0.55)
tfidf_vect.fit(X)
len(tfidf_vect.vocabulary_)

6850

In [199]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score


clfs = []

clfs.append(("Ada",OneVsRestClassifier(AdaBoostClassifier(random_state=42))))
clfs.append(("Bernou", OneVsRestClassifier(BernoulliNB())))
clfs.append(("SVC ", OneVsRestClassifier(SVC())))

clfs.append(("Multi", OneVsRestClassifier(MultinomialNB())))

tf_train = tfidf_vect.transform(x_train)
tf_test = tfidf_vect.transform(x_test)
for name, clf in clfs:
    clf.fit(tf_train, y_train)    
    y_pred = clf.predict(tf_test)
    print(name, accuracy_score(y_test, y_pred))


Ada 0.410071942446
Bernou 0.395683453237
SVC  0.230215827338
Multi 0.330935251799


In [82]:
clf = OneVsRestClassifier(MultinomialNB(alpha=1.1))
clf.fit(tf_train, y_train)
y_pred = clf.predict(tf_test)

print(accuracy_score(y_test, y_pred))

0.460431654676


In [83]:
from sklearn.grid_search import GridSearchCV
param_grid = [{'estimator__alpha': (1, 0.1, 0.01, 0.001, 0.0001, 0.00001)}]
clf = GridSearchCV(OneVsRestClassifier(MultinomialNB()), param_grid, cv=4)
tf_train = tfidf_vect.transform(x_train)
clf.fit(tf_train, y_train)
y_pred = clf.predict(tf_test)
print(name, accuracy_score(y_test, y_pred))


Multi 0.460431654676


In [84]:
def ml_score(y_test, y_pred):    
    match_cnt = 0
    total_cnt = 0
    for yt, yp in zip(y_test, y_pred):        
        for i, lt in enumerate(yt):
            if lt == 1 or yp[i] == 1:
                total_cnt += 1
                if lt == yp[i]:
                    match_cnt += 1           
                
    return match_cnt/total_cnt

ml_score(y_test, y_pred)
    

0.5703422053231939

In [86]:
pred_prob = [[format(p, '.1f') for p in probs] for probs in clf.predict_proba(tf_test)]

print("Classes: ", mlb.classes_)
# Economía Internacional
# Finanzas
# Teoría Económica
# Organización Industrial
# Métodos Cuantitativos/Investigación económica
# Proyectos/Planeamiento estratégico
# Estudios de mercado

for yt, yp, prob in zip(y_test, y_pred, pred_prob):
    print("True: ", yt, " Pred: ", yp , "Probs: ", prob)

Classes:  ['FI' 'MC' 'Otros' 'PP']
True:  [1 0 0 1]  Pred:  [1 0 0 1] Probs:  ['1.0', '0.0', '0.0', '1.0']
True:  [1 0 0 1]  Pred:  [1 0 0 1] Probs:  ['1.0', '0.0', '0.0', '1.0']
True:  [0 0 0 1]  Pred:  [0 0 0 1] Probs:  ['0.0', '0.0', '0.0', '1.0']
True:  [0 0 0 1]  Pred:  [1 0 0 1] Probs:  ['1.0', '0.0', '0.0', '0.5']
True:  [0 0 0 1]  Pred:  [1 0 0 1] Probs:  ['1.0', '0.0', '0.1', '1.0']
True:  [1 0 1 0]  Pred:  [1 0 0 0] Probs:  ['1.0', '0.0', '0.0', '0.0']
True:  [1 0 0 0]  Pred:  [1 0 1 0] Probs:  ['1.0', '0.0', '1.0', '0.0']
True:  [1 0 0 0]  Pred:  [1 0 0 1] Probs:  ['1.0', '0.0', '0.0', '1.0']
True:  [1 0 0 0]  Pred:  [1 0 0 1] Probs:  ['1.0', '0.0', '0.0', '1.0']
True:  [1 0 0 1]  Pred:  [1 0 0 1] Probs:  ['1.0', '0.0', '0.0', '1.0']
True:  [0 1 0 1]  Pred:  [0 1 0 0] Probs:  ['0.0', '1.0', '0.0', '0.0']
True:  [1 0 0 1]  Pred:  [1 0 0 1] Probs:  ['1.0', '0.0', '0.0', '1.0']
True:  [0 0 1 0]  Pred:  [0 0 0 0] Probs:  ['0.1', '0.0', '0.0', '0.4']
True:  [0 0 0 1]  Pred:  [0 0

In [296]:
# Get data to work with


from observatorio_laboral.offer.offer_controller import OfferController
from observatorio_laboral.offer.date_range import DateRange

text_fields = []
oc = OfferController(text_fields = ["Job Title", "Description", "Qualifications"])
date_range = DateRange(1, 2013, 5, 2017)
source = "symplicity"

oc.load_offers(source, date_range)
oc.filter_offers_by_career("ECONOMÍA")
all_offers = oc.offers
oc.filter_offers_by_field("Areas", ignore=["EI"])

offer_texts = oc.get_text()
offer_classes = oc.get_field_labels("Areas")

data = offer_texts
classes = offer_classes
print(len(data))
print(len(all_offers))

553
12653


In [295]:
#('TE', 17), ('OI', 12), ('EI', 6)]
new_classes = []
for labels in offer_classes:
    new_labels = set()
    for label in labels:
        if label in ['TE', 'OI', 'EI', "MC"]:
            new_labels.add("Otros")
        else:
            new_labels.add(label)
    tmp_list = list(new_labels)
    tmp_list.sort()
    new_classes.append(tmp_list)
            

In [297]:
# Over-sampling approach
X = data
y = offer_classes

#from imblearn.over_sampling import RandomOverSampler
#ros = RandomOverSampler(random_state=42)
#X_resampled, y_resampled = ros.fit_sample(X,y)

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=42)

mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(y_train)
y_test = mlb.transform(y_test)

In [298]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

lemmatizer = {}

class CustomTokenizer(object):
    def __init__(self, lemmatizer):
        self.wnl = lemmatizer
        self.tok = TfidfVectorizer().build_tokenizer()
        
    def __call__(self, doc):
        return [word for word in self.tok(doc)]
    
        tokens = []
        for t in self.tok(doc):
            if t in self.wnl:
                tokens.append(self.wnl[t])
            else:
                tokens.append(t)
                
        return tokens


tfidf_vect = TfidfVectorizer(ngram_range=(1,1), max_df=0.55, norm=None, use_idf=False,
                            tokenizer=CustomTokenizer(lemmatizer))



#tfidf_vect = TfidfVectorizer(ngram_range=(1,2), max_df=0.55)
tfidf_vect.fit(X)
len(tfidf_vect.vocabulary_)

6850

In [299]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

def ml_score(y_test, y_pred):    
    match_cnt = 0
    total_cnt = 0
    for yt, yp in zip(y_test, y_pred):        
        for i, lt in enumerate(yt):
            if lt == 1 or yp[i] == 1:
                total_cnt += 1
                if lt == yp[i]:
                    match_cnt += 1           
                
    return match_cnt/total_cnt


def our_score(y_test, y_pred):
    match_cnt = 0
    total_cnt = 0
    for yt, yp in zip(y_test, y_pred):
        for i, lt in enumerate(yt):
            if lt == 1:
                total_cnt += 1
                if lt == yp[i]:
                    match_cnt += 1
                    
    return match_cnt/total_cnt



clfs = []

clfs.append(("Ada",OneVsRestClassifier(AdaBoostClassifier(random_state=42))))
clfs.append(("Bernou", OneVsRestClassifier(BernoulliNB())))
clfs.append(("SVC ", OneVsRestClassifier(SVC())))

clfs.append(("Multi", OneVsRestClassifier(MultinomialNB())))

tf_train = tfidf_vect.transform(x_train)
tf_test = tfidf_vect.transform(x_test)
for name, clf in clfs:
    clf.fit(tf_train, y_train)    
    y_pred = clf.predict(tf_test)
    print(name, accuracy_score(y_test, y_pred), ml_score(y_test, y_pred), our_score(y_test, y_pred))

Ada 0.338129496403 0.48846153846153845 0.6105769230769231
Bernou 0.388489208633 0.508130081300813 0.6009615384615384
SVC  0.230215827338 0.49538461538461537 0.7740384615384616
Multi 0.453237410072 0.5606060606060606 0.7115384615384616


In [300]:
pred_prob = [[format(p, '.1f') for p in probs] for probs in clf.predict_proba(tf_test)]

print("Classes: ", mlb.classes_)
# Economía Internacional
# Finanzas
# Teoría Económica
# Organización Industrial
# Métodos Cuantitativos/Investigación económica
# Proyectos/Planeamiento estratégico
# Estudios de mercado

for yt, yp, prob in zip(y_test, y_pred, pred_prob):
    print("True: ", yt, " Pred: ", yp , "Probs: ", prob)

Classes:  ['EI' 'EM' 'FI' 'MC' 'OI' 'PP' 'TE']
True:  [0 0 1 0 0 1 0]  Pred:  [0 0 1 0 0 1 0] Probs:  ['0.0', '0.0', '1.0', '0.0', '0.0', '1.0', '0.0']
True:  [0 0 1 0 0 1 0]  Pred:  [0 0 1 0 0 1 0] Probs:  ['0.0', '0.0', '1.0', '0.0', '0.0', '1.0', '0.0']
True:  [0 0 0 0 0 1 0]  Pred:  [0 0 0 0 0 1 0] Probs:  ['0.0', '0.0', '0.0', '0.0', '0.0', '1.0', '0.0']
True:  [0 0 0 0 0 1 0]  Pred:  [0 0 1 0 0 1 0] Probs:  ['0.0', '0.0', '1.0', '0.0', '0.0', '0.5', '0.0']
True:  [0 0 0 0 0 1 0]  Pred:  [0 0 1 0 0 1 0] Probs:  ['0.0', '0.0', '1.0', '0.0', '0.0', '1.0', '0.0']
True:  [0 1 1 0 0 0 0]  Pred:  [0 0 1 0 0 0 0] Probs:  ['0.0', '0.0', '1.0', '0.0', '0.0', '0.0', '0.0']
True:  [0 0 1 0 0 0 0]  Pred:  [0 1 1 0 0 0 0] Probs:  ['0.0', '1.0', '1.0', '0.0', '0.0', '0.0', '0.0']
True:  [0 0 1 0 0 0 0]  Pred:  [0 0 1 0 0 1 0] Probs:  ['0.0', '0.0', '1.0', '0.0', '0.0', '1.0', '0.0']
True:  [0 0 1 0 0 0 0]  Pred:  [0 0 1 0 0 1 0] Probs:  ['0.0', '0.0', '1.0', '0.0', '0.0', '1.0', '0.0']
True:  [

In [301]:

import numpy as np

error = []
for yt, yp, offer in zip(y_test, y_pred, X):
    if not np.array_equal(yt, yp):
        error.append((yt, yp, offer))
        
len(error)
        
        
        

76

In [305]:
print()
error[3]





(array([0, 0, 1, 0, 0, 0, 0]),
 array([0, 1, 1, 0, 0, 0, 0]),
 'Auditor Senior de Gestion Creditia -Planear y ejecutar las auditorías de créditos en las agencias a su cargo y presentar los resultados correspondientes, formulando las observaciones y recomendaciones.\n-Verificar que se realice el cumplimiento de las normas y políticas establecidas por la organización para la operación crediticia y el cumplimiento a las políticas, criterios y lineamientos de créditos determinados por el directorio.\n-Elaborar el informe de auditoría de gestión de créditos incluyendo las observaciones y las acciones correctivas propuestas por los responsables de los procesos observados.\n-Documentar cada etapa, actividad y hallazgo del proceso de otorgamiento de crédito, asegurándose de que el expediente contenga todos los documentos de la evaluación y que la operación cumplió con las autorizaciones correspondientes según el monto y tipo de crédito.\n-Dar seguimiento al cumplimiento de las acciones correct

In [303]:
pred_prob = [[format(p, '.1f') for p in probs] for probs in clf.predict_proba(tf_test)]

print("Classes: ", mlb.classes_)
# Economía Internacional
# Finanzas
# Teoría Económica
# Organización Industrial
# Métodos Cuantitativos/Investigación económica
# Proyectos/Planeamiento estratégico
# Estudios de mercado

for yt, yp, prob in zip(y_test, y_pred, pred_prob):
    print("True: ", yt, " Pred: ", yp , "Probs: ", prob)

Classes:  ['EI' 'EM' 'FI' 'MC' 'OI' 'PP' 'TE']
True:  [0 0 1 0 0 1 0]  Pred:  [0 0 1 0 0 1 0] Probs:  ['0.0', '0.0', '1.0', '0.0', '0.0', '1.0', '0.0']
True:  [0 0 1 0 0 1 0]  Pred:  [0 0 1 0 0 1 0] Probs:  ['0.0', '0.0', '1.0', '0.0', '0.0', '1.0', '0.0']
True:  [0 0 0 0 0 1 0]  Pred:  [0 0 0 0 0 1 0] Probs:  ['0.0', '0.0', '0.0', '0.0', '0.0', '1.0', '0.0']
True:  [0 0 0 0 0 1 0]  Pred:  [0 0 1 0 0 1 0] Probs:  ['0.0', '0.0', '1.0', '0.0', '0.0', '0.5', '0.0']
True:  [0 0 0 0 0 1 0]  Pred:  [0 0 1 0 0 1 0] Probs:  ['0.0', '0.0', '1.0', '0.0', '0.0', '1.0', '0.0']
True:  [0 1 1 0 0 0 0]  Pred:  [0 0 1 0 0 0 0] Probs:  ['0.0', '0.0', '1.0', '0.0', '0.0', '0.0', '0.0']
True:  [0 0 1 0 0 0 0]  Pred:  [0 1 1 0 0 0 0] Probs:  ['0.0', '1.0', '1.0', '0.0', '0.0', '0.0', '0.0']
True:  [0 0 1 0 0 0 0]  Pred:  [0 0 1 0 0 1 0] Probs:  ['0.0', '0.0', '1.0', '0.0', '0.0', '1.0', '0.0']
True:  [0 0 1 0 0 0 0]  Pred:  [0 0 1 0 0 1 0] Probs:  ['0.0', '0.0', '1.0', '0.0', '0.0', '1.0', '0.0']
True:  [