In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

In [6]:
# Load the job offers and narrow them down to the ones used in this notebook.
# Each print() reports how many offers remain in `oc.offers` after the preceding step.

from observatorio_laboral.offer.offer_controller import OfferController
from observatorio_laboral.offer.date_range import DateRange

# NOTE(review): `text_fields` is never read in the visible cells; the field list is passed inline below.
text_fields = []
oc = OfferController(text_fields = ["Job Title", "Description", "Qualifications"])
# Presumably (start month, start year, end month, end year) — confirm against DateRange's signature.
date_range = DateRange(1, 2013, 5, 2017)
source = "symplicity"

oc.load_offers(source, date_range)
print(len(oc.offers))

# The filters below are applied one after the other on the controller; order matters for the counts.
oc.filter_offers_by_career("ECONOMÍA")
print(len(oc.offers))

oc.filter_offers_by_field("Areas")
print(len(oc.offers))

# One text and one list of "Areas" labels per remaining offer.
# The codes in `ignore` are presumably removed from each offer's labels, which can leave
# an empty list (handled by the next cell) — verify against get_field_labels.
offer_texts = oc.get_text()
offer_classes = oc.get_field_labels("Areas", ignore=['MC', 'TE', 'OI', 'EI', 'EM'])

80706
12653
553


In [8]:
# Drop every offer whose label list is empty, keeping texts and labels aligned.
# The keep-mask is computed once from the labels and applied to both sequences.
has_labels = [labels != [] for labels in offer_classes]
offer_texts = [text for text, keep in zip(offer_texts, has_labels) if keep]
offer_classes = [labels for labels, keep in zip(offer_classes, has_labels) if keep]


In [11]:
# Hold out a test split and turn the label lists into a binary indicator matrix.
# NOTE: no over-sampling is actually applied — the RandomOverSampler experiment below is disabled.
X = offer_texts
y = offer_classes

# Disabled over-sampling experiment (kept for reference):
#from imblearn.over_sampling import RandomOverSampler
#ros = RandomOverSampler(random_state=42)
#X_resampled, y_resampled = ros.fit_sample(X,y)

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

# random_state pins the shuffle so the split is reproducible; the test size is left at the library default.
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=42)

# The binarizer is fitted on the training labels only and then reused for the test labels.
mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(y_train)
y_test = mlb.transform(y_test)

In [119]:
# Build the word-form -> lemma lookup from a whitespace-separated file whose lines
# hold the lemma first and the inflected form second.
# The encoding is stated explicitly so accented Spanish words decode the same way on
# every platform; "utf-8-sig" also strips a leading byte-order mark if the file has one.
# Assumes the file is UTF-8 encoded — confirm for the copy in use.
lemmatizer = {}
with open("lemmatization-es.txt", encoding="utf-8-sig") as lemma_file:
    for line in lemma_file:
        fields = line.split()
        if len(fields) < 2:
            # Blank or incomplete line: nothing to map, skip it instead of raising IndexError.
            continue
        lemmatizer[fields[1]] = fields[0]
        
        
class CustomTokenizer(object):
    ''' Callable tokenizer that splits a document into words and replaces each word by its lemma.

        Instances are meant to be passed as the ``tokenizer`` argument of TfidfVectorizer.
    '''

    def __init__(self, lemmatizer):
        ''' lemmatizer -- dict mapping a word form to its lemma. '''
        self.wnl = lemmatizer
        # Reuse scikit-learn's default word tokenizer for the splitting step.
        self.tok = TfidfVectorizer().build_tokenizer()

    def _lemmatize(self, token):
        ''' Follow the lemma dictionary from token until no further mapping applies.

            A dictionary entry may map a word to itself, or two entries may point at each
            other; the set of visited forms stops the walk before it would repeat, so the
            loop always terminates.
        '''
        seen = {token}
        while token in self.wnl:
            lemma = self.wnl[token]
            if lemma in seen:
                break
            seen.add(lemma)
            token = lemma
        return token

    def __call__(self, doc):
        ''' Return the list of lemmatized tokens of doc, in document order. '''
        return [self._lemmatize(token) for token in self.tok(doc)]
# Shared tokenizer instance; it is passed as `tokenizer=` to the TfidfVectorizers built later on.
custom_tokenizer = CustomTokenizer(lemmatizer)

In [128]:
# Fit an un-normalized, unigram tf-idf model on every labelled offer and collect the
# top features per label combination.
# NOTE: top_feats_by_class is defined in a cell further down the notebook; that cell
# must have been executed before this one.
X = offer_texts

# 'parar' is appended to NLTK's Spanish stop words — presumably because the lemma
# dictionary turns the stop word "para" into it (TODO confirm).
stop_words = stopwords.words('spanish') + ['parar']

vec = TfidfVectorizer(ngram_range=(1,1), stop_words=stop_words, tokenizer=custom_tokenizer, norm=None)

# fit_transform() learns the vocabulary/idf and builds the matrix in a single pass;
# a separate fit() beforehand only tokenized the whole corpus a second time.
Xtr = vec.fit_transform(X)
features = vec.get_feature_names()

# Each offer's labels are joined into one string (e.g. 'FI,PP'), so every distinct
# label combination is treated as its own class.
y_w = np.array([",".join(labels) for labels in offer_classes])
dfs = top_feats_by_class(Xtr, y_w, features)


In [129]:
# Top mean-tf-idf features among offers whose joined label string is exactly 'PP'.
dfs['PP']

Unnamed: 0,feature,tfidf
0,proyectar,3.003235
1,gestión,2.357437
2,procesar,2.235174
3,venta,1.844506
4,experiencia,1.789305
5,público,1.779082
6,servicio,1.75613
7,área,1.700789
8,desarrollar,1.654856
9,comercial,1.602017


In [130]:
# Features in the 'FI,PP' top list that appear in neither the 'PP' nor the 'FI' top list.
np.setdiff1d(dfs['FI,PP']['feature'], np.union1d(dfs['PP']['feature'], dfs['FI']['feature']))

array(['inversión', 'presupuestar', 'realizar', 'seguimiento'], dtype=object)

In [135]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

def ml_score(y_test, y_pred):
    ''' Share of label slots set in the truth or the prediction on which both agree.

        y_test, y_pred -- equally shaped sequences of 0/1 label rows (one row per sample).

        Only slots where the true label or the predicted label is 1 are counted; a slot
        is a match when both are 1. Returns 0.0 when no slot is set on either side
        (including empty input), instead of dividing by zero.
    '''
    match_cnt = 0
    total_cnt = 0
    for yt, yp in zip(y_test, y_pred):
        for i, lt in enumerate(yt):
            if lt == 1 or yp[i] == 1:
                total_cnt += 1
                if lt == yp[i]:
                    match_cnt += 1

    if total_cnt == 0:
        return 0.0
    return match_cnt / total_cnt


def our_score(y_test, y_pred):
    ''' Share of true labels that were also predicted.

        y_test, y_pred -- equally shaped sequences of 0/1 label rows (one row per sample).

        Only slots where the true label is 1 are counted; extra predicted labels are not
        penalized. Returns 0.0 when there is no true label at all (including empty input),
        instead of dividing by zero.
    '''
    match_cnt = 0
    total_cnt = 0
    for yt, yp in zip(y_test, y_pred):
        for i, lt in enumerate(yt):
            if lt == 1:
                total_cnt += 1
                if lt == yp[i]:
                    match_cnt += 1

    if total_cnt == 0:
        return 0.0
    return match_cnt / total_cnt

# Restrict the classifier features to the words that top the 'PP' and 'FI' lists.
# (Earlier experiments, now disabled, also merged in other lists such as 'FI,PP'.)
# NOTE(review): `dfs` and the fit below both use X, i.e. every offer including the
# held-out test split, so the vocabulary and idf weights have seen the test data.
vocab = set(dfs['PP']['feature']) | set(dfs['FI']['feature'])

tfidf_vect = TfidfVectorizer(tokenizer=custom_tokenizer, vocabulary=vocab, norm=None)
tfidf_vect.fit(X)

# (display name, one-vs-rest wrapped estimator) pairs to compare.
clfs = [
    ("Ada", OneVsRestClassifier(AdaBoostClassifier(random_state=42))),
    ("Bernou", OneVsRestClassifier(BernoulliNB())),
    ("SVC ", OneVsRestClassifier(SVC())),
    ("Multi", OneVsRestClassifier(MultinomialNB())),
]

# Vectorize both splits once; every classifier is trained and scored on the same matrices.
tf_train = tfidf_vect.transform(x_train)
tf_test = tfidf_vect.transform(x_test)

# Per classifier, print: name, exact-match accuracy, ml_score, our_score.
for name, clf in clfs:
    clf.fit(tf_train, y_train)
    y_pred = clf.predict(tf_test)
    scores = (accuracy_score(y_test, y_pred), ml_score(y_test, y_pred), our_score(y_test, y_pred))
    print(name, *scores)

Ada 0.538461538462 0.6720430107526881 0.8445945945945946
Bernou 0.606837606838 0.6896551724137931 0.8108108108108109
SVC  0.350427350427 0.6409090909090909 0.9527027027027027
Multi 0.649572649573 0.7251461988304093 0.8378378378378378


In [136]:
# Inspect one classifier's per-offer predictions next to the true labels.
print(clfs[3][0])
clf = clfs[3][1]

# Recompute the predictions for the classifier selected above. Reusing the `y_pred`
# left behind by the comparison loop would silently show another model's predictions
# as soon as the index above is changed.
y_pred = clf.predict(tf_test)
pred_prob = [[format(p, '.1f') for p in probs] for probs in clf.predict_proba(tf_test)]

print("Classes: ", mlb.classes_)
# Area names recorded by the original author, translated:
#   International Economics
#   Finance
#   Economic Theory
#   Industrial Organization
#   Quantitative Methods / Economic Research
#   Projects / Strategic Planning
#   Market Research
# The code-to-name pairing is not shown in this notebook; presumably FI = Finance and
# PP = Projects / Strategic Planning — confirm.

for idx, (yt, yp, prob) in enumerate(zip(y_test, y_pred, pred_prob)):
    print(idx, "True: ", yt, " Pred: ", yp , "Probs: ", prob)

Multi
Classes:  ['FI' 'PP']
0 True:  [1 0]  Pred:  [1 0] Probs:  ['1.0', '0.0']
1 True:  [1 1]  Pred:  [1 0] Probs:  ['1.0', '0.0']
2 True:  [1 0]  Pred:  [1 0] Probs:  ['1.0', '0.0']
3 True:  [1 1]  Pred:  [1 0] Probs:  ['1.0', '0.0']
4 True:  [1 0]  Pred:  [0 1] Probs:  ['0.0', '1.0']
5 True:  [1 1]  Pred:  [1 0] Probs:  ['1.0', '0.3']
6 True:  [1 0]  Pred:  [1 0] Probs:  ['1.0', '0.0']
7 True:  [1 0]  Pred:  [1 0] Probs:  ['1.0', '0.1']
8 True:  [0 1]  Pred:  [0 1] Probs:  ['0.4', '0.8']
9 True:  [1 0]  Pred:  [1 1] Probs:  ['1.0', '0.6']
10 True:  [1 0]  Pred:  [1 0] Probs:  ['1.0', '0.4']
11 True:  [1 0]  Pred:  [1 0] Probs:  ['1.0', '0.1']
12 True:  [0 1]  Pred:  [0 1] Probs:  ['0.0', '1.0']
13 True:  [1 0]  Pred:  [1 1] Probs:  ['1.0', '1.0']
14 True:  [1 1]  Pred:  [1 0] Probs:  ['1.0', '0.0']
15 True:  [1 1]  Pred:  [1 0] Probs:  ['1.0', '0.0']
16 True:  [0 1]  Pred:  [1 1] Probs:  ['1.0', '1.0']
17 True:  [0 1]  Pred:  [0 1] Probs:  ['0.3', '0.8']
18 True:  [1 1]  Pred:  [1 1

In [137]:
# Highest tf-idf features (at most 20) of the test document at row index 4.
top_feats_in_doc(tf_test, tfidf_vect.get_feature_names(), 4, 20)

Unnamed: 0,feature,tfidf
0,empresa,12.332839
1,riesgo,7.505815
2,comprar,6.679334
3,gestión,4.959803
4,plan,4.620095
5,servicio,4.588598
6,unir,4.021062
7,elaboración,3.79861
8,proyectar,3.756942
9,manejar,3.646639


In [138]:
# Features in the 'FI' top list that appear in neither the 'FI,PP' nor the 'PP' top list.
np.setdiff1d(dfs['FI']['feature'], np.union1d(dfs['FI,PP']['feature'], dfs['PP']['feature']))

array(['banco', 'cobranza', 'elaboración', 'manejar', 'mercar', 'reportar'], dtype=object)

In [87]:
# Raw text of the test offer at index 15, for manual inspection.
x_test[15]

'ANALISTA DE CONSUMO - BPC El Analista de Consumo explora y analiza a profundidad la información de los clientes (tarjeta de crédito y créditos consumo) para encontrar explicación del nivel de riesgo que tienen, deterioros no esperados y mejoras en las políticas de riesgos.\n\nPRINCIPALES FUNCIONES:\n\n- Contribuye con la identificación de los segmentos de clientes que no son rentables para el banco y que generan pérdidas importantes. Asimismo, identifica segmentos de clientes donde se puede maximizar el valor. \n- Monitorea toda la cartera a fin de identificar tendencias, se adelanta a posibles deteriores minimizando las pérdidas.\t\t\n- Revisa permanentemente la evolución de las provisiones y realiza los análisis respectivos que expliquen las variaciones importantes para cada producto.\t\t\t\t\t\n- Analiza permanentemente la calidad de cartera del producto y la relación de ésta con la rentabilidad de los mismos, mediante el seguimiento constante de indicadores que muestren el perfil 

In [15]:
# Rebind `vec` to the restricted-vocabulary vectorizer; from here on `vec` and `tfidf_vect` are the same object.
vec= tfidf_vect

In [16]:
# Refit `vec` on all offers and rebuild the document-term matrix and feature list.
# This overwrites any `Xtr` / `features` computed by an earlier cell.
Xtr = vec.fit_transform(X)
features = vec.get_feature_names()

In [13]:
def top_tfidf_feats(row, features, top_n=25):
    ''' Return the top_n largest values of row together with their feature names.

        row      -- 1-D array of scores, one per feature
        features -- sequence of feature names, indexed like row
        top_n    -- maximum number of entries to return

        Returns a DataFrame with columns 'feature' and 'tfidf', ordered from the highest
        score down. An empty selection yields an empty frame with the same two columns.
    '''
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    # Naming the columns in the constructor, rather than assigning df.columns afterwards,
    # keeps the empty case valid instead of raising a length-mismatch error.
    return pd.DataFrame(top_feats, columns=['feature', 'tfidf'])

def top_feats_in_doc(Xtr, features, row_id, top_n=25):
    ''' Return the top_n highest-scoring features of one document.

        The document is row row_id of the matrix Xtr; the row is converted to a dense
        1-D array and ranked by top_tfidf_feats.
    '''
    dense_row = Xtr[row_id].toarray()
    flat_scores = np.squeeze(dense_row)
    return top_tfidf_feats(flat_scores, features, top_n)

def top_mean_feats(Xtr, features, grp_ids=None, min_tfidf=0.1, top_n=25):
    ''' Return the top_n features with the highest mean tf-idf over a group of documents.

        Xtr       -- document-term matrix supporting row indexing and .toarray()
        features  -- sequence of feature names, one per matrix column
        grp_ids   -- row indices identifying the documents to average over
                     (None means every row)
        min_tfidf -- scores below this value are treated as 0 before averaging
        top_n     -- maximum number of features to return

        Returns the DataFrame produced by top_tfidf_feats for the column means.
    '''
    # Compare against None explicitly: a bare truth test raises on a NumPy index array
    # and would treat an empty selection as "use every document".
    if grp_ids is not None:
        D = Xtr[grp_ids].toarray()
    else:
        D = Xtr.toarray()

    # D is a fresh dense copy, so zeroing the small scores does not touch Xtr.
    D[D < min_tfidf] = 0
    tfidf_means = np.mean(D, axis=0)
    return top_tfidf_feats(tfidf_means, features, top_n)

def top_feats_by_class(Xtr, y, features, min_tfidf=0.1, top_n=25):
    ''' Return a dict mapping each distinct label in y to a DataFrame of its top features.

        Xtr       -- document-term matrix, one row per document
        y         -- one label per document (array or plain sequence)
        features  -- sequence of feature names, one per matrix column
        min_tfidf -- forwarded to top_mean_feats
        top_n     -- forwarded to top_mean_feats

        Each DataFrame holds the top_n features and their mean tf-idf over the documents
        carrying that label; the label is also stored on the frame as the attribute
        ``label``.
    '''
    # Make sure the comparison below is element-wise: with a plain list, ``y == label``
    # is a single False and every class would select no documents.
    y = np.asarray(y)
    dfs = {}
    for label in np.unique(y):
        ids = np.where(y == label)
        feats_df = top_mean_feats(Xtr, features, ids, min_tfidf=min_tfidf, top_n=top_n)
        feats_df.label = label
        dfs[label] = feats_df
    return dfs

def plot_tfidf_classfeats_h(dfs):
    ''' Plot, side by side, one horizontal bar chart per data frame in dfs.

        dfs -- the dict returned by top_feats_by_class(), or any sequence of such data
               frames. Each frame needs the columns 'feature' and 'tfidf' and a
               ``label`` attribute used for the subplot title.

        Nothing is drawn when dfs is empty.
    '''
    # top_feats_by_class() returns a dict keyed by label; plot its values so both the
    # dict and a plain list of frames are accepted.
    frames = list(dfs.values()) if isinstance(dfs, dict) else list(dfs)
    if not frames:
        return

    fig = plt.figure(figsize=(12, 9), facecolor="w")
    for i, df in enumerate(frames):
        # Bar positions follow this frame's own length, so frames of different sizes work.
        y_pos = np.arange(len(df))
        ax = fig.add_subplot(1, len(frames), i+1)
        ax.spines["top"].set_visible(False)
        ax.spines["right"].set_visible(False)
        ax.set_frame_on(False)
        ax.get_xaxis().tick_bottom()
        ax.get_yaxis().tick_left()
        ax.set_xlabel("Mean Tf-Idf Score", labelpad=16, fontsize=14)
        ax.set_title("label = " + str(df.label), fontsize=16)
        ax.ticklabel_format(axis='x', style='sci', scilimits=(-2,2))
        ax.barh(y_pos, df.tfidf, align='center', color='#3F5D7D')
        ax.set_yticks(y_pos)
        ax.set_ylim([-1, len(df)])
        ax.set_yticklabels(df.feature)
    # The layout is figure-wide, so it is adjusted once after all subplots exist.
    plt.subplots_adjust(bottom=0.09, right=0.97, left=0.15, top=0.95, wspace=0.52)
    plt.show()