In [20]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


In [1]:
# Get data to work with

from observatorio_laboral.offer.offer_controller import OfferController
from observatorio_laboral.offer.date_range import DateRange

text_fields = []
oc = OfferController(text_fields = ["Job Title", "Description", "Qualifications"])
date_range = DateRange(1, 2013, 5, 2017)
source = "symplicity"

oc.load_offers(source, date_range)
print(len(oc.offers))

oc.filter_offers_by_career("ECONOMÍA")
print(len(oc.offers))

oc.filter_offers_by_field("Areas")
print(len(oc.offers))

offer_texts = oc.get_text()
offer_classes = oc.get_field_labels("Areas")


12653
12653
491


In [7]:
# Over-sampling approach
X = offer_texts
y = offer_classes

#from imblearn.over_sampling import RandomOverSampler
#ros = RandomOverSampler(random_state=42)
#X_resampled, y_resampled = ros.fit_sample(X,y)

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=42)

mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(y_train)
y_test = mlb.transform(y_test)

In [110]:
vec = TfidfVectorizer(offer_texts, ngram_range=(1,1), stop_words = stopwords.words('spanish'))
X = offer_texts

Xtr = vec.fit_transform(X)
features = vec.get_feature_names()

y_w = np.array([",".join(labels) for labels in offer_classes])
dfs = top_feats_by_class(Xtr, y_w, features, min_tfidf=0, top_n=0)

feats_by_label = {}
labels = np.unique(y_w)
for idx, label in enumerate(labels):
    feats_by_label[label] = list(dfs[idx]['feature'])


In [121]:
top_feats_in_doc(vec.transform(x_test), vec.get_feature_names(), 122, top_n=10)

Unnamed: 0,feature,tfidf
0,bancario,0.304533
1,venta,0.20614
2,función,0.187809
3,seguros,0.178153
4,planificar,0.159821
5,productos,0.159664
6,citas,0.14706
7,medida,0.14706
8,necesita,0.14706
9,suscripciones,0.14706


In [92]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

def ml_score(y_test, y_pred):    
    match_cnt = 0
    total_cnt = 0
    for yt, yp in zip(y_test, y_pred):        
        for i, lt in enumerate(yt):
            if lt == 1 or yp[i] == 1:
                total_cnt += 1
                if lt == yp[i]:
                    match_cnt += 1           
                
    return match_cnt/total_cnt


def our_score(y_test, y_pred):
    match_cnt = 0
    total_cnt = 0
    for yt, yp in zip(y_test, y_pred):
        for i, lt in enumerate(yt):
            if lt == 1:
                total_cnt += 1                
                if lt == yp[i]:
                    match_cnt += 1
                    
    return match_cnt/total_cnt

vocab = set(feats_by_label['F'])
vocab.update(set(feats_by_label['P']))
vocab.update(set(feats_by_label['MC']))

#vocab.update(set(feats_by_label['F,P']))
#vocab.update(set(feats_by_label['EI']))

tfidf_vect = TfidfVectorizer(ngram_range=(1,1), vocabulary=vocab)#, norm=False, use_idf=False)
tfidf_vect.fit(X) 

clfs = []

clfs.append(("Ada",OneVsRestClassifier(AdaBoostClassifier(random_state=42))))
clfs.append(("Bernou", OneVsRestClassifier(BernoulliNB())))
clfs.append(("SVC ", OneVsRestClassifier(SVC())))
clfs.append(("Multi", OneVsRestClassifier(MultinomialNB())))

tf_train = tfidf_vect.transform(x_train)
tf_test = tfidf_vect.transform(x_test)
for name, clf in clfs:
    clf.fit(tf_train, y_train)    
    y_pred = clf.predict(tf_test)
    print(name, accuracy_score(y_test, y_pred), ml_score(y_test, y_pred), our_score(y_test, y_pred))

Ada 0.333333333333 0.452 0.6348314606741573
Bernou 0.422764227642 0.5361702127659574 0.7078651685393258
SVC  0.19512195122 0.4570446735395189 0.7471910112359551
Multi 0.430894308943 0.5536480686695279 0.7247191011235955


In [100]:
x_test[122]

'Ejecutivo Comercial Bancario -Planificar su trabajo en función a las metas comerciales del mes asignadas por el Banco.\n-Entrevistas, llamadas y citas con el cliente potencial asignado por Leads a través de marketing (90% trabajo en campo y 10% en oficina).\n-Planificar la mejor propuesta de venta a la medida del cliente, a partir del conocimiento de su perfil, su situación financiera y del entorno económico, para así asesorarlos en función a los productos que necesita.\n-Realizar la venta de productos de Ahorro e Inversión, brindando el soporte operativo y seguimiento a las operaciones y transacciones de sus clientes (aperturas, depósitos, suscripciones, fondos mutuos), asegurando que complete el proceso bancario.\n-Participar en las actividades planificadas del equipo a fin de ganar eficiencia y compartir mejores prácticas, buscando su propia actualización financiera-económica. -Mínimo 1 año de experiencia en venta de seguros, rentas vitalicias, productos pasivos u otros intangibles

In [99]:
top_feats_in_doc(tf_test, tfidf_vect.get_feature_names(), 122, top_n=25)

Unnamed: 0,feature,tfidf
0,venta,0.582467
1,productos,0.451144
2,financiera,0.346185
3,comercial,0.301773
4,económica,0.24696
5,marketing,0.208659
6,inversión,0.198278
7,financiero,0.152947
8,deseable,0.150886
9,empresas,0.138068


In [93]:
print(clfs[3][0])
clf = clfs[3][1]
pred_prob = [[format(p, '.1f') for p in probs] for probs in clf.predict_proba(tf_test)]

print("Classes: ", mlb.classes_)
# Economía Internacional
# Finanzas
# Teoría Económica
# Organización Industrial
# Métodos Cuantitativos/Investigación económica
# Proyectos/Planeamiento estratégico
# Estudios de mercado

for idx, (yt, yp, prob) in enumerate(zip(y_test, y_pred, pred_prob)):
    print(idx, "True: ", yt, " Pred: ", yp , "Probs: ", prob)

Multi
Classes:  ['EI' 'EM' 'F' 'MC' 'OI' 'P' 'TE']
0 True:  [0 0 0 0 0 1 0]  Pred:  [0 0 1 0 0 1 0] Probs:  ['0.0', '0.1', '0.5', '0.1', '0.0', '0.8', '0.0']
1 True:  [0 0 1 0 0 0 0]  Pred:  [0 0 1 0 0 1 0] Probs:  ['0.0', '0.1', '0.8', '0.2', '0.0', '0.6', '0.0']
2 True:  [0 0 0 0 0 1 0]  Pred:  [0 0 1 0 0 1 0] Probs:  ['0.0', '0.1', '0.8', '0.0', '0.0', '0.7', '0.0']
3 True:  [0 0 1 0 0 1 0]  Pred:  [0 0 1 0 0 1 0] Probs:  ['0.0', '0.1', '0.7', '0.1', '0.0', '0.8', '0.0']
4 True:  [0 0 1 0 0 1 0]  Pred:  [0 0 1 0 0 1 0] Probs:  ['0.0', '0.1', '0.9', '0.1', '0.0', '0.6', '0.0']
5 True:  [0 0 0 0 0 1 0]  Pred:  [0 0 0 0 0 1 0] Probs:  ['0.0', '0.1', '0.3', '0.2', '0.0', '0.6', '0.0']
6 True:  [0 0 0 0 0 1 0]  Pred:  [0 0 1 0 0 1 0] Probs:  ['0.0', '0.2', '0.5', '0.1', '0.0', '0.6', '0.0']
7 True:  [0 0 0 0 0 1 0]  Pred:  [0 0 0 0 0 1 0] Probs:  ['0.0', '0.1', '0.3', '0.1', '0.0', '0.8', '0.0']
8 True:  [0 1 0 0 0 0 0]  Pred:  [0 0 0 0 0 1 0] Probs:  ['0.0', '0.2', '0.4', '0.2', '0.0', 

In [15]:
vec= tfidf_vect

In [16]:
Xtr = vec.fit_transform(X)
features = vec.get_feature_names()

In [120]:
def top_tfidf_feats(row, features, top_n=25):
    ''' Get top n tfidf values in row and return them with their corresponding feature names.'''
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    df = pd.DataFrame(top_feats)
    df.columns = ['feature', 'tfidf']
    return df

def top_feats_in_doc(Xtr, features, row_id, top_n=25):
    ''' Top tfidf features in specific document (matrix row) '''
    row = np.squeeze(Xtr[row_id].toarray())
    return top_tfidf_feats(row, features, top_n)

def top_mean_feats(Xtr, features, grp_ids=None, min_tfidf=0.1, top_n=25):
    ''' Return the top n features that on average are most important amongst documents in rows
        indentified by indices in grp_ids. '''
    if grp_ids:
        D = Xtr[grp_ids].toarray()
    else:
        D = Xtr.toarray()

    D[D < min_tfidf] = 0
    tfidf_means = np.nanmean(np.where(matrix!=0,matrix,np.nan),1)
    #tfidf_means = np.mean(D, axis=0)
    return top_tfidf_feats(tfidf_means, features, top_n)

def top_feats_by_class(Xtr, y, features, min_tfidf=0.1, top_n=25):
    ''' Return a list of dfs, where each df holds top_n features and their mean tfidf value
        calculated across documents with the same class label. '''
    dfs = []
    labels = np.unique(y)    
    for label in labels:
        ids = np.where(y==label)        
        feats_df = top_mean_feats(Xtr, features, ids, min_tfidf=min_tfidf, top_n=top_n)
        feats_df.label = label
        dfs.append(feats_df)
    return dfs

def plot_tfidf_classfeats_h(dfs):
    ''' Plot the data frames returned by the function plot_tfidf_classfeats(). '''
    fig = plt.figure(figsize=(12, 9), facecolor="w")
    x = np.arange(len(dfs[0]))
    for i, df in enumerate(dfs):
        ax = fig.add_subplot(1, len(dfs), i+1)
        ax.spines["top"].set_visible(False)
        ax.spines["right"].set_visible(False)
        ax.set_frame_on(False)
        ax.get_xaxis().tick_bottom()
        ax.get_yaxis().tick_left()
        ax.set_xlabel("Mean Tf-Idf Score", labelpad=16, fontsize=14)
        ax.set_title("label = " + str(df.label), fontsize=16)
        ax.ticklabel_format(axis='x', style='sci', scilimits=(-2,2))
        ax.barh(x, df.tfidf, align='center', color='#3F5D7D')
        ax.set_yticks(x)
        ax.set_ylim([-1, x[-1]+1])
        yticks = ax.set_yticklabels(df.feature)
        plt.subplots_adjust(bottom=0.09, right=0.97, left=0.15, top=0.95, wspace=0.52)
    plt.show()