In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

In [11]:
# Get data to work with

from observatorio_laboral.offer.offer_controller import OfferController
from observatorio_laboral.offer.date_range import DateRange

text_fields = []
oc = OfferController(text_fields = ["Job Title", "Description", "Qualifications"])
date_range = DateRange(1, 2013, 5, 2017)
source = "symplicity"

oc.load_offers(source, date_range)
print(len(oc.offers))

oc.filter_offers_by_career("ECONOMÍA")
print(len(oc.offers))

oc.filter_offers_by_field("Areas")
print(len(oc.offers))

offer_texts = oc.get_text()
offer_classes = oc.get_field_labels("Areas", ignore=['MC', 'TE', 'OI', 'EI', 'EM'])

12653
12653
491


In [None]:
with open("")

In [14]:
offer_texts = [text for text, labels in zip(offer_texts, offer_classes) if labels != []]
offer_classes = [labels for labels in offer_classes if labels != []]

print(len(offer_texts))

430


In [24]:
# Over-sampling approach
X = offer_texts
y = offer_classes

#from imblearn.over_sampling import RandomOverSampler
#ros = RandomOverSampler(random_state=42)
#X_resampled, y_resampled = ros.fit_sample(X,y)

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=42)

mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(y_train)
y_test = mlb.transform(y_test)

In [25]:
lemmatizer = {}
with open("lemmatization-es.txt") as lemma_file:
    for line in lemma_file:
        line = line.split()
        lemmatizer[line[1]] = line[0]
        
        
class CustomTokenizer(object):
    def __init__(self, lemmatizer):
        self.wnl = lemmatizer
        self.tok = TfidfVectorizer().build_tokenizer()
        
    def __call__(self, doc):
        #return [word for word in self.tok(doc)]    
        tokens = []
        for t in self.tok(doc):
            while t in self.wnl:
                t = self.wnl[t]                
            tokens.append(t)                
        return tokens
custom_tokenizer = CustomTokenizer(lemmatizer)

In [28]:
X = offer_texts

stop_words = stopwords.words('spanish') + ['parar']

vec = TfidfVectorizer(ngram_range=(2,2), stop_words=stop_words, tokenizer=custom_tokenizer, norm=None)
vec.fit(X) 

Xtr = vec.fit_transform(X)
features = vec.get_feature_names()

y_w = np.array([",".join(labels) for labels in offer_classes])
dfs = top_feats_by_class(Xtr, y_w, features)


In [29]:
dfs['P']

Unnamed: 0,feature,tfidf
0,año experiencia,0.964771
1,gestión proyectar,0.878574
2,asir comer,0.809556
3,recurso humanar,0.78062
4,mínimo año,0.650363
5,experiencia laboral,0.642952
6,sector público,0.637833
7,proyectar investigación,0.541885
8,inversión público,0.523242
9,experiencia mínimo,0.48948


In [30]:
np.setdiff1d(dfs['P']['feature'], np.union1d(dfs['F']['feature'], dfs['F,P']['feature']))

array(['comité directivo', 'evaluación desempeñar', 'experiencia laboral',
       'gas natural', 'gerente público', 'gestión proyectar',
       'indicador gestión', 'ingeniería industrial', 'inversión público',
       'mejorar procesar', 'monitoreo evaluación', 'plan estratégico',
       'proponer mejorar', 'proyectar investigación', 'recurso humanar',
       'sector público'], dtype=object)

In [31]:
dfs['F,P']

Unnamed: 0,feature,tfidf
0,asir comer,0.961232
1,año experiencia,0.759001
2,flujo caja,0.749585
3,planeamiento financiero,0.701862
4,nivel intermediar,0.666946
5,presupuestar anual,0.644459
6,control gestión,0.639644
7,mínimo año,0.636751
8,evaluación proyectar,0.623654
9,hacer seguimiento,0.618681


In [32]:
np.setdiff1d(dfs['F,P']['feature'], np.union1d(dfs['P']['feature'], dfs['F']['feature']))

array(['administración finanzas', 'cliente aliar',
       'elaboración presupuestar', 'evaluación proyectar',
       'experiencia año', 'financiero empresa', 'idioma inglés',
       'planeamiento financiero', 'presupuestar anual',
       'proyectar inversión', 'realizar análisis', 'unidad negociar'], dtype=object)

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

def ml_score(y_test, y_pred):    
    match_cnt = 0
    total_cnt = 0
    for yt, yp in zip(y_test, y_pred):        
        for i, lt in enumerate(yt):
            if lt == 1 or yp[i] == 1:
                total_cnt += 1
                if lt == yp[i]:
                    match_cnt += 1           
                
    return match_cnt/total_cnt


def our_score(y_test, y_pred):
    match_cnt = 0
    total_cnt = 0
    for yt, yp in zip(y_test, y_pred):
        for i, lt in enumerate(yt):
            if lt == 1:
                total_cnt += 1                
                if lt == yp[i]:
                    match_cnt += 1
                    
    return match_cnt/total_cnt

vocab = set(list(dfs['P']['feature']))
vocab.update(set(list(dfs['F']['feature'])))
#vocab.update(set(list(dfs['FI,PP']['feature'])))
#vocab.update(set(feats_by_label['P']))
#vocab.update(set(feats_by_label['MC']))

#vocab.update(set(feats_by_label['F,P']))
#vocab.update(set(feats_by_label['EI']))

tfidf_vect = TfidfVectorizer(vocabulary=vocab, norm=None, tokenizer=custom_tokenizer)
tfidf_vect.fit(X) 

clfs = []

clfs.append(("Ada",OneVsRestClassifier(AdaBoostClassifier(random_state=42))))
clfs.append(("Bernou", OneVsRestClassifier(BernoulliNB())))
clfs.append(("SVC ", OneVsRestClassifier(SVC())))
clfs.append(("Multi", OneVsRestClassifier(MultinomialNB())))

tf_train = tfidf_vect.transform(x_train)
tf_test = tfidf_vect.transform(x_test)
for name, clf in clfs:
    clf.fit(tf_train, y_train)    
    y_pred = clf.predict(tf_test)
    print(name, accuracy_score(y_test, y_pred), ml_score(y_test, y_pred), our_score(y_test, y_pred))

Ada 0.268518518519 0.6342592592592593 1.0
Bernou 0.268518518519 0.6342592592592593 1.0
SVC  0.268518518519 0.6342592592592593 1.0
Multi 0.268518518519 0.6342592592592593 1.0


In [34]:
print(clfs[3][0])
clf = clfs[3][1]
pred_prob = [[format(p, '.1f') for p in probs] for probs in clf.predict_proba(tf_test)]

print("Classes: ", mlb.classes_)
# Economía Internacional
# Finanzas
# Teoría Económica
# Organización Industrial
# Métodos Cuantitativos/Investigación económica
# Proyectos/Planeamiento estratégico
# Estudios de mercado

for idx, (yt, yp, prob) in enumerate(zip(y_test, y_pred, pred_prob)):
    print(idx, "True: ", yt, " Pred: ", yp , "Probs: ", prob)

Multi
Classes:  ['F' 'P']
0 True:  [1 0]  Pred:  [1 1] Probs:  ['0.7', '0.6']
1 True:  [0 1]  Pred:  [1 1] Probs:  ['0.7', '0.6']
2 True:  [0 1]  Pred:  [1 1] Probs:  ['0.7', '0.6']
3 True:  [1 0]  Pred:  [1 1] Probs:  ['0.7', '0.6']
4 True:  [0 1]  Pred:  [1 1] Probs:  ['0.7', '0.6']
5 True:  [1 1]  Pred:  [1 1] Probs:  ['0.7', '0.6']
6 True:  [0 1]  Pred:  [1 1] Probs:  ['0.7', '0.6']
7 True:  [0 1]  Pred:  [1 1] Probs:  ['0.7', '0.6']
8 True:  [1 1]  Pred:  [1 1] Probs:  ['0.7', '0.6']
9 True:  [1 1]  Pred:  [1 1] Probs:  ['0.7', '0.6']
10 True:  [0 1]  Pred:  [1 1] Probs:  ['0.7', '0.6']
11 True:  [1 0]  Pred:  [1 1] Probs:  ['0.7', '0.6']
12 True:  [0 1]  Pred:  [1 1] Probs:  ['0.7', '0.6']
13 True:  [0 1]  Pred:  [1 1] Probs:  ['0.7', '0.6']
14 True:  [1 0]  Pred:  [1 1] Probs:  ['0.7', '0.6']
15 True:  [1 0]  Pred:  [1 1] Probs:  ['0.7', '0.6']
16 True:  [0 1]  Pred:  [1 1] Probs:  ['0.7', '0.6']
17 True:  [1 0]  Pred:  [1 1] Probs:  ['0.7', '0.6']
18 True:  [1 0]  Pred:  [1 1] 

In [21]:
top_feats_in_doc(tf_test, tfidf_vect.get_feature_names(), 4, 20)

Unnamed: 0,feature,tfidf
0,realizar,5.425838
1,riesgo,4.862758
2,cliente,3.908241
3,información,3.837227
4,análisis,3.438001
5,comer,1.819084
6,reportar,1.79825
7,empresa,1.71425
8,área,1.650008
9,público,0.0


In [None]:
np.setdiff1d(dfs['FI']['feature'], np.union1d(dfs['FI,PP']['feature'], dfs['PP']['feature']))

In [None]:
x_test[15]

In [None]:
vec= tfidf_vect

In [None]:
Xtr = vec.fit_transform(X)
features = vec.get_feature_names()

In [7]:
def top_tfidf_feats(row, features, top_n=25):
    ''' Get top n tfidf values in row and return them with their corresponding feature names.'''
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    df = pd.DataFrame(top_feats)
    df.columns = ['feature', 'tfidf']
    return df

def top_feats_in_doc(Xtr, features, row_id, top_n=25):
    ''' Top tfidf features in specific document (matrix row) '''
    row = np.squeeze(Xtr[row_id].toarray())
    return top_tfidf_feats(row, features, top_n)

def top_mean_feats(Xtr, features, grp_ids=None, min_tfidf=0.1, top_n=25):
    ''' Return the top n features that on average are most important amongst documents in rows
        indentified by indices in grp_ids. '''
    if grp_ids:
        D = Xtr[grp_ids].toarray()
    else:
        D = Xtr.toarray()

    D[D < min_tfidf] = 0
    #tfidf_means = np.nanmean(np.where(matrix!=0,matrix,np.nan),1)
    tfidf_means = np.mean(D, axis=0)
    return top_tfidf_feats(tfidf_means, features, top_n)

def top_feats_by_class(Xtr, y, features, min_tfidf=0.1, top_n=25):
    ''' Return a list of dfs, where each df holds top_n features and their mean tfidf value
        calculated across documents with the same class label. '''
    dfs = {}
    labels = np.unique(y)
    for label in labels:
        ids = np.where(y==label)          
        feats_df = top_mean_feats(Xtr, features, ids, min_tfidf=min_tfidf, top_n=top_n)
        feats_df.label = label
        dfs[label] = feats_df        
    return dfs

def plot_tfidf_classfeats_h(dfs):
    ''' Plot the data frames returned by the function plot_tfidf_classfeats(). '''
    fig = plt.figure(figsize=(12, 9), facecolor="w")
    x = np.arange(len(dfs[0]))
    for i, df in enumerate(dfs):
        ax = fig.add_subplot(1, len(dfs), i+1)
        ax.spines["top"].set_visible(False)
        ax.spines["right"].set_visible(False)
        ax.set_frame_on(False)
        ax.get_xaxis().tick_bottom()
        ax.get_yaxis().tick_left()
        ax.set_xlabel("Mean Tf-Idf Score", labelpad=16, fontsize=14)
        ax.set_title("label = " + str(df.label), fontsize=16)
        ax.ticklabel_format(axis='x', style='sci', scilimits=(-2,2))
        ax.barh(x, df.tfidf, align='center', color='#3F5D7D')
        ax.set_yticks(x)
        ax.set_ylim([-1, x[-1]+1])
        yticks = ax.set_yticklabels(df.feature)
        plt.subplots_adjust(bottom=0.09, right=0.97, left=0.15, top=0.95, wspace=0.52)
    plt.show()