In [55]:
# Get data to work with

from observatorio_laboral.offer.offer_controller import OfferController
from observatorio_laboral.offer.date_range import DateRange

text_fields = []
oc = OfferController(text_fields = ["Job Title", "Description", "Qualifications"])
date_range = DateRange(1, 2013, 5, 2017)
source = "symplicity"

oc.load_offers(source, date_range)
oc.filter_offers_by_career("ECONOMÍA")
oc.filter_offers_by_field("Areas")

offer_texts = oc.get_text()
offer_classes = oc.get_field_labels("Areas")


In [57]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

vec = TfidfVectorizer(offer_texts, ngram_range=(1,1), stop_words = stopwords.words('spanish'))
X = offer_texts

Xtr = vec.fit_transform(X)
features = vec.get_feature_names()

In [58]:
import numpy as np
import pandas as pd

def top_tfidf_feats(row, features, top_n=25):
    ''' Get top n tfidf values in row and return them with their corresponding feature names.'''
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    df = pd.DataFrame(top_feats)
    df.columns = ['feature', 'tfidf']
    return df

In [59]:
def top_feats_in_doc(Xtr, features, row_id, top_n=25):
    ''' Top tfidf features in specific document (matrix row) '''
    row = np.squeeze(Xtr[row_id].toarray())
    return top_tfidf_feats(row, features, top_n)

In [60]:
top_feats_in_doc(Xtr, features, 5)

Unnamed: 0,feature,tfidf
0,envío,0.207776
1,importaciones,0.20741
2,factura,0.204628
3,importación,0.192102
4,prima,0.192102
5,comercio,0.183506
6,exportación,0.176322
7,emisión,0.166871
8,carga,0.158164
9,exportaciones,0.153471


In [62]:
def top_mean_feats(Xtr, features, grp_ids=None, min_tfidf=0.1, top_n=25):
    ''' Return the top n features that on average are most important amongst documents in rows
        indentified by indices in grp_ids. '''
    if grp_ids:
        D = Xtr[grp_ids].toarray()
    else:
        D = Xtr.toarray()

    D[D < min_tfidf] = 0
    tfidf_means = np.mean(D, axis=0)
    return top_tfidf_feats(tfidf_means, features, top_n)

In [63]:
top_mean_feats(Xtr, features, min_tfidf=0, top_n=15)

Unnamed: 0,feature,tfidf
0,análisis,0.039802
1,gestión,0.036279
2,elaboración,0.035589
3,experiencia,0.033909
4,proyectos,0.033534
5,control,0.028789
6,información,0.028783
7,clientes,0.028399
8,nivel,0.027085
9,apoyo,0.027078


In [75]:
y = [",".join(labels) for labels in y]


In [77]:
def top_feats_by_class(Xtr, y, features, min_tfidf=0.1, top_n=25):
    ''' Return a list of dfs, where each df holds top_n features and their mean tfidf value
        calculated across documents with the same class label. '''
    dfs = []
    labels = np.unique(y)    
    for label in labels:
        ids = np.where(y==label)        
        feats_df = top_mean_feats(Xtr, features, ids, min_tfidf=min_tfidf, top_n=top_n)
        feats_df.label = label
        dfs.append(feats_df)
    return dfs

In [78]:
top_feats_by_class(Xtr, y, features)

  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)


[        feature  tfidf
 0        útiles    NaN
 1   econometric    NaN
 2         drive    NaN
 3        driven    NaN
 4       drivers    NaN
 5        ductos    NaN
 6         dudas    NaN
 7       dudosas    NaN
 8           due    NaN
 9         dueño    NaN
 10       dueños    NaN
 11     duración    NaN
 12        duros    NaN
 13       duties    NaN
 14          dwh    NaN
 15     dándoles    NaN
 16      débiles    NaN
 17       débito    NaN
 18          día    NaN
 19         días    NaN
 20      dólares    NaN
 21        earth    NaN
 22       ebitda    NaN
 23   ecológicas    NaN
 24      econmic    NaN,         feature  tfidf
 0        útiles    NaN
 1   econometric    NaN
 2         drive    NaN
 3        driven    NaN
 4       drivers    NaN
 5        ductos    NaN
 6         dudas    NaN
 7       dudosas    NaN
 8           due    NaN
 9         dueño    NaN
 10       dueños    NaN
 11     duración    NaN
 12        duros    NaN
 13       duties    NaN
 14          dwh