# Revisión de convocatorias de entrenamiento

Se extraen las convocatorias de la tabla general y se seleccionan aquellas que Symplicity etiqueta como pertenecientes a la carrera (campo Majors/Concentrations).

In [1]:
from observatorio_laboral.offer import OfferController
from observatorio_laboral.offer import Offer
from observatorio_laboral.offer import DateRange
from random import shuffle

# --- Query configuration -------------------------------------------------
# Keyspace and table holding the general (unreviewed) offers.
keyspace = "l4_test"
table = "all_offers"
# Only offers coming from this source are loaded.
source = "symplicity"
# NOTE(review): arguments look like (start month, start year, end month,
# end year), i.e. 01/2013 through 12/2017 -- confirm against DateRange.
date_range = DateRange(1, 2013, 12, 2017)

# --- Load ----------------------------------------------------------------
# The controller keeps the loaded offers in `oc.offers`, which the next
# cell iterates over.
oc = OfferController(keyspace, table)
oc.load_offers(source, date_range)

In [2]:
# Build a balanced training table: offers tagged with the target career plus
# an equally sized random sample of offers that are not tagged with it.
Offer.ConnectToDatabase(keyspace, "reviewed_offers")

career_offers = []
no_career_offers = []

for offer in oc.offers:
    # 'Majors/Concentrations' is a comma-separated list of career names.
    raw_majors = offer.features['Majors/Concentrations']
    careers = [name.strip() for name in raw_majors.split(",")]

    if "ECONOMÍA" not in careers:
        no_career_offers.append(offer)
    elif len(careers) <= 15:
        career_offers.append(offer)
    # Offers that list ECONOMÍA together with more than 15 careers fall
    # through both branches and are left out of either set.

# Random down-sampling of the negatives to the number of positives.
shuffle(no_career_offers)
no_career_offers = no_career_offers[:len(career_offers)]

# Negatives are written first and keep whatever `career` value they already
# carry; only the destination table is changed.
for offer in no_career_offers:
    offer.table = "reviewed_offers"
    offer.Insert()

# Positives are labelled with the career before being written.
for offer in career_offers:
    offer.career = "ECONOMÍA"
    offer.table = "reviewed_offers"
    offer.Insert()

# Clasificador de carreras


In [1]:
from observatorio_laboral.offer import OfferController
from observatorio_laboral.offer import Offer
from observatorio_laboral.offer import DateRange
from random import shuffle

# Load the labelled training offers from the "reviewed_offers" table as two
# groups (career-labelled and unlabelled) and merge them in random order.
keyspace = "l4_test"
career = "ECONOMÍA"

oc_train = OfferController(keyspace, "reviewed_offers")
# NOTE(review): arguments look like (start month, start year, end month,
# end year) -- confirm against DateRange.
train_date_range = DateRange(1, 2013, 12, 2017)
train_source = "symplicity"

# First pass: offers whose career matches `career`.
oc_train.load_offers(train_source, train_date_range, career)
positive_offers = oc_train.offers
# Sanity check: show the title of the first offer of this group.
print(positive_offers[0].features['Job Title'])

# Rebind `offers` to a fresh list before the second load so the first group
# (still referenced by `positive_offers`) is not mixed into the second.
# NOTE(review): presumably load_offers appends to `offers` -- confirm.
oc_train.offers = []
# Second pass: career="" -- presumably the offers stored without a career
# label, i.e. the sampled negatives; verify against load_offers.
oc_train.load_offers(train_source, train_date_range, career="")
negative_offers = oc_train.offers
print(negative_offers[0].features['Job Title'])

# The controller ends up holding both groups, shuffled in place.
oc_train.offers = positive_offers + negative_offers
shuffle(oc_train.offers)

01 EXPERTO EN COSTOS (SECT. SERVICIOS) - Zona: SURCO
Abogado


In [2]:
# Offer fields whose text is used as classifier input.
train_text_fields = ["Job Title", "Description", "Qualifications"]

# X: text obtained from the controller for the selected fields.
X = oc_train.get_text(train_text_fields)

# y: binary target, one entry per offer in `oc_train.offers` order --
# 1 when the offer is labelled ECONOMÍA, 0 otherwise.
y = [1 if offer.career == "ECONOMÍA" else 0 for offer in oc_train.offers]

In [3]:
from sklearn.model_selection import train_test_split
# Hold-out split. No test_size is passed, so scikit-learn's default
# proportion applies; the fixed random_state keeps the split reproducible.
split = train_test_split(X, y, random_state=42)
X_train, X_test, y_train, y_test = split

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.feature_selection import SelectKBest, chi2, SelectFromModel
from nltk.corpus import stopwords
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


from time import time

# Grid-search every (vectorizer, feature selector, classifier) combination
# on the training split and report, for each one, the best cross-validated
# score, the hold-out metrics and the winning vectorizer parameters.

# Candidate pipeline stages.
vectorizers = [TfidfVectorizer(),
              ]

feature_selectors = [SelectKBest(chi2, k=1000),
                     #SelectFromModel(OneVsRestClassifier(LinearSVC(penalty="l1", dual=False, tol=1e-3, C=10000))),
                    ]

classifiers = [LinearSVC(class_weight="balanced"),
               MultinomialNB(),
               BernoulliNB(),
              ]

# Metric optimised by the grid search. It is kept in one variable so the
# report label below always names the metric that was actually used (the
# report previously said "F1" while the search optimised precision).
scoring = "precision"

# Vectorizer search space shared by all pipelines. Commented-out entries are
# options that are currently disabled.
parameters = {
    'vec__max_df' : (0.5, 0.75, 1.0),
    #'vec__min_df' : (0.0, 0.05, 0.01),
    'vec__ngram_range' : ((1,1), (1,2), (1,3)),
    #'vec__vocabulary' : (None, vocab),
    'vec__use_idf' : (False, True),
    'vec__norm' : (None, 'l2'),
    'vec__binary' : (True, False),
    #'vec__tokenizer' : (None, word_tokenize),
    'vec__stop_words' : (None, stopwords.words('spanish')),
    #'fs__k' : (1000,),
}


for vectorizer in vectorizers:
    vec = ('vec', vectorizer)
    for feature_selector in feature_selectors:
        fs = ('fs', feature_selector)
        for classifier in classifiers:
            # Hook for classifier-specific grid entries; empty for now.
            custom_parameters = {}
            clf = ('clf', classifier)

            pipeline = Pipeline([vec, fs, clf])

            # Merge into a new dict so neither source dict is mutated.
            all_params = {**parameters, **custom_parameters}

            grid_search = GridSearchCV(pipeline, all_params,
                                       scoring=scoring,
                                       n_jobs=-1)

            print("Pipeline: ")
            print(vectorizer.__class__)
            print(feature_selector.__class__)
            print(classifier.__class__)

            t0 = time()
            grid_search.fit(X_train, y_train)
            print("done in %0.3fs" % (time() - t0))
            print()

            # best_score_ is expressed in the `scoring` metric, so label it
            # with that metric.
            print("Best CV %s score: %0.3f" % (scoring, grid_search.best_score_))

            # Hold-out evaluation with the refitted best estimator.
            y_pred = grid_search.predict(X_test)
            print("Metrics :")

            print("Accuracy: %0.3f" % accuracy_score(y_test, y_pred))
            print(classification_report(y_test, y_pred))

            print("Best parameters set:")
            best_parameters = grid_search.best_estimator_.get_params()
            for param_name in sorted(all_params):
                value = best_parameters[param_name]
                if param_name == "vec__stop_words":
                    # Print a short tag instead of the whole stop-word list.
                    shown = "None" if value is None else "spanish"
                elif param_name == "vec__vocabulary":
                    # Same idea for a (currently disabled) custom vocabulary.
                    shown = "Default" if value is None else "Reviewed"
                else:
                    shown = repr(value)
                print("\t%s: %s" % (param_name, shown))
            print("================================================================")
            print()

Pipeline: 
<class 'sklearn.feature_extraction.text.TfidfVectorizer'>
<class 'sklearn.feature_selection.univariate_selection.SelectKBest'>
<class 'sklearn.svm.classes.LinearSVC'>
done in 1278.671s

Best F1 score: 0.828
Metrics :
Accuracy: 0.797
             precision    recall  f1-score   support

          0       0.76      0.87      0.81      3120
          1       0.84      0.72      0.78      3021

avg / total       0.80      0.80      0.80      6141

Best parameters set:
	vec__binary: False
	vec__max_df: 1.0
	vec__ngram_range: (1, 2)
	vec__norm: None
	vec__stop_words: None
	vec__use_idf: True

Pipeline: 
<class 'sklearn.feature_extraction.text.TfidfVectorizer'>
<class 'sklearn.feature_selection.univariate_selection.SelectKBest'>
<class 'sklearn.naive_bayes.MultinomialNB'>


In [None]:
# Set-up for a second controller that reads from the general "all_offers"
# table (not the reviewed training table).
# NOTE(review): the `_pred` names suggest this feeds the prediction step --
# confirm against the rest of the cell.
keyspace = "l4_test"
oc_pred = OfferController(keyspace, "all_offers")
# NOTE(review): arguments look like (start month, start year, end month,
# end year), i.e. the whole of 2016 -- confirm against DateRange.
pred_date_range = DateRange(1, 2016, 12, 2016)
# A different source than the "symplicity" data used for training.
pred_source = "aptitus"
