# Revisión de convocatorias de entrenamiento

Se extraen las convocatorias de la tabla general y se seleccionan aquellas que Symplicity etiqueta como pertenecientes a la carrera (campo Majors/Concentrations).

In [None]:
from observatorio_laboral.offer import OfferController
from observatorio_laboral.offer import Offer
from observatorio_laboral.offer import DateRange
from random import shuffle

# Load the Symplicity offers from the general table and report how many were read.
# DateRange arguments are presumably (start month, start year, end month, end year) -- TODO confirm.
keyspace, table = "l4_test", "all_offers"
source = "symplicity"
date_range = DateRange(1, 2013, 12, 2017)

oc = OfferController(keyspace, table)
oc.load_offers(source, date_range)
print(len(oc.offers))

In [None]:
# Label every loaded offer as ECONOMÍA / NO-ECONOMÍA from its
# 'Majors/Concentrations' feature and store it in the career_train table.
Offer.ConnectToDatabase(keyspace, "career_train")

career_offers, no_career_offers = [], []
for offer in oc.offers:
    majors = [major.strip() for major in offer.features['Majors/Concentrations'].split(",")]
    if "ECONOMÍA" not in majors:
        no_career_offers.append(offer)
    elif len(majors) <= 40:
        # Offers that list the career together with more than 40 majors
        # end up in neither list.
        career_offers.append(offer)

# Down-sampling the negatives to twice the number of positives
# (no_career_offers[:2*len(career_offers)]) is currently disabled.
shuffle(no_career_offers)

# Negatives are inserted first, then positives.
for label, group in (("NO-ECONOMÍA", no_career_offers), ("ECONOMÍA", career_offers)):
    for offer in group:
        offer.table = "career_train"
        offer.career = label
        offer.Insert()

print(len(career_offers))
print(len(no_career_offers))

# Clasificador de carreras


In [None]:
from observatorio_laboral.offer import OfferController
from observatorio_laboral.offer import Offer
from observatorio_laboral.offer import DateRange
from random import shuffle

# Read the positive and negative training offers from career_train and
# leave the shuffled union of both in oc_train.offers.
keyspace = "l4_test"
career = "ECONOMÍA"
train_source = "symplicity"
train_date_range = DateRange(1, 2013, 12, 2017)
oc_train = OfferController(keyspace, "career_train")

# Positive class: offers stored under the career itself.
oc_train.load_offers(train_source, train_date_range, career)
positive_offers = oc_train.offers
print(positive_offers[0].features['Job Title'])

# Rebind to a fresh list so positive_offers keeps pointing at its own list.
oc_train.offers = []
oc_train.load_offers(train_source, train_date_range, "NO-%s" % career)
negative_offers = oc_train.offers
print(negative_offers[0].features['Job Title'])

oc_train.offers = positive_offers + negative_offers
shuffle(oc_train.offers)

In [None]:
# Display how many positive (career-labelled) training offers were loaded.
len(positive_offers)

In [None]:
# Display how many negative ("NO-" career) training offers were loaded.
len(negative_offers)

In [None]:
# Build the training corpus X (one preprocessed text per offer) and the
# binary label list y (1 = ECONOMÍA, 0 = anything else).
train_text_fields = ["Job Title", "Description", "Qualifications"]

# Simple text preprocessing: lower-case and turn the listed marks into spaces.
punctuations = ['•', '/', ')', '-']
translator = str.maketrans({mark: ' ' for mark in punctuations})

proc_data = [text.lower().translate(translator)
             for text in oc_train.get_text(train_text_fields)]
X = proc_data

y = [int(offer.career == "ECONOMÍA") for offer in oc_train.offers]

In [None]:
# Hold out part of the labelled data for evaluation; the fixed random_state
# keeps the split reproducible between runs.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest, chi2, SelectFromModel
from nltk.corpus import stopwords
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from nltk.tokenize import word_tokenize
from time import time



#class 'sklearn.feature_selection.univariate_selection.SelectKBest'>
#class 'sklearn.naive_bayes.MultinomialNB'>
#done in 111.597s

#Best F1 score: 0.822
#Metrics :
#Accuracy: 0.863
#             precision    recall  f1-score   support

#          0       0.86      0.99      0.92     18754
#          1       0.83      0.16      0.27      3487

#avg / total       0.86      0.86      0.82     22241

#Best parameters set:
#	vec__binary: True
#	vec__norm: 'l2'
#	vec__stop_words: None
#	vec__use_idf: False
#=============================

# Fixed baseline: binary, L2-normalised TF-IDF over uni- and bigrams feeding
# a multinomial Naive Bayes classifier. The SelectKBest(chi2, k=1000) feature
# selection step is disabled here.
baseline_steps = [
    ('vec', TfidfVectorizer(min_df=0.01, binary=True, norm='l2',
                            use_idf=True, ngram_range=(1, 2))),
    ('clf', MultinomialNB()),
]
pipeline = Pipeline(baseline_steps)

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Report accuracy and the per-class metrics on the held-out split.
print("Metrics :")
print("Accuracy: %0.3f" % accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


In [None]:
# Vocabulary learned by the first pipeline step (the TF-IDF vectorizer).
vectorizer_step = pipeline.steps[0][1]
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever this installation provides and
# keep `vocab` a plain list either way.
if hasattr(vectorizer_step, "get_feature_names_out"):
    vocab = list(vectorizer_step.get_feature_names_out())
else:
    vocab = vectorizer_step.get_feature_names()
print(len(vocab))
vocab

In [None]:
# Load the Aptitus offers to classify from all_offers and report the count.
# DateRange arguments are presumably (start month, start year, end month, end year) -- TODO confirm.
keyspace = "l4_test"
pred_source = "aptitus"
pred_date_range = DateRange(7, 2016, 6, 2017)

oc_pred = OfferController(keyspace, "all_offers")
oc_pred.load_offers(pred_source, pred_date_range)
print(len(oc_pred.offers))

In [None]:
# Every distinct feature key that appears in at least one loaded offer.
features = {feat for offer in oc_pred.offers for feat in offer.features}

In [None]:
# Display the set of feature keys collected from the loaded offers.
features

In [None]:
# Feature keys whose text is extracted for prediction.
# 'requerimientos' / 'Requisitos' use the same casing as the other prediction
# cells and the CSV export in this file; the previous 'requisitos' /
# 'Requerimientos' spellings matched no key used anywhere else here.
# NOTE(review): confirm against the key set shown by the feature-inspection cell.
pred_text_fields = ["título",
                    "descripción",
                    "requerimientos",
                    "NombreAviso",
                    "FuncionesResponsabilidades",
                    "Requisitos",
                   ]

X_pred = oc_pred.get_text(pred_text_fields)
# Show one raw sample; guarded so an empty load does not raise IndexError.
if X_pred:
    print(X_pred[0])

# Simple text preprocessing: lower-case and turn the listed marks into spaces.
punctuations = ['•', '/', ')', '-']
translator = str.maketrans("".join(punctuations), ' ' * len(punctuations))

proc_data = [text.lower().translate(translator) for text in X_pred]
X_pred = proc_data

In [None]:
# Print the feature dict of up to the first 100 loaded offers.
# Slicing (instead of indexing 0..99) avoids an IndexError when fewer than
# 100 offers were loaded.
for offer in oc_pred.offers[:100]:
    print(offer.features)
    

In [None]:
# Display the preprocessed prediction texts.
X_pred

In [None]:
# Classify every prediction text with the pipeline currently bound to `pipeline`.
y_preds = pipeline.predict(X_pred)

In [None]:
# Keep only the offers predicted as belonging to the career (label 1).
# The per-offer prediction is named `label` rather than `y`, so the training
# label list `y` built in an earlier cell is no longer overwritten.
filtered_offers = [offer
                   for offer, label in zip(oc_pred.offers, y_preds)
                   if label == 1]
cnt = len(filtered_offers)
cnt

In [None]:
import csv

# Export the offers selected by the classifier to a CSV for manual cleaning.
fieldnames = ["ID","year", "month", "source","Título", 'Descripción', 'Requerimientos','Empresa', 'Salario']

# Output column -> feature keys that may hold its value. When an offer has
# several of the keys, the one listed last wins.
column_sources = [
    ('Título', ("título", "NombreAviso")),
    ('Descripción', ("descripción", "FuncionesResponsabilidades")),
    ('Requerimientos', ("requerimientos", "Requisitos")),
    ('Empresa', ("Empresa", "empresa", "datos de la empresa", "business")),
    ('Salario', ("condiciones salariales",)),
]

# newline="" is what the csv module requires for files it writes (otherwise
# extra blank rows can appear on some platforms); the explicit encoding keeps
# the accented headers and text independent of the platform default.
with open("Data_A_Limpiar/Economia/Aptitus-2016-2-2017-1.csv", "w",
          newline="", encoding="utf-8") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for offer in filtered_offers:
        row = {
            'ID': offer.id,
            'year': offer.year,
            'month': offer.month,
            'source': offer.source,
        }
        for column, feature_keys in column_sources:
            for feature_key in feature_keys:
                if feature_key in offer.features:
                    row[column] = offer.features[feature_key]
        # Columns with no matching feature are left empty by DictWriter.
        writer.writerow(row)

# Entrenamiento

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.feature_selection import SelectKBest, chi2, SelectFromModel
from nltk.corpus import stopwords
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


from time import time

# Candidate pipeline stages; every vectorizer x selector x classifier
# combination below is tuned with its own grid search.
vectorizers = [
    TfidfVectorizer(max_df=0.5, ngram_range=(1, 1)),  # min_df=0.01 is disabled
]

feature_selectors = [
    SelectKBest(chi2, k=1000),
    # Disabled alternative:
    # SelectFromModel(OneVsRestClassifier(LinearSVC(penalty="l1", dual=False, tol=1e-3, C=10000))),
]

classifiers = [
    LinearSVC(class_weight="balanced"),
    MultinomialNB(),
    BernoulliNB(),
]

# Vectorizer settings explored by the grid search. Dimensions currently
# disabled: vec__max_df, vec__min_df, vec__ngram_range, vec__vocabulary,
# vec__tokenizer, fs__k.
parameters = {
    'vec__use_idf': (False, True),
    'vec__norm': (None, 'l2'),
    'vec__binary': (True, False),
    'vec__stop_words': (None, stopwords.words('spanish')),
}

# Parameters whose value is a long list are summarised instead of printed:
# name -> (text when the value is None, text otherwise).
summary_labels = {
    "vec__stop_words": ("None", "spanish"),
    "vec__vocabulary": ("Default", "Reviewed"),
}

for vectorizer in vectorizers:
    for feature_selector in feature_selectors:
        for classifier in classifiers:
            pipeline = Pipeline([('vec', vectorizer),
                                 ('fs', feature_selector),
                                 ('clf', classifier)])

            # Per-combination overrides (none yet), merged into a new dict so
            # `parameters` itself is never modified.
            custom_parameters = {}
            all_params = {**parameters, **custom_parameters}

            grid_search = GridSearchCV(pipeline, all_params,
                                       scoring="precision",
                                       n_jobs=-1)

            print("Pipeline: ")
            print(vectorizer.__class__)
            print(feature_selector.__class__)
            print(classifier.__class__)

            t0 = time()
            grid_search.fit(X_train, y_train)
            print("done in %0.3fs" % (time() - t0))
            print()

            # best_score_ is measured with the `scoring` metric chosen above,
            # i.e. precision; the label previously claimed it was F1.
            print("Best precision score: %0.3f" % grid_search.best_score_)

            y_pred = grid_search.predict(X_test)
            print("Metrics :")
            print("Accuracy: %0.3f" % accuracy_score(y_test, y_pred))
            print(classification_report(y_test, y_pred))

            print("Best parameters set:")
            best_parameters = grid_search.best_estimator_.get_params()
            for param_name in sorted(all_params):
                value = best_parameters[param_name]
                if param_name in summary_labels:
                    when_none, otherwise = summary_labels[param_name]
                    print("\t%s: %s" % (param_name, when_none if value is None else otherwise))
                else:
                    print("\t%s: %r" % (param_name, value))
            print("================================================================")
            print()

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest, chi2, SelectFromModel
from nltk.corpus import stopwords
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from nltk.tokenize import word_tokenize
from time import time

# Fixed pipeline: TF-IDF over 1- to 3-grams without Spanish stop words,
# followed by multinomial Naive Bayes. The SelectKBest(chi2, k=10) feature
# selection step is disabled here.
nb_steps = [
    ('vec', TfidfVectorizer(min_df=70,
                            stop_words=stopwords.words('spanish'),
                            ngram_range=(1, 3))),
    ('clf', MultinomialNB()),
]
pipeline = Pipeline(nb_steps)

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Report accuracy and the per-class metrics on the held-out split.
print("Metrics :")
print("Accuracy: %0.3f" % accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


In [None]:
# Vocabulary learned by the first pipeline step (the TF-IDF vectorizer).
vectorizer_step = pipeline.steps[0][1]
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever this installation provides and
# keep `vocab` a plain list either way.
if hasattr(vectorizer_step, "get_feature_names_out"):
    vocab = list(vectorizer_step.get_feature_names_out())
else:
    vocab = vectorizer_step.get_feature_names()
print(len(vocab))
vocab

In [None]:
# Load the Bumeran offers to classify from all_offers and report the count.
# DateRange arguments are presumably (start month, start year, end month, end year) -- TODO confirm.
keyspace = "l4_test"
pred_source = "bumeran"
pred_date_range = DateRange(1, 2016, 12, 2017)

oc_pred = OfferController(keyspace, "all_offers")
oc_pred.load_offers(pred_source, pred_date_range)
print(len(oc_pred.offers))

In [None]:
# Feature keys whose text is extracted for prediction.
pred_text_fields = [
    "título", "descripción", "requerimientos",
    "NombreAviso", "FuncionesResponsabilidades", "Requisitos",
]

# Simple text preprocessing: lower-case and turn the listed marks into spaces.
punctuations = ['•', '/', ')', '-']
translator = str.maketrans({mark: ' ' for mark in punctuations})

proc_data = [text.lower().translate(translator)
             for text in oc_pred.get_text(pred_text_fields)]
X_pred = proc_data


In [None]:
# Classify every prediction text with the pipeline currently bound to `pipeline`.
y_preds = pipeline.predict(X_pred)

In [None]:
# Keep only the offers predicted as belonging to the career (label 1).
# The per-offer prediction is named `label` rather than `y`, so the training
# label list `y` built in an earlier cell is no longer overwritten.
filtered_offers = [offer
                   for offer, label in zip(oc_pred.offers, y_preds)
                   if label == 1]
cnt = len(filtered_offers)
cnt

In [None]:
# Every distinct feature key that appears in at least one loaded offer.
features = {feat for offer in oc_pred.offers for feat in offer.features}

In [None]:
# Display the set of feature keys collected from the loaded offers.
features

In [None]:
import csv

# Export the offers selected by the classifier to a CSV for manual cleaning.
# NOTE(review): unlike the Aptitus export, no year/month/source columns are
# written, although the cells that read reviewed CSVs expect 'year' and
# 'month' -- confirm whether they should be added here.
fieldnames = ["ID","Título", 'Descripción', 'Requerimientos','Empresa', 'Salario']

# Output column -> feature keys that may hold its value. When an offer has
# several of the keys, the one listed last wins.
column_sources = [
    ('Título', ("título", "NombreAviso")),
    ('Descripción', ("descripción", "FuncionesResponsabilidades")),
    ('Requerimientos', ("requerimientos", "Requisitos")),
    ('Empresa', ("Empresa", "empresa", "datos de la empresa", "business")),
    ('Salario', ("condiciones salariales",)),
]

# newline="" is what the csv module requires for files it writes (otherwise
# extra blank rows can appear on some platforms); the explicit encoding keeps
# the accented headers and text independent of the platform default.
with open("Data_A_Limpiar/Geografia/Bumeran.csv", "w",
          newline="", encoding="utf-8") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for offer in filtered_offers:
        row = {'ID': offer.id}
        for column, feature_keys in column_sources:
            for feature_key in feature_keys:
                if feature_key in offer.features:
                    row[column] = offer.features[feature_key]
        # Columns with no matching feature are left empty by DictWriter.
        writer.writerow(row)

In [None]:
# Copy the offers marked as accepted ('A') in the review CSV from all_offers
# into the reviewed_offers table.

import csv

from cassandra.cluster import Cluster
from observatorio_laboral.offer.offer import Offer
from observatorio_laboral.offer.offer_controller import OfferController
from observatorio_laboral.offer.date_range import DateRange

# Every key is tagged with this fixed source, both when reading the CSV and
# when scanning the table.
source = "symplicity"

# Accepted (id, year, month, source) keys. A set keeps the membership test in
# the table scan below O(1) per row instead of scanning a list.
# NOTE(review): this path uses spaces ("Data A Clasificar") while the other
# cells use "Data_A_Clasificar" -- confirm which directory is right.
ids = set()
with open("Data A Clasificar/Economia.csv", newline="") as csvfile:
    for record in csv.DictReader(csvfile):
        if record['Aceptado'] == 'A':
            ids.add((record["id"], int(record['year']), int(record['month']), source))

Offer.ConnectToDatabase("l4_test", "reviewed_offers")
# The session is not used in this cell; it is kept because other cells call
# session.execute() without creating their own.
cluster = Cluster()
session = cluster.connect()

result = Offer.Query("l4_test", "all_offers", "select_all", ())

for row in result:
    # Only the key fields are read; the builtin `id` is no longer shadowed.
    if (row.id, row.year, row.month, source) in ids:
        row.table = "reviewed_offers"
        row.Insert()

# La 

In [23]:
# Copy the Bumeran offers marked as accepted ('A') in the review CSV from
# all_offers into the reviewed_offers table.
import csv

from cassandra.cluster import Cluster
from observatorio_laboral.offer.offer import Offer
from observatorio_laboral.offer.offer_controller import OfferController
from observatorio_laboral.offer.date_range import DateRange

# Every key is tagged with this fixed source, both when reading the CSV and
# when scanning the table.
source = "bumeran"

# Accepted (id, year, month, source) keys. A set keeps the membership test in
# the table scan below O(1) per row instead of scanning a list.
ids = set()
with open("Data_A_Clasificar/rBumeran2016.csv", newline="") as csvfile:
    for record in csv.DictReader(csvfile):
        if record['Aceptado'] == 'A':
            ids.add((record["ID"], int(record['year']), int(record['month']), source))

Offer.ConnectToDatabase("l4_test", "all_offers")
cluster = Cluster()
session = cluster.connect()
select_cmd = """
            SELECT * FROM l4_test.all_offers;
             """

result = session.execute(select_cmd)

for raw_row in result:
    # Wrap the raw driver row in an Offer so it can be re-inserted.
    row = Offer.ByRow("l4_test", "all_offers", raw_row)
    # Only the key fields are read; the builtin `id` is no longer shadowed.
    if (row.id, row.year, row.month, source) in ids:
        row.table = "reviewed_offers"
        row.Insert()

In [20]:
# Scratch cell: read every row of all_offers and convert each one to an
# Offer. Only the last converted offer stays bound to `offer`.
select_cmd = """
            SELECT * FROM l4_test.all_offers;
             """

result = session.execute(select_cmd)
for raw_row in result:
    offer = Offer.ByRow("l4_test", "all_offers", raw_row)

In [13]:
# Scratch cell: print the `career` attribute of the first four rows of
# `result`, which must have been produced by a previously executed cell.
i = 0
for x in result:
    # Stop once four rows have been printed.
    if i == 4 :
        break
    i+=1
    print(x.career)







In [None]:
import csv

# Collect the (id, year, month, source) key of every Aptitus offer marked as
# accepted ('A') in the review CSV. A set is used so later membership tests
# are O(1), and the builtin `id` is no longer shadowed.
source = "aptitus"
ids = set()
with open("Data_A_Clasificar/rApititus2016.csv", newline="") as csvfile:
    for record in csv.DictReader(csvfile):
        if record['Aceptado'] == 'A':
            ids.add((record["ID"], int(record['year']), int(record['month']), source))

In [None]:
            
from cassandra.cluster import Cluster
from observatorio_laboral.offer.offer import Offer
from observatorio_laboral.offer.offer_controller import OfferController
from observatorio_laboral.offer.date_range import DateRange

# Copy every offer of all_offers whose key was accepted in the review CSV
# (the `ids` collection built by the previous cell) into reviewed_offers.
Offer.ConnectToDatabase("l4_test", "reviewed_offers")
# The session is not used in this cell; it is kept because other cells call
# session.execute() without creating their own.
cluster = Cluster()
session = cluster.connect()

result = Offer.Query("l4_test", "all_offers", "select_all", ())

# Every scanned row is matched under this fixed source.
source = "aptitus"
# Local set copy: O(1) membership whatever container type `ids` is.
accepted_keys = set(ids)

copied = 0
for row in result:
    # Only the key fields are read; the builtin `id` is no longer shadowed.
    if (row.id, row.year, row.month, source) in accepted_keys:
        row.table = "reviewed_offers"
        row.Insert()
        copied += 1

# One summary line replaces the per-row placeholder debug output.
print("Copied %d offers to reviewed_offers" % copied)

In [None]:
# Move the cleaned offers to the l4_offers table (translated original note).
# NOTE(review): no destination table is set before Insert() here -- confirm
# where these offers are meant to be written.

import csv
with open("Data_A_Clasificar/rApititus2016.csv", newline="") as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        if row['Aceptado'] != 'A':
            continue
        # The cell that reads this same file uses the "ID" header; fall back
        # to "id" so a lower-case header keeps working.
        offer_id = row["ID"] if "ID" in row else row["id"]
        # Empty career key; kept in a separate name so the notebook-level
        # `career` variable is not overwritten.
        career_key = ""
        params = (row['source'], int(row['year']), int(row['month']), career_key, offer_id)
        # NOTE(review): elsewhere in this file the result of Offer.Query() is
        # iterated as a collection of rows; confirm that "select_by_id"
        # returns a single object exposing Insert().
        offer = Offer.Query("l4_test", "all_offers", "select_by_id", params)
        offer.Insert()

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest, chi2, SelectFromModel
from nltk.corpus import stopwords
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from nltk.tokenize import word_tokenize
from time import time

# Grid search over the TF-IDF settings of a LinearSVC pipeline, selecting by
# precision. The SelectKBest(chi2, k=1000) step is disabled here.
pipeline = Pipeline([
    ('vec', TfidfVectorizer(min_df=0.05, stop_words=stopwords.words('spanish'))),
    ('clf', LinearSVC()),
])

# Dimensions currently disabled: vec__max_df, vec__min_df, vec__vocabulary,
# vec__binary, vec__stop_words.
parameters = {
    'vec__ngram_range': ((1, 1), (1, 2), (1, 3)),
    'vec__use_idf': (False, True),
    'vec__norm': (None, 'l2'),
}

grid_search = GridSearchCV(pipeline, parameters, scoring="precision", n_jobs=-1)

print("Pipeline: ")
t0 = time()
grid_search.fit(X_train, y_train)
elapsed = time() - t0
print("done in %0.3fs" % elapsed)
print()

print("Best precision score: %0.3f" % grid_search.best_score_)

# Evaluate the refitted best estimator on the held-out split.
y_pred = grid_search.predict(X_test)
print("Metrics :")
print("Accuracy: %0.3f" % accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters):
    value = best_parameters[param_name]
    # Long list-valued parameters are summarised rather than printed in full.
    if param_name == "vec__stop_words":
        shown = "None" if value is None else "spanish"
    elif param_name == "vec__vocabulary":
        shown = "Default" if value is None else "Reviewed"
    else:
        shown = repr(value)
    print("\t%s: %s" % (param_name, shown))
print("================================================================")
print()

In [None]:
# Load the Aptitus offers to classify from all_offers and report the count.
# DateRange arguments are presumably (start month, start year, end month, end year) -- TODO confirm.
keyspace = "l4_test"
pred_source = "aptitus"
pred_date_range = DateRange(1, 2016, 12, 2016)

oc_pred = OfferController(keyspace, "all_offers")
oc_pred.load_offers(pred_source, pred_date_range)
print(len(oc_pred.offers))

In [None]:
# Feature keys whose text is extracted for prediction. Unlike the other
# prediction cells, no lower-casing or punctuation replacement is applied.
pred_text_fields = [
    "título", "descripción", "requerimientos",
    "NombreAviso", "FuncionesResponsabilidades", "Requisitos",
]

X_pred = oc_pred.get_text(pred_text_fields)


In [None]:
# Classify every prediction text with the best estimator found by the grid search.
y_preds = grid_search.best_estimator_.predict(X_pred)

In [None]:
# Print the text of every offer predicted as belonging to the career
# (label 1) and display how many there were.
# The loop variables are named `text` / `label`: X_pred holds texts, and
# reusing `y` would overwrite the training label list built in an earlier cell.
cnt = 0
for text, label in zip(X_pred, y_preds):
    if label == 1:
        cnt += 1
        print(text)
        print("=============================================================================================")
        print()
cnt

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.feature_selection import SelectKBest, chi2, SelectFromModel
from nltk.corpus import stopwords
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


from time import time

# Candidate pipeline stages; every vectorizer x selector x classifier
# combination below is tuned with its own grid search.
vectorizers = [
    TfidfVectorizer(),
]

feature_selectors = [
    SelectKBest(chi2, k=1000),
    # Disabled alternative:
    # SelectFromModel(OneVsRestClassifier(LinearSVC(penalty="l1", dual=False, tol=1e-3, C=10000))),
]

classifiers = [
    LinearSVC(class_weight="balanced"),
    MultinomialNB(),
    BernoulliNB(),
]

# Vectorizer settings explored by the grid search. Dimensions currently
# disabled: vec__min_df, vec__vocabulary, vec__tokenizer, fs__k.
parameters = {
    'vec__max_df': (0.5, 0.75, 1.0),
    'vec__ngram_range': ((1, 1), (1, 2), (1, 3)),
    'vec__use_idf': (False, True),
    'vec__norm': (None, 'l2'),
    'vec__binary': (True, False),
    'vec__stop_words': (None, stopwords.words('spanish')),
}

# Parameters whose value is a long list are summarised instead of printed:
# name -> (text when the value is None, text otherwise).
summary_labels = {
    "vec__stop_words": ("None", "spanish"),
    "vec__vocabulary": ("Default", "Reviewed"),
}

for vectorizer in vectorizers:
    for feature_selector in feature_selectors:
        for classifier in classifiers:
            pipeline = Pipeline([('vec', vectorizer),
                                 ('fs', feature_selector),
                                 ('clf', classifier)])

            # Per-combination overrides (none yet), merged into a new dict so
            # `parameters` itself is never modified.
            custom_parameters = {}
            all_params = {**parameters, **custom_parameters}

            grid_search = GridSearchCV(pipeline, all_params,
                                       scoring="precision",
                                       n_jobs=-1)

            print("Pipeline: ")
            print(vectorizer.__class__)
            print(feature_selector.__class__)
            print(classifier.__class__)

            t0 = time()
            grid_search.fit(X_train, y_train)
            print("done in %0.3fs" % (time() - t0))
            print()

            # best_score_ is measured with the `scoring` metric chosen above,
            # i.e. precision; the label previously claimed it was F1.
            print("Best precision score: %0.3f" % grid_search.best_score_)

            y_pred = grid_search.predict(X_test)
            print("Metrics :")
            print("Accuracy: %0.3f" % accuracy_score(y_test, y_pred))
            print(classification_report(y_test, y_pred))

            print("Best parameters set:")
            best_parameters = grid_search.best_estimator_.get_params()
            for param_name in sorted(all_params):
                value = best_parameters[param_name]
                if param_name in summary_labels:
                    when_none, otherwise = summary_labels[param_name]
                    print("\t%s: %s" % (param_name, when_none if value is None else otherwise))
                else:
                    print("\t%s: %r" % (param_name, value))
            print("================================================================")
            print()

In [None]:
keyspace = "l4_test"
oc_pred = OfferController(keyspace, "all_offers")
pred_date_range = DateRange(1, 2016, 12, 2016)
pred_source = "aptitus"
