In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import csv
import nltk
from sklearn.pipeline import Pipeline
from nltk.tokenize import word_tokenize


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

In [70]:
# Functions to visualize word vector information
def top_tfidf_feats(row, features, top_n=25):
    ''' Get top n tfidf values in row and return them with their corresponding feature names.'''
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    df = pd.DataFrame(top_feats)
    df.columns = ['feature', 'tfidf']
    return df

def top_feats_in_doc(Xtr, features, row_id, top_n=25):
    ''' Top tfidf features in specific document (matrix row) '''
    row = np.squeeze(Xtr[row_id].toarray())
    return top_tfidf_feats(row, features, top_n)

def top_mean_feats(Xtr, features, grp_ids=None, min_tfidf=0.1, top_n=25, func=np.mean):
    ''' Return the top n features that on average are most important amongst documents in rows
        indentified by indices in grp_ids. '''
    if grp_ids:
        D = Xtr[grp_ids].toarray()
    else:
        D = Xtr.toarray()

    D[D < min_tfidf] = 0
    #tfidf_means = np.nanmean(np.where(matrix!=0,matrix,np.nan),1)
    #tfidf_means = np.mean(D, axis=0)
    #tfidf_means = np.sum(D, axis=0)
    tfidf_means = func(D, axis=0)
    return top_tfidf_feats(tfidf_means, features, top_n)

def top_feats_by_class(Xtr, y, features, min_tfidf=0.1, top_n=25):
    ''' Return a list of dfs, where each df holds top_n features and their mean tfidf value
        calculated across documents with the same class label. '''
    dfs = {}
    labels = np.unique(y)
    for label in labels:
        ids = np.where(y==label)          
        feats_df = top_mean_feats(Xtr, features, ids, min_tfidf=min_tfidf, top_n=top_n)
        feats_df.label = label
        dfs[label] = feats_df        
    return dfs

def plot_tfidf_classfeats_h(dfs):
    ''' Plot the data frames returned by the function plot_tfidf_classfeats(). '''
    fig = plt.figure(figsize=(12, 9), facecolor="w")
    x = np.arange(len(dfs[0]))
    for i, df in enumerate(dfs):
        ax = fig.add_subplot(1, len(dfs), i+1)
        ax.spines["top"].set_visible(False)
        ax.spines["right"].set_visible(False)
        ax.set_frame_on(False)
        ax.get_xaxis().tick_bottom()
        ax.get_yaxis().tick_left()
        ax.set_xlabel("Mean Tf-Idf Score", labelpad=16, fontsize=14)
        ax.set_title("label = " + str(df.label), fontsize=16)
        ax.ticklabel_format(axis='x', style='sci', scilimits=(-2,2))
        ax.barh(x, df.tfidf, align='center', color='#3F5D7D')
        ax.set_yticks(x)
        ax.set_ylim([-1, x[-1]+1])
        yticks = ax.set_yticklabels(df.feature)
        plt.subplots_adjust(bottom=0.09, right=0.97, left=0.15, top=0.95, wspace=0.52)
    plt.show()
    
    
# Hamming score
def hamming_score(y_true, y_pred, normalize=True, sample_weight=None):
    '''
    Compute the Hamming score (a.k.a. label-based accuracy) for the multi-label case
    https://stackoverflow.com/q/32239577/395857
    '''
    acc_list = []
    for i in range(y_true.shape[0]):
        set_true = set( np.where(y_true[i])[0] )
        set_pred = set( np.where(y_pred[i])[0] )
        #print('\nset_true: {0}'.format(set_true))
        #print('set_pred: {0}'.format(set_pred))
        tmp_a = None
        if len(set_true) == 0 and len(set_pred) == 0:
            tmp_a = 1
        else:
            tmp_a = len(set_true.intersection(set_pred))/\
                    float( len(set_true.union(set_pred)) )
        #print('tmp_a: {0}'.format(tmp_a))
        acc_list.append(tmp_a)
    return np.mean(acc_list)    


In [25]:
# Get data to work with

from observatorio_laboral.offer.offer_controller import OfferController
from observatorio_laboral.offer.date_range import DateRange

text_fields = []
oc = OfferController(text_fields = ["Job Title", "Description", "Qualifications", "Software", "Organization Name"], table="train_offers")
date_range = DateRange(1, 2013, 5, 2017)
source = "symplicity"

# Get offers by date range
oc.load_offers(source, date_range)
print("Nro de conv. en la fecha ingresada: ", len(oc.offers))

# Get offers by career
oc.filter_offers_by_career("ECONOMÍA")
print("Nro de conv. de la carrera: ", len(oc.offers))

# Get labeled offers
oc.filter_offers_by_field("Areas")
print("Nro de conv. clasificadas: ", len(oc.offers))

Nro de conv. en la fecha ingresada:  553
Nro de conv. de la carrera:  553
Nro de conv. clasificadas:  553


In [26]:
# Optional: Ignore some classes
offer_classes = oc.get_field_labels("Areas")#, ignore=['TE', 'OI', 'EI'])

# Simple text preprocesing
offer_texts = oc.get_text()
punctuations = ['•','/', ')', '-']
translator = str.maketrans("".join(punctuations),' '*len(punctuations))

proc_data = []
for text in offer_texts:
    text = text.lower()
    text = text.translate(translator)
    proc_data.append(text)
    
offer_texts = proc_data

X = offer_texts
y = offer_classes

# Load reviewed vocabulary
vocab = set()
with open("diccionarioEconomia.csv") as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        word = row['Concepto']
        mark = row['Economía']
        
        if mark == 's':
            vocab.add(word)
            
vocab = list(vocab)
print("Tamaño del vocabulario: ", len(vocab))

Tamaño del vocabulario:  3254


In [42]:
# Transform labels to 0/1 arrays
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=40)
mlb = MultiLabelBinarizer().fit(y)
y_binary = mlb.transform(y)
y_train_binary = mlb.transform(y_train)
y_test_binary = mlb.transform(y_test)

print("Train shape: ", y_train_binary.shape)
print("Test shape: ", y_test_binary.shape)

Train shape:  (414, 7)
Test shape:  (139, 7)


In [72]:
vec = CountVectorizer(vocabulary=vocab, ngram_range=(1,4))
Xtr = vec.fit_transform(X)
features = vec.get_feature_names()

for idx, label in enumerate(mlb.classes_):
    class_y_binary = y_binary[:,idx]
    ids = np.where(class_y_binary==1)
    df = top_mean_feats(Xtr, features, ids, top_n=20, func=np.mean)
    print(df)
    break

               feature     tfidf
0             comercio  1.571429
1    comercio exterior  1.142857
2        importaciones  0.857143
3          exportación  0.857143
4             análisis  0.857143
5           importados  0.714286
6                 word  0.714286
7      internacionales  0.714286
8            logística  0.714286
9          financieros  0.714286
10               banca  0.571429
11               prima  0.571429
12               excel  0.571429
13               point  0.571429
14               datos  0.571429
15       materia prima  0.571429
16          económicas  0.571429
17  análisis económico  0.571429
18           económico  0.571429
19         seguimiento  0.571429


In [73]:
vec = CountVectorizer(vocabulary=vocab, ngram_range=(1,4))
Xtr = vec.fit_transform(X)
features = vec.get_feature_names()

for idx, label in enumerate(mlb.classes_):
    class_y_binary = y_binary[:,idx]
    ids = np.where(class_y_binary==1)
    df = top_mean_feats(Xtr, features, ids, top_n=20, func=np.sum)
    print(df)
    break

               feature  tfidf
0             comercio     11
1    comercio exterior      8
2        importaciones      6
3          exportación      6
4             análisis      6
5           importados      5
6                 word      5
7      internacionales      5
8            logística      5
9          financieros      5
10               banca      4
11               prima      4
12               excel      4
13               point      4
14               datos      4
15       materia prima      4
16          económicas      4
17  análisis económico      4
18           económico      4
19         seguimiento      4


In [80]:
vec = TfidfVectorizer(vocabulary=vocab, ngram_range=(1,4))
Xtr = vec.fit_transform(X)
features = vec.get_feature_names()

for idx, label in enumerate(mlb.classes_):
    class_y_binary = y_binary[:,idx]
    ids = np.where(class_y_binary==1)
    df = top_mean_feats(Xtr, features, ids, top_n=20, func=np.mean)
    print(df)
    break

                    feature     tfidf
0                  comercio  0.183162
1         comercio exterior  0.138860
2               exportación  0.097987
3                importados  0.087529
4        análisis económico  0.081229
5                económicas  0.078690
6             importaciones  0.077289
7           internacionales  0.075479
8                 económico  0.068509
9                 logística  0.060483
10     elaboración de bases  0.059112
11   paquetes econométricos  0.059112
12  evaluación de políticas  0.059112
13  evaluaciones económicas  0.059112
14      impacto regulatorio  0.059112
15                    banca  0.057877
16                 política  0.056874
17  estadísticas económicas  0.056377
18      análisis de impacto  0.056377
19   economía internacional  0.056377


In [207]:
for label in mlb.classes_:
    print(label)

EI
EM
FI
MC
OI
PP
TE


In [278]:
from sklearn.metrics import classification_report

MAX = 100

def training_classifiers():
    vec = TfidfVectorizer(vocabulary=vocab, ngram_range=(1,4))
    Xtr = vec.fit_transform(X)
    features = vec.get_feature_names()

    class_features = {}
    class_y_binary = {}
    class_y_train_binary = {}
    class_y_test_binary = {}
    class_y_pred_binary = {}
    class_prob_pred = {}
    class_labels = np.zeros(shape=(len(X_test), len(mlb.classes_)))
    class_probs = np.zeros(shape=(len(X_test), len(mlb.classes_)))
    
    for idx, label in enumerate(mlb.classes_):
        class_y_binary[label] = y_binary[:,idx]
        ids = np.where(class_y_binary[label]==1)
        class_features[label] = top_mean_feats(Xtr, features, ids, top_n=30, func=np.sum)['feature']
        
        class_y_train_binary[label] = y_train_binary[:,idx]
        
        x1, x2, y1, y2 = train_test_split(X_train, class_y_train_binary[label], random_state=40)
        
        max_fs = 0
        max_clf = None
        max_cp = None
        max_prior = None
        cp_list = list(range(1,MAX))
        cp_list.append(None)
        none_cp = None
        
        for class_prior in cp_list:
            if class_prior is None:                
                pipeline = Pipeline([
                    ('vec', TfidfVectorizer(vocabulary=class_features[label], ngram_range=(1,4))),
                    ('clf', MultinomialNB())
                ])
            else:
                pipeline = Pipeline([
                    ('vec', TfidfVectorizer(vocabulary=class_features[label], ngram_range=(1,4))),
                    ('clf', MultinomialNB(class_prior= [class_prior, MAX - class_prior]))
                ])
            
            pipeline.fit(x1, y1)
            yp = pipeline.predict(x2)
            
            fs = f1_score(y2, yp, average="binary")
            
            if fs > max_fs:
                max_fs = fs
                max_clf = pipeline
                max_cp = classification_report(y2, yp)
                max_prior = class_prior
                
            #pr = pipeline.predict_proba(x2)            
            #print(classification_report(y2, yp))
            #print(f1_score(y2, yp, average="binary"))
            #break
            
          
        print("Label: ", label)
        print(max_prior)
        print(max_cp)
        print()
        
        class_y_test_binary[label] = y_test_binary[:, idx]
        class_y_pred_binary[label] = max_clf.predict(X_test)
        L = pipeline.predict_proba(X_test)[:,1]
        class_probs[:, idx] = L
        L[L>=0.5] = 1
        L[L<0.5] = 0
        class_labels[:, idx] = L
        
    return class_labels, class_probs

In [279]:
y_pred_binary, y_probs = training_classifiers()

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Label:  EI
80
             precision    recall  f1-score   support

          0       0.99      1.00      1.00       102
          1       1.00      0.50      0.67         2

avg / total       0.99      0.99      0.99       104




  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Label:  EM
36
             precision    recall  f1-score   support

          0       0.97      0.40      0.56        93
          1       0.15      0.91      0.26        11

avg / total       0.89      0.45      0.53       104




  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Label:  FI
33
             precision    recall  f1-score   support

          0       0.81      0.51      0.63        43
          1       0.73      0.92      0.81        61

avg / total       0.76      0.75      0.74       104




  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Label:  MC
70
             precision    recall  f1-score   support

          0       0.90      0.94      0.92        84
          1       0.69      0.55      0.61        20

avg / total       0.86      0.87      0.86       104




  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Label:  OI
74
             precision    recall  f1-score   support

          0       0.98      0.98      0.98        99
          1       0.60      0.60      0.60         5

avg / total       0.96      0.96      0.96       104




  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Label:  PP
37
             precision    recall  f1-score   support

          0       0.88      0.43      0.58        49
          1       0.65      0.95      0.77        55

avg / total       0.76      0.70      0.68       104




  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Label:  TE
71
             precision    recall  f1-score   support

          0       0.98      0.97      0.97       100
          1       0.40      0.50      0.44         4

avg / total       0.96      0.95      0.95       104




In [314]:
def train_pred_classifiers(X_train, y_train_binary, X_test, class_priors):
    vec = TfidfVectorizer(vocabulary=vocab, ngram_range=(1,4))
    Xtr = vec.fit_transform(X)
    features = vec.get_feature_names()

    class_features = {}
    class_y_binary = {}
    class_y_train_binary = {}
    class_y_test_binary = {}
    class_y_pred_binary = {}
    class_prob_pred = {}
    class_labels = np.zeros(shape=(len(X_test), len(mlb.classes_)))
    class_probs = np.zeros(shape=(len(X_test), len(mlb.classes_)))
    classifiers = {}
    
    for idx, label in enumerate(mlb.classes_):
        class_y_binary[label] = y_binary[:,idx]
        ids = np.where(class_y_binary[label]==1)
        class_features[label] = top_mean_feats(Xtr, features, ids, top_n=30, func=np.sum)['feature']        
        class_y_train_binary[label] = y_train_binary[:,idx]
        
        if class_priors[idx]:
            pipeline = Pipeline([
                ('vec', TfidfVectorizer(vocabulary=class_features[label], ngram_range=(1,4))),
                ('clf', MultinomialNB(class_prior = [class_priors[idx], MAX-class_priors[idx]]))
            ])
        else:
            pipeline = Pipeline([
                ('vec', TfidfVectorizer(vocabulary=class_features[label], ngram_range=(1,4))),
                ('clf', MultinomialNB())
            ])
            
        
        pipeline.fit(X_train, class_y_train_binary[label])
        classifiers[label] = pipeline        
        class_y_pred_binary[label] = pipeline.predict(X_test)
        L = pipeline.predict_proba(X_test)[:,1]
        class_probs[:, idx] = L
        L[L>=0.5] = 1
        L[L<0.5] = 0
        class_labels[:, idx] = L
        
    return classifiers, class_labels, class_probs

def predict(classifiers, mlb, X_test):
    class_y_pred_binary = {}
    class_labels = np.zeros(shape=(len(X_test), len(mlb.classes_)))
    class_probs = np.zeros(shape=(len(X_test), len(mlb.classes_)))
    
    for idx, label in enumerate(mlb.classes_):        
        class_y_pred_binary[label] = pipeline.predict(X_test)
        L = pipeline.predict_proba(X_test)[:,1]
        class_probs[:, idx] = L
        L[L>=0.5] = 1
        L[L<0.5] = 0
        class_labels[:, idx] = L
        
    return class_labels, class_probs

In [315]:
class_priors = [None, None, None, None, None, None, None]

classifiers, y_pred_binary, probs = train_pred_classifiers(X, y_binary, X_test, class_priors)

In [319]:
import pickle

with open("Pickle/classifiersEconomia.p", "wb") as file:
    pickle.dump(classifiers, file)

with open("Pickle/binarizerEconomia.p", "wb") as file:
    pickle.dump(mlb, file)


In [310]:
print("Metrics :")            
print("Accuracy: %0.3f" %  accuracy_score(y_test_binary, y_pred_binary))
print("F1-micro: %0.3f" %  f1_score(y_test_binary, y_pred_binary, average='micro'))
print("F1-macro: %0.3f" %  f1_score(y_test_binary, y_pred_binary, average='macro'))            
print("Hamming: %0.3f" %  hamming_score(y_test_binary, y_pred_binary))

Metrics :
Accuracy: 0.504
F1-micro: 0.723
F1-macro: 0.320
Hamming: 0.643


  'precision', 'predicted', average, warn_for)


In [280]:
print("Metrics :")            
print("Accuracy: %0.3f" %  accuracy_score(y_test_binary, y_pred_binary))
print("F1-micro: %0.3f" %  f1_score(y_test_binary, y_pred_binary, average='micro'))
print("F1-macro: %0.3f" %  f1_score(y_test_binary, y_pred_binary, average='macro'))            
print("Hamming: %0.3f" %  hamming_score(y_test_binary, y_pred_binary))

Metrics :
Accuracy: 0.439
F1-micro: 0.674
F1-macro: 0.296
Hamming: 0.583


  'precision', 'predicted', average, warn_for)


In [281]:
cnt = 0
for y in y_pred_binary:
    if 1 not in y:
        cnt+= 1
cnt
       

16

In [283]:
idx = 2
for yt, yp, prob in zip(y_test_binary, y_pred_binary, y_probs):
    print(yt[idx], yp[idx], prob[idx])

1 1.0 0.854486686772
0 0.0 0.406624389406
1 1.0 0.610661428723
1 1.0 0.679818222266
0 1.0 0.750743136732
0 0.0 0.258567130961
1 0.0 0.413287069095
1 0.0 0.495515716266
0 0.0 0.269190339706
0 0.0 0.245371681249
0 0.0 0.440523884174
0 0.0 0.279478821329
1 1.0 0.906438153654
0 1.0 0.581514264412
0 0.0 0.3357188407
1 1.0 0.851352450713
1 1.0 0.816925137029
1 0.0 0.403610083165
0 0.0 0.490858346051
0 0.0 0.315097628377
1 1.0 0.654295791639
1 1.0 0.761071460708
0 1.0 0.570216827544
1 1.0 0.718347269071
1 1.0 0.815318643227
1 1.0 0.63276262967
1 1.0 0.665291912322
0 1.0 0.558064516129
0 1.0 0.558064516129
0 0.0 0.43242060117
1 0.0 0.357685893792
1 1.0 0.847580684452
0 0.0 0.40933578799
1 1.0 0.685941986918
1 0.0 0.402380284661
0 0.0 0.235655108398
1 0.0 0.396693556646
1 1.0 0.912366210471
1 1.0 0.7087047328
0 0.0 0.245253143862
1 0.0 0.484925112082
1 1.0 0.624305487582
1 1.0 0.805849788707
0 0.0 0.245164432655
0 0.0 0.214891565143
0 1.0 0.505512213663
1 1.0 0.815384154749
0 0.0 0.32214557825


In [None]:
dfs = top_feats_by_class(Xtr, y, features, top_n=30)

In [54]:
top_feats_in_doc(Xtr, features, 1, 10)

Unnamed: 0,feature,tfidf
0,investigación,5
1,proyectos,3
2,desarrollo,2
3,investigación económica,1
4,mining,1
5,participación en proyectos,1
6,públicos,1
7,tecnológicas,1
8,excel,1
9,económica,1


In [None]:
offers_by_label = {}
for doc in X_train:
    
    
    

    




In [48]:
vocab_by_label = {}





In [43]:
from sklearn.linear_model import RidgeClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.linear_model import RidgeClassifierCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.feature_selection import SelectKBest, chi2, SelectFromModel
from nltk.corpus import stopwords
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer

classificadores = {}

pipeline = Pipeline([
    ('vec',TfidfVectorizer()),
    ('fs', SelectKBest(chi2, k=1000)),
    ('clf', MultinomialNB()),
])    

for idx, label in enumerate(mlb.classes_):    
    print(label)
    classificadores[label] = Pipeline([
        ('vec', TfidfVectorizer(max_df=0.5)),
        ('clf', BernoulliNB()),        
    ])
    classificadores[label].fit(X_train, y_train_binary[:,idx])
    

EI
EM
FI
MC
OI
PP
TE


In [46]:
y_pred_binary = classificadores['EI'].predict(X_train)

In [47]:
for t, p in zip(y_test_binary[:,0], y_pred_binary):
    print(t, p)

0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
1 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
1 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
1 0
0 0


In [37]:
y_test_binary=y_test_binary[:,0]

In [38]:
print("Metrics :")            
print("Accuracy: %0.3f" %  accuracy_score(y_test_binary, y_pred_binary))
print("F1-micro: %0.3f" %  f1_score(y_test_binary, y_pred_binary, average='micro'))
print("F1-macro: %0.3f" %  f1_score(y_test_binary, y_pred_binary, average='macro'))            
print("Hamming: %0.3f" %  hamming_score(y_test_binary, y_pred_binary))

Metrics :
Accuracy: 0.978
F1-micro: 0.978
F1-macro: 0.495
Hamming: 0.978


  'precision', 'predicted', average, warn_for)


In [6]:
y_test_binary[:,0]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0])

In [7]:
y_train_binary[:,0]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0,

In [None]:
class Maximizador()