In [None]:
from google.colab import drive
drive.mount("/content/gdrive")

In [None]:
import pandas as pd

In [None]:
import pandas as pd
df = pd.read_csv('/content/gdrive/My Drive/Mestrado//second_level_preprocessed.csv')

In [None]:
from ast import literal_eval
from collections import defaultdict

# Histogram: number of labels in a document -> number of documents.
# At this point in the notebook `df.labels` has not been parsed yet (that happens
# in a later cell with literal_eval), so a raw cell is the *string* form of the
# label list and len() on it would count characters, not labels.
# Strings are parsed here; values that are already lists pass through unchanged.
result = defaultdict(int)
for labels in df.labels:
    if isinstance(labels, str):
        labels = literal_eval(labels)
    result[len(labels)] += 1

In [None]:
result

In [None]:
import matplotlib.pyplot as plt

# Bar chart of the histogram stored in `result`:
# x = number of labels attached to a document, y = how many documents have that many.
names, values = list(result.keys()), list(result.values())

plt.bar(names, values)
plt.xticks(rotation=90)
plt.xlabel('number of labels / Document')
plt.ylabel('frequency')
plt.show()

In [None]:
# `result` is a plain defaultdict and has no .plot(); wrap it in a Series
# (ordered by label count) so pandas can draw the bar chart.
pd.Series(result).sort_index().plot(kind='bar')

In [None]:
df.shape

In [None]:
import nltk
nltk.download('omw-1.4')

In [None]:
import string
regular_punct = list(string.punctuation)
def remove_punctuation(text,punct_list):
    """Replace each entry of `punct_list` found in `text` with a single space.

    Entries are substituted one after another, in list order. The result is
    stripped of leading/trailing whitespace; runs of inner spaces are kept.
    """
    cleaned = text
    for mark in punct_list:
        # str.replace leaves the string untouched when `mark` does not occur
        cleaned = cleaned.replace(mark, ' ')
    return cleaned.strip()

df['punct'] = df['text_corrected'].apply(lambda x : remove_punctuation(x, regular_punct)).apply(lambda x : " ".join(x.split()))



from nltk.stem.porter import PorterStemmer

porter_stemmer = PorterStemmer()

def stem_sentences(sentence):
    """Stem every whitespace-separated token of `sentence` with the module-level
    `porter_stemmer` and return the stems joined by single spaces."""
    return ' '.join(map(porter_stemmer.stem, sentence.split()))

df['porter_original'] = df['text_corrected'].apply(stem_sentences)
df['porter_punct'] = df['punct'].apply(stem_sentences)


import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

#example text text = 'What can I say about this place. The staff of these restaurants is nice and the eggplant is not bad'

class Splitter(object):
    """
    Break a document into sentences, then tokenize each sentence into words.
    """
    def __init__(self):
        # sentence splitter loaded from the NLTK punkt resource, plus a Treebank word tokenizer
        self.splitter = nltk.data.load('tokenizers/punkt/english.pickle')
        self.tokenizer = nltk.tokenize.TreebankWordTokenizer()

    def split(self,text):
        """
        Return one list of tokens per sentence of `text`, i.e. a list of lists
        shaped like [['What', 'can', 'I', 'say', 'about', 'this', 'place', '.'], ...]
        """
        # sentences first, then word tokens inside each sentence
        return list(map(self.tokenizer.tokenize, self.splitter.tokenize(text)))


class LemmatizationWithPOSTagger(object):
    """POS-tag tokenized sentences and lemmatize every token accordingly.

    Parameters
    ----------
    lemmatizer : object exposing ``lemmatize(word, pos)``, optional
        Lemmatizer to use. When omitted a ``WordNetLemmatizer`` is created, so
        the class no longer relies on a module-level ``lemmatizer`` variable
        being defined before ``pos_tag`` is called.
    """
    def __init__(self, lemmatizer=None):
        self.lemmatizer = WordNetLemmatizer() if lemmatizer is None else lemmatizer

    def get_wordnet_pos(self,treebank_tag):
        """
        Map a Treebank POS tag to the WordNet POS constant (a, n, r, v)
        expected by the lemmatizer, based on the tag's first letter.
        """
        by_initial = {
            'J': wordnet.ADJ,
            'V': wordnet.VERB,
            'N': wordnet.NOUN,
            'R': wordnet.ADV,
        }
        # Any other tag (or an empty one) falls back to NOUN, as before
        return by_initial.get(treebank_tag[:1], wordnet.NOUN)

    def pos_tag(self,tokens):
        """
        Tag and lemmatize tokenized sentences.

        `tokens` is a list of sentences, each a list of word tokens. Returns a
        list with the same nesting whose items are
        ``(original word, lemmatized word, [POS tag])`` triples, e.g.
        ``('What', 'What', ['WP'])``.
        """
        # one nltk.pos_tag call per sentence -> [(word, tag), ...]
        tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokens]

        return [
            [
                (word, self.lemmatizer.lemmatize(word, self.get_wordnet_pos(tag)), [tag])
                for (word, tag) in tagged
            ]
            for tagged in tagged_sentences
        ]

lemmatizer = WordNetLemmatizer()
splitter = Splitter()
lemmatization_using_pos_tagger = LemmatizationWithPOSTagger()

#step 1 split document into sentence followed by tokenization

def lemmatise(text):
  """Return `text` with every token replaced by its lemma, space-separated.

  Uses the module-level `splitter` (sentence split + tokenization) and
  `lemmatization_using_pos_tagger` (POS-aware lemmatization).
  """
  sentences = splitter.split(text)
  tagged = lemmatization_using_pos_tagger.pos_tag(sentences)
  # every entry is (word, lemma, [tag]); only the lemma is kept
  return " ".join(entry[1] for sentence in tagged for entry in sentence)


df['lemma_original'] = df['text_corrected'].apply(lemmatise)
df['lemma_punct'] = df['punct'].apply(lemmatise)

In [None]:
! pip install scikit-multilearn
! pip install iterative-stratification

In [None]:
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, BatchNormalization, Input
import tensorflow as tf


def create_model(input_dim, output_dim, dropout, hidden_size):
    """Build (but do not compile) a one-hidden-layer feed-forward network.

    Parameters
    ----------
    input_dim : int or tuple/list of int
        Number of input features, or a full per-sample input shape.
    output_dim : int
        Number of output units; each one has its own sigmoid activation.
    dropout : float
        Dropout rate applied after the hidden layer.
    hidden_size : int
        Number of units in the hidden ReLU layer.

    Returns
    -------
    An uncompiled Keras ``Model``; the caller chooses loss and optimizer.
    """
    # Keras documents `shape` as a tuple; a bare int is not accepted by every
    # Keras version, so always hand over a tuple.
    input_shape = tuple(input_dim) if isinstance(input_dim, (tuple, list)) else (input_dim,)

    inp = Input(shape=input_shape)
    x = Dense(hidden_size, activation='relu', kernel_initializer= tf.keras.initializers.HeNormal(), bias_initializer='zeros')(inp)
    # training=True keeps dropout active at prediction time too, so repeated
    # predict() calls on the same input give different (sampled) outputs.
    x = Dropout(dropout)(x, training=True)
    out = Dense(output_dim, activation='sigmoid',kernel_initializer= tf.keras.initializers.HeNormal(), bias_initializer='zeros')(x)
    model = Model(inputs = inp, outputs = out)

    return model



In [None]:
from ast import literal_eval

# Parse the stringified label lists into real Python objects.
# Guarded with isinstance so the cell can be re-run: literal_eval raises on
# values that are no longer strings, so those are left untouched.
df['labels'] = df['labels'].apply(lambda value: literal_eval(value) if isinstance(value, str) else value)

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np
#### one-hot encoding of the labels ########
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['labels'])

#### get the text column

X = df['text_processed']

In [None]:
X.shape, y.shape

In [None]:
import os
import random
def tf_seed(seed=0):
    """Seed the Python, NumPy and TensorFlow RNGs and request deterministic TF ops.

    Parameters
    ----------
    seed : int, default 0
        Seed handed to ``random``, ``numpy.random`` and ``tf.random``.
    """
    # Exported for child processes; the hash seed of the already-running
    # interpreter cannot be changed from inside it.
    os.environ['PYTHONHASHSEED'] = str(seed)
    # TF_DETERMINISTIC_OPS is an on/off switch, not a seed. The previous
    # str(seed) turned it OFF ("0") for the default seed of 0.
    os.environ["TF_DETERMINISTIC_OPS"] = "1"
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

In [None]:
tf_seed()

In [None]:
from tensorflow.python.ops.script_ops import numpy_function
from keras import backend as K

def calculating_class_weights(y_true, verbose=True):
    """Compute 'balanced' class weights for every column of a binary indicator matrix.

    Parameters
    ----------
    y_true : 2-D array indexable as ``y_true[:, i]``
        One column per label, values 0/1.
    verbose : bool, default True
        Print the number of columns, each row of weights and the final matrix
        (the output this function has always produced). Pass False to silence it.

    Returns
    -------
    numpy.ndarray of shape (n_columns, 2)
        ``weights[i, 0]`` is the weight of class 0 and ``weights[i, 1]`` the
        weight of class 1 for column ``i``.
    """
    from sklearn.utils.class_weight import compute_class_weight

    number_dim = np.shape(y_true)[1]
    if verbose:
        print(number_dim)

    # scikit-learn documents `classes` as an ndarray, so build one once
    # instead of passing a Python list on every call.
    classes = np.array([0, 1])
    weights = np.empty([number_dim, 2])
    for i in range(number_dim):
        # computed once per column (it used to be computed twice: once to print, once to store)
        weights[i] = compute_class_weight(class_weight='balanced', classes=classes, y=y_true[:, i])
        if verbose:
            print(weights[i])

    if verbose:
        print(weights)
    return weights


def get_weighted_loss(weights):
    """Return a Keras loss: binary cross-entropy scaled by per-label class weights.

    `weights` is indexed as ``weights[:, 0]`` (weight used where the target is 0)
    and ``weights[:, 1]`` (weight used where the target is 1).
    """
    def weighted_loss(y_true, y_pred):
        # w0**(1-y) * w1**y picks w0 when y == 0 and w1 when y == 1
        per_label_weight = (weights[:,0]**(1-y_true))*(weights[:,1]**(y_true))
        scaled = per_label_weight*K.binary_crossentropy(y_true, y_pred)
        return K.mean(scaled, axis=-1)
    return weighted_loss

In [None]:
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize, word_tokenize 
import numpy as np

def average_sentences(X):
    """Represent each document in `X` by the mean of its sentence embeddings.

    Every document is split with `sent_tokenize`, the sentences are passed to
    the module-level `embed` model, and the resulting vectors are averaged
    over the sentence axis. Returns a NumPy array with one row per document.
    """
    def _document_vector(document):
        sentence_vectors = embed(sent_tokenize(document)).numpy()
        return np.mean(sentence_vectors, axis=0)

    return np.array([_document_vector(document) for document in X])




In [None]:
!pip install gensim

In [None]:
import gensim
import gensim.downloader
embedding = gensim.downloader.load('glove-twitter-50')

In [None]:
embedding_twitter200 = gensim.downloader.load('glove-twitter-200')

In [None]:
embedding_word2vec = gensim.downloader.load('word2vec-google-news-300')

In [None]:
embedding_fast = gensim.downloader.load('fasttext-wiki-news-subwords-300')

In [None]:
import tensorflow_hub as hub

embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

In [None]:
def vectorize_sentences(X):
    """Embed `X` with the module-level `embed` model and return the result as a NumPy array."""
    return embed(X).numpy()


In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize 

def _embedding_dim(embedding):
  """Return the vector length of `embedding` (its `vector_size`, else measured from a stored vector)."""
  size = getattr(embedding, "vector_size", None)
  if size is not None:
    return int(size)
  # plain mapping word -> vector: measure the first stored vector
  for vector in embedding.values():
    return len(vector)
  raise ValueError("cannot infer the embedding dimension from an empty embedding")


def get_embedding_result(word, embedding, oov_value=1.0):
  """Look up `word` in `embedding`, with a vector-shaped fallback for unknown words.

  Parameters
  ----------
  word : str
      Token to look up.
  embedding : mapping-like
      Supports ``word in embedding`` and ``embedding[word]``. The vector length
      is read from ``embedding.vector_size`` when present, otherwise from a
      stored vector.
  oov_value : float, default 1.0
      Fill value of the vector returned for out-of-vocabulary words. The
      default reproduces what the former scalar ``1`` amounted to once NumPy
      broadcast it during averaging; pass 0.0 for a zero vector.

  Returns
  -------
  The stored vector for `word`, or a 1-D float32 array of the embedding
  dimension filled with `oov_value`.
  """
  if word in embedding:
    return embedding[word]
  # A bare scalar here made the per-text list ragged, which np.average rejects
  # on recent NumPy releases; always hand back a vector of the right length.
  return np.full(_embedding_dim(embedding), oov_value, dtype=np.float32)


def _mean_word_vectors(texts, embeddings_index):
  """Return, for each text, the unweighted average of its tokens' vectors."""
  return [
      np.average(
          [get_embedding_result(token, embeddings_index) for token in word_tokenize(text)],
          axis = 0,
      )
      for text in texts
  ]


def get_gloVe(X_train, X_test, embeddings_index):
  """Turn train and test texts into document vectors by averaging word vectors.

  Each text is tokenized with `word_tokenize`, every token is looked up through
  `get_embedding_result`, and the vectors are averaged over the tokens.
  Returns ``(train_vectors, test_vectors)``, two lists with one entry per text.
  """
  X_train_vector = _mean_word_vectors(X_train, embeddings_index)
  X_test_vector = _mean_word_vectors(X_test, embeddings_index)
  return X_train_vector, X_test_vector


from sklearn.feature_extraction.text import TfidfVectorizer

def _idf_weighted_vectors(texts, embeddings_index, idf_dict):
  """Return, for each text, the IDF-weighted average of its tokens' vectors."""
  vectors = []
  for text in texts:
    tokens = word_tokenize(text)  # tokenize once, reuse for weights and vectors
    # tokens absent from the fitted vocabulary get a neutral weight of 1
    weights = [idf_dict.get(token, 1) for token in tokens]
    words = [get_embedding_result(token, embeddings_index) for token in tokens]
    vectors.append(np.average(words, axis = 0, weights = weights))
  return vectors


def get_gloVe_tf_idf(X_train, X_test, embeddings_index):
  """Turn train and test texts into IDF-weighted averages of word vectors.

  A `TfidfVectorizer` is fitted on `X_train` only; its IDF values weight the
  word vectors of both the train and the test texts.

  Returns ``(train_vectors, test_vectors)``, two lists with one entry per text.
  """
  tfidf = TfidfVectorizer()
  tfidf.fit(X_train)

  # get_feature_names() was removed in scikit-learn 1.2 in favour of
  # get_feature_names_out(); fall back only on releases that predate it.
  if hasattr(tfidf, "get_feature_names_out"):
    vocabulary = tfidf.get_feature_names_out()
  else:
    vocabulary = tfidf.get_feature_names()
  idf_dict = dict(zip(vocabulary, tfidf.idf_))

  train_vector = _idf_weighted_vectors(X_train, embeddings_index, idf_dict)
  test_vector = _idf_weighted_vectors(X_test, embeddings_index, idf_dict)
  return train_vector, test_vector


In [None]:
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer
from imblearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, multilabel_confusion_matrix, accuracy_score, hamming_loss, jaccard_score, precision_score, recall_score
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from collections import defaultdict

import warnings

from sklearn.metrics import confusion_matrix, f1_score

warnings.simplefilter(action='ignore', category=FutureWarning)

msss = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)
counter = 1
# mean weighted F1 over the stochastic passes -> [dropout, hidden_layer] that produced it
store = defaultdict(float)

thresholds = np.arange(0, 1, 0.005)
RANGE_VALUES = 100  # stochastic forward passes per configuration (create_model keeps dropout active)


def to_labels(pos_probs, threshold):
    """Binarize probabilities: 1 where ``pos_probs >= threshold``, else 0."""
    return (pos_probs >= threshold).astype('int')


for train_index, test_index in msss.split(np.array(X), np.array(y)):
    # ---- split-level work: identical for every hyper-parameter combination,
    # ---- so it is done once here instead of once per (dropout, hidden_layer) pair
    X_train_text, X_test_text = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Count-vectorizer pipeline. It does not feed the features below, but the
    # name is kept because the next grid-search cell inspects `text_clf`.
    text_clf = Pipeline([
        ('vect', CountVectorizer()),
    ])

    # Features: IDF-weighted average of the word vectors in `embedding`
    X_train, X_test = get_gloVe_tf_idf(X_train_text, X_test_text, embedding)
    X_train = np.array(X_train)
    X_test = np.array(X_test)
    print(X_train.shape)

    class_weights = calculating_class_weights(y_train)
    print(class_weights)
    print(y_train.shape, y_test.shape)

    y_train_f32 = tf.cast(y_train, tf.float32)
    y_test_f32 = tf.cast(y_test, tf.float32)

    for dropout in [0., 0.1, 0.3, 0.5]:
        for hidden_layer in [16, 32, 64, 128, 256]:
            model = create_model(X_train.shape[1], y.shape[1], dropout=dropout, hidden_size=hidden_layer)
            opt = tf.keras.optimizers.Adam(learning_rate=3e-4)
            model.compile(optimizer=opt, loss=get_weighted_loss(class_weights))

            # NOTE(review): the held-out split is used as validation data here and
            # again below to choose the decision threshold, so the reported
            # scores are optimistic estimates.
            model.fit(X_train, y_train_f32, epochs=20, validation_data=(X_test, y_test_f32))

            # Repeated predictions differ because dropout is sampled on every call
            mc_predictions = [model.predict(X_test) for _ in range(RANGE_VALUES)]

            accs = []
            f1_scores = []
            recall_scores = []
            precision_scores = []
            matrix_sum = np.zeros((y.shape[1],2,2))

            for y_p in mc_predictions:
                # choose the threshold that maximizes subset accuracy for this pass
                scores = [accuracy_score(y_test, to_labels(y_p, t)) for t in thresholds]
                ix = np.argmax(scores)
                # Binarize with the same rule (>=) that was used to select the
                # threshold; the final prediction previously used a strict '>'.
                y_predict = to_labels(y_p, thresholds[ix])
                accs.append(scores[ix])
                f1_scores.append(f1_score(y_test, y_predict, average="weighted"))
                recall_scores.append(recall_score(y_test, y_predict, average="weighted"))
                precision_scores.append(precision_score(y_test, y_predict, average="weighted"))
                matrix_sum += multilabel_confusion_matrix(y_test, y_predict)

            mean_f1 = sum(f1_scores)/len(f1_scores)
            print(counter, "done", mean_f1)
            counter += 1

            store[mean_f1] = [dropout, hidden_layer]

        

In [None]:
store


In [None]:
store

In [None]:
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer
from imblearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.multiclass import OneVsRestClassifier ### does not take into account label correlations
from sklearn.svm import SVC
from sklearn.metrics import classification_report, multilabel_confusion_matrix, accuracy_score, hamming_loss, jaccard_score

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

#from sklearn.multioutput import ClassifierChain

from sklearn.utils import compute_class_weight
from skmultilearn.problem_transform import BinaryRelevance
from keras.wrappers.scikit_learn import KerasClassifier

from skmultilearn.problem_transform import ClassifierChain, LabelPowerset
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from skmultilearn.cluster import NetworkXLabelGraphClusterer
from skmultilearn.cluster import LabelCooccurrenceGraphBuilder
from skmultilearn.ensemble import LabelSpacePartitioningClassifier

msss = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)


# linear is nice 0.47
# poly 0.37
# rbf 0.34
# sigmoid 0.43

#from skmultilearn.ext import Keras

from skmultilearn.adapt import MLkNN
from sklearn.model_selection import GridSearchCV


################ run the pipeline #####################

#### RF E LR
# Imports that used to be scattered through the loop body. All of them are kept
# (even those the active code does not use) because other cells may rely on
# these names being in scope.
import sys
import warnings

import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, make_scorer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import Normalizer, StandardScaler, RobustScaler, MinMaxScaler, MaxAbsScaler
from sklearn.tree import DecisionTreeClassifier
from skmultilearn.adapt import BRkNNaClassifier
from skmultilearn.cluster import FixedLabelSpaceClusterer
from skmultilearn.ensemble import MajorityVotingClassifier
from skmultilearn.problem_transform import BinaryRelevance, ClassifierChain
from xgboost import XGBClassifier

warnings.simplefilter(action='ignore', category=FutureWarning)

# Compatibility alias for code that still imports `sklearn.externals.joblib`
# (presumably the currently disabled skmultilearn.embedding import; verify before removing).
sys.modules['sklearn.externals.joblib'] = joblib

# Search grid for the base estimator wrapped by each problem-transformation method
parameters = {
    "classifier":[LogisticRegression(class_weight="balanced")],
    "classifier__penalty":['l1', 'l2'],
    "classifier__C": [0.1, 1, 10, 100],
    "classifier__solver": ['liblinear']
}

for train_index, test_index in msss.split(np.array(X), np.array(y)):
    ####### split the raw texts and the label matrix #########
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    X_train = np.array(X_train)
    y_train = np.array(y_train)

    # Document features: mean of the sentence embeddings produced by `embed`
    X_train = average_sentences(X_train)
    X_test = average_sentences(X_test)

    for method in [BinaryRelevance(), ClassifierChain(), LabelPowerset()]:
        grid_search = GridSearchCV(method, parameters, verbose=1, cv=5)
        grid_search.fit(X_train, y_train)

        print("Best score: %0.3f" % grid_search.best_score_)
        print("Best parameters set:")
        best_parameters = grid_search.best_estimator_.get_params()
        for param_name in sorted(parameters.keys()):
            print("\t%s: %r" % (param_name, best_parameters[param_name]))

        y_predict = grid_search.predict(X_test)

        # Each metric is computed once and reused in the report lines below
        hamming = hamming_loss(y_test, y_predict)
        jaccard = jaccard_score(y_test, y_predict, average='weighted')
        f1_per_label = f1_score(y_test, y_predict, average=None)
        f1_weighted = f1_score(y_test, y_predict, average="weighted")

        print("accuracy: ", accuracy_score(y_test, y_predict))
        print("hamming loss: ", hamming)
        print( round(hamming,3), round(jaccard,3))
        print( " + ".join([str(round(x,2)) for x in f1_per_label]), "(", round(f1_weighted,2), ")")
        print("jaccard score: ", jaccard)
        print(multilabel_confusion_matrix(y_test, y_predict))
        print(classification_report(y_test, y_predict))


In [None]:
X_train