In [1]:
import re
import pandas as pd
import numpy as np
#NLP libraries
import sklearn
# sklearn packages
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.externals import joblib
from sklearn.metrics import confusion_matrix,classification_report,roc_curve
from sklearn.metrics import auc,f1_score,roc_auc_score,precision_recall_curve,precision_score
# xgboost packages
import xgboost as xgb
from xgboost import XGBClassifier

# nltk packages
import nltk
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords
from string import punctuation
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('wordnet')
import french_lefff_lemmatizer
from french_lefff_lemmatizer.french_lefff_lemmatizer import FrenchLefffLemmatizer


ImportError: cannot import name 'joblib' from 'sklearn.externals' (C:\Users\marco\anaconda3\lib\site-packages\sklearn\externals\__init__.py)

In [2]:
# Getting all the required file for cleaning and predictions
rep_file = "Input/df_replacement_fr.csv"
channel_file = "Input/Channel_selected_fr.csv"
stop_file = "Input/df_stopwords_fr.csv"
# Getting the wanted word for dictionary
replacement = pd.read_csv(rep_file, encoding ='latin-1' ).drop(["old","new"],axis =1)
channel = pd.read_csv(channel_file, encoding ='latin-1' ).drop(["Channel_selected","ChannelName","ChannelName_clean"], axis=1)
stopwords_list = pd.read_csv(stop_file, encoding ='latin-1' ).drop("type", axis = 1).rename(columns={"translation":"old"})
frames = [replacement, channel]
result = pd.concat(frames)
result = result.dropna()
dico = dict(zip(result.trans_old, result.trans_new))
stop_words1 =  stopwords.words('french') + ['projet', "dévelopement","programme", 'pays',"sustainable","gouvernement",'national','international','plus',
              'durable','sdg'] + list(stopwords_list["old"].values)

In [3]:

# lemmatizer + tokenizer (+ stemming) class
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = FrenchLefffLemmatizer()
        # we define (but not use) a stemming method, uncomment the last line in __call__ to get stemming tooo
        self.stemmer = nltk.stem.SnowballStemmer('french') 
    def __call__(self, doc):
        # pattern for numbers | words of length=2 | punctuations | words of length=1
        pattern = re.compile(r'[0-9]+|\b[\w]{2,2}\b|[%.,_`!"&?\')({~@;:#}+-]+|\b[\w]{1,1}\b')
        # tokenize document
        doc_tok = word_tokenize(doc)
        #filter out patterns from words
        doc_tok = [x for x in doc_tok if x not in stop_words1]
        doc_tok = [pattern.sub('', x) for x in doc_tok]
        # get rid of anything with length=1
        doc_tok = [x for x in doc_tok if len(x) > 1]
        doc = [self.wnl.lemmatize(t) for t in doc_tok]
        # uncomment if you want stemming as well
        doc = [self.stemmer.stem(x) for x in doc]
        return doc

In [4]:
def old_new_replace(s, dico):
    """
   Replaces word in a string to the new wanted word in the dictionary
    INPUT: -s (string): string we want to modify the given word
           -dico (dict): list of word and their replace new word.
    OUTPUT: - (string): 
    """
    new_s = s
    for old, new in dico.items():
        new_s = new_s.replace(old, new)
    return new_s

In [5]:
#Getting the final trainset, 
train_file = "Input/final_trainset.csv"

train = pd.read_csv("Input/final_trainset.csv")

In [6]:
# CountVectorizer instantiation with stop words, setting to count words of 1-grams and do not filter words based on their frequency
count_vec = CountVectorizer(lowercase=True,min_df=1,analyzer='word',tokenizer = LemmaTokenizer(), ngram_range=(1,2))#tokenizer = LemmaTokenizer()
vect_word = TfidfVectorizer(lowercase=True,min_df=1,analyzer='word',tokenizer = LemmaTokenizer(),ngram_range=(1,2))#tokenizer = LemmaTokenizer()
fit = train["new text"]
feat = vect_word.fit_transform(fit.values)
feat_name = vect_word.get_feature_names()

In [7]:
def split(train, target_col = ["SDG{}".format(i) for i in range(1,18)]):
    """
    Perform train test split on given data-set
    INPUT: -train (dataframe): Training data we want to split
           -target_col(list): Columns name that we want to predict results.
    OUTPUT:  -  (dataframe): splitting of dataframe to validate model.
    """
    train_pdf = train[:15798]
    train_crs = train[15798:]
    train_crs = train_crs.sample(frac=1)
    N = len(train_crs)
    X_train_crs = train_crs["new text"][0:int(N*0.9)]
    y_train_crs = train_crs[target_col][0:int(N*0.9)]
    X_test = train_crs["new text"][int(N*0.9):]
    y_test = train_crs[target_col][int(N*0.9):]
    X_train_pdf = train_pdf["new text"]
    y_train_pdf = train_pdf[target_col]
    frames_X = [X_train_pdf,X_train_crs]
    frames_y = [y_train_pdf,y_train_crs]
    X_train = pd.concat(frames_X)
    y_train = pd.concat(frames_y)
    return X_train,X_test,y_train,y_test

In [8]:
def get_tres(precision, recall, threshold,col):
    """
    Find the best treshold that maximizes the F1-score for class 1
    """    
    thres = list(threshold)
    thres.append(1)
    F1 = 2 * (precision * recall) / (precision + recall)
    F1 = list(F1)
    ind = F1.index(max(F1))
    if thres[ind]< 0.1:
        return 0.1
    else:
        return round(thres[ind],2)

In [9]:
def separate_prediction_xgb(train,target_col = ["SDG{}".format(i) for i in range(1,18)]):
    """
    Perform 17 binary logistic predictions to see if each entries is in one or many classes.
    INPUT: -train (dataframe): Training data we want to split
           -target_col(list): Columns name that we want to predict results.
    OUTPUT:  -  fits(joblib): Export the fits as a joblib file
                treshold(dataframe): Export the chosen treshold as a csv file
    """
    eta_par = 0.1
    #max_step = 1
    nrounds_par = 5 / eta_par
    X_train,X_test,y_train,y_test = split(train)
    mat = vect_word.transform(X_train.values)
    mat_test = vect_word.transform(X_test.values)
    prd = pd.DataFrame()
    prd_proba = pd.DataFrame()
    prd["Index"] = X_test.index.values
    prd_proba['Index'] = X_test.index.values
    treshold = []
    cv_score = []
    for i,col in enumerate(target_col):
        # parameters to be tries in the grid search
        #cv_params = {'eta':[0.1,0.2,0.3]}  
        if i in [16]:
            eta_par = 0.2
            max_step = 2
        fix_params = {'random_state':42, 'seed':2, 'max_depth':17, 'verbosity':1,'eta': eta_par,
                    'nrounds':nrounds_par, 'objective':"binary:logistic","max_delta_step" :0}
        
        clf = XGBClassifier(**fix_params)
        clf.fit(mat, y_train[col])
        #clf_test.fit(mat,y_train[col])
        prd_proba[col] = clf.predict_proba(mat_test)[:,1]
        joblib.dump(clf,open('Saved_Fit/Model_Fit_{}.joblib'.format(col), "wb"))
        model_probs = clf.predict_proba(mat_test)[:,1]
        # calculate roc auc
        roc_auc = roc_auc_score(y_test[col], model_probs,)
        print('XGBoost ROC AUC %.3f' % roc_auc)
        # calculate the precision-recall auc
        precision, recall, threshold = precision_recall_curve(y_test[col], model_probs)
        tres = get_tres(precision, recall, threshold,col)
        auc_score = auc(recall, precision)
        print('XGBoost PR AUC: %.3f' % auc_score)
        prd[col] = 0
        for j in range(len(prd_proba[col])):
            if prd_proba[col][j] > tres:
                prd[col][j]  = 1
            else:
                prd[col][j] = 0
        cv_score.append(metrics.accuracy_score(y_test[col],prd[col]))
        print(cv_score)
        print('\nConfusion matrix\n',confusion_matrix(y_test[col],prd[col]))
        print(classification_report(y_test[col],prd[col]))
        treshold.append(tres)
    treshold_frame = pd.DataFrame(treshold).rename(columns={0:"Tresholds"})
    treshold_frame.to_csv("Saved_Fit/Treshold.csv")

In [10]:
separate_prediction_xgb(train)

XGBoost ROC AUC 0.969
XGBoost PR AUC: 0.779
[0.9695493300852619]

Confusion matrix
 [[763  13]
 [ 12  33]]
              precision    recall  f1-score   support

         0.0       0.98      0.98      0.98       776
         1.0       0.72      0.73      0.73        45

    accuracy                           0.97       821
   macro avg       0.85      0.86      0.85       821
weighted avg       0.97      0.97      0.97       821

XGBoost ROC AUC 0.950
XGBoost PR AUC: 0.855
[0.9695493300852619, 0.9610231425091352]

Confusion matrix
 [[717  11]
 [ 21  72]]
              precision    recall  f1-score   support

         0.0       0.97      0.98      0.98       728
         1.0       0.87      0.77      0.82        93

    accuracy                           0.96       821
   macro avg       0.92      0.88      0.90       821
weighted avg       0.96      0.96      0.96       821

XGBoost ROC AUC 0.978
XGBoost PR AUC: 0.897
[0.9695493300852619, 0.9610231425091352, 0.9695493300852619]

Confus

In [11]:
def separate_prediction(train,target_col = ["SDG{}".format(i) for i in range(1,18)]):
    """
    Perform 17 binary logistic predictions to see if each entries is in one or many classes.
    INPUT: -train (dataframe): Training data we want to split
           -target_col(list): Columns name that we want to predict results.
    OUTPUT:  -  fits(joblib): Export the fits as a joblib file
                treshold(dataframe): Export the chosen treshold as a csv file
    """
    eta_par = 0.1
    #max_step = 1
    nrounds_par = 5 / eta_par
    X_train,X_test,y_train,y_test = split(train)
    mat = vect_word.transform(X_train.values)
    mat_test = vect_word.transform(X_test.values)
    prd = pd.DataFrame()
    prd_proba = pd.DataFrame()
    prd["Index"] = X_test.index.values
    prd_proba['Index'] = X_test.index.values
    treshold = []
    cv_score = []
    for i,col in enumerate(target_col):
        # parameters to be tries in the grid search
        #cv_params = {'eta':[0.1,0.2,0.3]}  
        if i in [16]:
            eta_par = 0.2
            max_step = 2
        fix_params = {'random_state':42, 'seed':2, 'max_depth':17, 'verbosity':1,'eta': eta_par,
                    'nrounds':nrounds_par, 'objective':"binary:logistic","max_delta_step" :0}
        
        clf = XGBClassifier(**fix_params)
        clf.fit(mat, y_train[col])
        #clf_test.fit(mat,y_train[col])
        prd_proba[col] = clf.predict_proba(mat_test)[:,1]
        joblib.dump(clf,open('Saved_Fit/Model_Fit_{}.joblib'.format(col), "wb"))
        model_probs = clf.predict_proba(mat_test)[:,1]
        # calculate roc auc
        roc_auc = roc_auc_score(y_test[col], model_probs,)
        print('XGBoost ROC AUC %.3f' % roc_auc)
        # calculate the precision-recall auc
        precision, recall, threshold = precision_recall_curve(y_test[col], model_probs)
        tres = get_tres(precision, recall, threshold,col)
        auc_score = auc(recall, precision)
        print('XGBoost PR AUC: %.3f' % auc_score)
        prd[col] = 0
        for j in range(len(prd_proba[col])):
            if prd_proba[col][j] > 0.5:
                prd[col][j]  = 1
            else:
                prd[col][j] = 0
        cv_score.append(metrics.accuracy_score(y_test[col],prd[col]))
        print(cv_score)
        print('\nConfusion matrix\n',confusion_matrix(y_test[col],prd[col]))
        print(classification_report(y_test[col],prd[col]))
        treshold.append(tres)
    treshold_frame = pd.DataFrame(treshold).rename(columns={0:"Tresholds"})
    treshold_frame.to_csv("Saved_Fit/Treshold.csv")

In [12]:
separate_prediction(train)

XGBoost ROC AUC 0.958
XGBoost PR AUC: 0.704
[0.97442143727162]

Confusion matrix
 [[784   3]
 [ 18  16]]
              precision    recall  f1-score   support

         0.0       0.98      1.00      0.99       787
         1.0       0.84      0.47      0.60        34

    accuracy                           0.97       821
   macro avg       0.91      0.73      0.80       821
weighted avg       0.97      0.97      0.97       821

XGBoost ROC AUC 0.951
XGBoost PR AUC: 0.857
[0.97442143727162, 0.9500609013398295]

Confusion matrix
 [[718  10]
 [ 31  62]]
              precision    recall  f1-score   support

         0.0       0.96      0.99      0.97       728
         1.0       0.86      0.67      0.75        93

    accuracy                           0.95       821
   macro avg       0.91      0.83      0.86       821
weighted avg       0.95      0.95      0.95       821

XGBoost ROC AUC 0.960
XGBoost PR AUC: 0.853
[0.97442143727162, 0.9500609013398295, 0.951278928136419]

Confusion mat