# Préparer le modèle

In [1]:
# Libraries
from pandas import DataFrame, Series
from bs4 import BeautifulSoup 
import re
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import joblib
import numpy as np
import pandas as pd

# Functions

def post_to_words(raw_post):
    """Convert a raw HTML post into a string of cleaned, lemmatized words.

    Steps: drop the contents of ``<code>`` elements, strip the remaining
    HTML markup, keep only ASCII letters plus ``#`` and ``+``, lower-case,
    remove the NLTK English stop words and lemmatize with WordNet.

    Inputs :

    - raw_post : a single string (may contain HTML)

    Output :

    - a single string containing the preprocessed document, words
      separated by one space
    """

    # 1. Remove code
    # Splitting on 'code>' cuts at both '<code>' and '</code>', so for
    # well-formed markup the odd-numbered pieces are the code contents;
    # only the even-numbered pieces are kept.
    string_clean = " ".join(raw_post.split('code>')[::2])

    # 2. Remove HTML
    # The parser is named explicitly: without it BeautifulSoup picks
    # whichever parser happens to be installed (and warns about it), so
    # the extracted text could differ from one machine to another.
    post_text = BeautifulSoup(string_clean, "html.parser").get_text()

    # 3. Remove non-letters
    # '#' and '+' survive, presumably so that tokens such as 'c#' or
    # 'c++' stay recognisable -- confirm against the tag list.
    letters_only = re.sub(r"[^#+a-zA-Z]", " ", post_text)

    # 4. Convert to lower case, split into individual words
    words = letters_only.lower().split()

    # 5. Searching a set is much faster than searching a list,
    #    so convert the stop words to a set
    stops = set(stopwords.words("english"))

    # 6. Remove stop words
    meaningful_words = [w for w in words if w not in stops]

    # 7. Lemmatize
    lemmatizer = WordNetLemmatizer()
    lems = [lemmatizer.lemmatize(word) for word in meaningful_words]

    # 8. Join the words back into one string separated by space
    return " ".join(lems)


def my_tokenizer(doc):
    """Whitespace tokenizer handed to the scikit-learn vectorizers.

    Splitting on whitespace only (instead of the vectorizer's default
    token pattern) keeps hyphenated bigrams and tag-like tokens in one
    piece.

    Input:

    - doc : string to be tokenized

    Output:

    - list of the whitespace-separated tokens of ``doc``
    """
    return doc.split()

# Extra stop words handed to the vectorizers through ``stop_words=``.
# Each entry appears once ('using', 'get' and 'like' used to be listed
# twice); the original order is otherwise preserved.
# NOTE(review): 'react' is listed here although it is also a framework
# name -- confirm that dropping it is intended.
stop_words = [
    "i'm", 'would', '1', '0', '2', "i've", 'could', 'anyone', 'also', '3',
    'thanks', 'two', 'however', "i'd", '5', "+", "#", "im", "ive", "dont",
    "cant", "id", ")", "(", 'code', 'using', 'file', 'error', 'get', 'like',
    'want', 'use', 'work', 'one', 'trying', 'need', 'way', 'tried',
    'problem', 'following', 'run', 'example', 'help', 'new', 'know',
    'working', 'make', 'create', 'first', 'issue', 'find', 'see',
    'different', 'show', 'return', 'test', 'question', 'getting',
    'something', 'try', 'able', 'e', 'another', 'used', 'without', 'look',
    'please', 'possible', 'x', 'found', 'fine', 'created', 'case',
    'would-like', 'still', 'inside', 'wrong', 'right', 'give', 'seems',
    'cannot', 'idea', 'instead', 'sure', 'b', 'every', 'react', 'based',
    'simple', 'got', 'v', 'already', 'look-like', 'many', 'called', 'say',
    'correct', 'main', 'specific', 'understand', 'added', 'since',
    'currently', 'back', 'current',
]

# TF-IDF vectorizer for the cleaned post text. Tokens come from
# my_tokenizer (whitespace split) and no lower-casing is done here
# (lowercase=False); entries of stop_words are dropped from the vocabulary.
tf_idf_final = TfidfVectorizer(
    analyzer="word",
    tokenizer=my_tokenizer,
    preprocessor=None,
    strip_accents=None,
    max_features=None,
    lowercase=False,
    stop_words=stop_words,
)

top_tags = np.load('Data/top_tags.npy')

# Hash-based copy of ``top_tags`` used for membership tests: ``word in
# ndarray`` compares against every element on each call, a set lookup
# does not. Assumes the array holds plain tag strings -- TODO confirm.
_top_tags_lookup = frozenset(top_tags.ravel().tolist())


def reduce_tags(y):
    """Keep only the tags of ``y`` that belong to ``top_tags``.

    Inputs :

    - y : a single string of whitespace-separated tags

    Output :

    - a single string with the retained tags, in their original order,
      separated by one space (empty string when none is retained)
    """
    return " ".join(word for word in y.split() if word in _top_tags_lookup)

def cleaning_target(raw_target):
    """Turn a tag string such as '<tag1><tag2>' into 'tag1 tag2'.

    The first and the last character are dropped unconditionally (they
    are expected to be the outer '<' and '>'), then every inner '><'
    separator becomes a single space.

    Arguments :
    - raw_target : a single string of tags

    Return :
    - string with the tags separated by spaces

    """
    # Strip the outer '<' and '>' in one slice
    inner = raw_target[1:-1]

    # Each '><' marks the boundary between two tags
    return inner.replace('><', ' ')

# Bag-of-words vectorizer for the tag strings: one column per tag,
# tokens obtained by whitespace split (my_tokenizer), no lower-casing.
# NOTE(review): the post stop-word list is applied to the tags as well,
# so a tag spelled like one of its entries (e.g. 'file', 'react') is
# dropped from the tag vocabulary -- confirm this is intended.
tag_vectorizer = CountVectorizer(
    analyzer="word",
    tokenizer=my_tokenizer,
    preprocessor=None,
    strip_accents=None,
    max_features=None,
    lowercase=False,
    stop_words=stop_words,
)

# Previously trained model. joblib.load unpickles the file, so only load
# files from a trusted source.
# NOTE(review): the extension is 'plk' (presumably 'pkl' was meant); the
# path has to match the file on disk, so it is left untouched.
model_final = joblib.load('model_final.plk')


# X train cleaning & formatting
X_train = pd.read_csv('Data/X_train.csv', sep='\t')
X_train['body_clean'] = X_train['body'].apply(post_to_words)
X_train['title_clean'] = X_train['title'].apply(post_to_words)
# The cleaned title is repeated three times ahead of the body so that
# its words get a higher term frequency than the body's.
X_train['post_w'] = (X_train['title_clean'] + " ") * 3 + " " + X_train['body_clean']
# Fitting here fixes the vocabulary reused when vectorizing new questions
X_train_final = tf_idf_final.fit_transform(X_train['post_w'])
X_train_final = X_train_final.toarray()

# y train cleaning and formatting
y_train = pd.read_csv('Data/y_train.csv', sep='\t', header=None)
y_train = y_train[1]  # the tags are read from the column labelled 1
y_train_clean = y_train.apply(cleaning_target)
y_train_top_tags = y_train_clean.apply(reduce_tags)
y_train_final = tag_vectorizer.fit_transform(y_train_top_tags)
y_train_final = y_train_final.toarray()
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in
# 1.2; prefer its replacement and fall back only on older releases.
if hasattr(tag_vectorizer, "get_feature_names_out"):
    y_train_final_vocab = list(tag_vectorizer.get_feature_names_out())
else:
    y_train_final_vocab = tag_vectorizer.get_feature_names()

# Saisir le texte

In [2]:
# User's title input: prompt on standard input and keep the answer as the
# title of the question to tag
title = input("Title: ")

Title: Image classification using CNN


In [3]:
# User's body input: prompt on standard input and keep the answer as the
# body of the question to tag
body = input("Body: ")

Body: How could I implement CNN with Keras ?


# Retourner les tags prédits

In [4]:
# Cleaning: same preprocessing and title weighting (title repeated three
# times) as for the training posts
body_clean = post_to_words(body)
title_clean = post_to_words(title)
post_w = Series((title_clean + " ") * 3 + " " + body_clean)

# Formatting: vectorize with the TF-IDF vocabulary fitted on the training set
new_question_final = tf_idf_final.transform(post_w).toarray()

# Predict the tag indicators of the new question
predicted_tags_new_quest = model_final.predict(new_question_final)

# Print the tags: position i of the prediction row corresponds to entry i
# of the tag vocabulary
for word, freq in zip(y_train_final_vocab, predicted_tags_new_quest[0]):
    if freq > 0:
        print(word)

keras
