# Ceci est un notebook qui a servi de test avant la création du fichier app.py

In [65]:
#importation des packages de base :

import pandas as pd
import numpy as np
import streamlit as st


#import package nécessaire au prétraitement de texte :

from bs4 import BeautifulSoup

import nltk 
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize, wordpunct_tokenize, RegexpTokenizer
from nltk.corpus import words, stopwords

#import des packages pour la prédiction :

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier

#import modeles : 
from sklearn.linear_model import  SGDClassifier
#import bag of words : 
from sklearn.feature_extraction.text import CountVectorizer 


#import pour charger fichier : 
import pickle

#import package mise en page : 
from PIL import Image

# Organisation du notebook : 

### 1. Déclaration des fonctions nécessaires à notre modèle de prédiction
### 2. Mise en page de l'API avec le module Streamlit
### 3. Zone de test pour vérifier le bon fonctionnement de l'application

Seules les parties 1. et 2. seront transférées dans le fichier app.py.

In [66]:
#### Loading the pickled artefacts used by the functions below:
# NOTE: unpickling can execute arbitrary code; only load files that were
# produced by this project's own training step.
# Each file is opened in a `with` block so its handle is closed as soon as
# the object has been loaded.

# Tokens that preprocessing() keeps as-is (never lemmatized, never dropped
# by the dictionary filter).
with open("toptag.pkl", "rb") as fh:
    toptag = pickle.load(fh)

# Vocabulary that preprocessing() uses to decide which tokens to keep.
with open("eng_words", "rb") as fh:
    eng_words = pickle.load(fh)

# Fitted classifier; applying() calls its predict().
with open("sgd", "rb") as fh:
    sgd = pickle.load(fh)

# Fitted label binarizer; applying() calls its inverse_transform().
with open("mlb", "rb") as fh:
    mlb = pickle.load(fh)

# Fitted vectorizer; BoW() calls its transform() and get_feature_names_out().
with open("vect", "rb") as fh:
    vect = pickle.load(fh)

# 1. Déclaration des fonctions :

In [98]:
###### Fonctions prétraitement de texte  : 

########### Fonction 1 : ###########
####################################

#fonction suppression des balises html : 

def clean_balise(text):
    """Return the text content of *text* with its HTML markup removed.

    The input is parsed with BeautifulSoup's ``html.parser`` backend and the
    result of ``get_text()`` is returned.
    """
    return BeautifulSoup(text, 'html.parser').get_text()
    
########### Fonction 2 : ###########
####################################


def preprocessing(txt, list_rare_words = None,
                  format_txt=False):
    """Tokenize and normalise a document.

    Steps, in order:
      1. split *txt* into runs of word characters (this drops punctuation);
      2. lowercase every token;
      3. drop purely numeric tokens;
      4. drop NLTK English stopwords;
      5. drop the tokens listed in *list_rare_words*, if provided;
      6. lemmatize every token that is not in the module-level ``toptag``;
      7. keep only tokens found in ``eng_words`` or in ``toptag``.

    Args:
        txt: the document to process, as a ``str``.
        list_rare_words: optional collection of tokens to remove.
        format_txt: if True, return the tokens joined by single spaces;
            otherwise return them as a list.

    Returns:
        A list of tokens, or a single space-separated string when
        *format_txt* is True.
    """
    # The regexp tokenizer both tokenizes and removes punctuation, so no
    # separate tokenization pass is needed beforehand.
    # NOTE(review): because tokens are runs of word characters only, a token
    # can never contain punctuation (e.g. "c++" yields "c") -- confirm this
    # is intended with respect to the entries of `toptag`.
    raw_tokens = RegexpTokenizer(r"\w+").tokenize(txt)

    # A set gives constant-time stopword lookups instead of scanning the
    # stopword list once per token.
    stopw = set(nltk.corpus.stopwords.words("english"))

    # Lowercase, then drop numeric tokens and stopwords.
    tokens = [
        w
        for w in (t.lower() for t in raw_tokens)
        if not w.isnumeric() and w not in stopw
    ]

    # Drop the caller-supplied tokens, if any.
    if list_rare_words:
        tokens = [w for w in tokens if w not in list_rare_words]

    # Lemmatize everything except the tokens listed in toptag, which are
    # kept verbatim.
    lemm = WordNetLemmatizer()
    lemmatized = [w if w in toptag else lemm.lemmatize(w) for w in tokens]

    # Keep only tokens that belong to eng_words or to toptag.
    tokens = [w for w in lemmatized if w in eng_words or w in toptag]

    if format_txt:
        tokens = " ".join(tokens)
    return tokens


########### Fonction 3 : ###########
####################################


#fonction d'application de notre prétraitement de texte :
def cleaning(doc):
    """Run ``preprocessing`` on *doc* and return the result as one string.

    No rare-word list is supplied and ``format_txt`` is set to True, so the
    remaining tokens come back joined by single spaces.
    """
    return preprocessing(doc, list_rare_words=None, format_txt=True)

In [90]:
def BoW(text):
    """Encode *text* with the module-level vectorizer as a DataFrame.

    *text* must be a single string. It is wrapped in a one-element list
    before being handed to ``vect.transform``, and the result is converted
    to a dense array whose columns are labelled with
    ``vect.get_feature_names_out()``.
    """
    encoded = vect.transform([text])
    dense = encoded.toarray()
    feature_names = vect.get_feature_names_out()
    return pd.DataFrame(dense, columns=feature_names)

In [114]:
### Fonction qui, à partir du texte rentré par l'utilisateur, va retourner une prédiction de tag :

def applying(text):
    """Predict tags for the raw text entered by the user.

    Pipeline: ``clean_balise`` (strip HTML) -> ``cleaning`` (text
    preprocessing, returns a string) -> ``BoW`` (features for the model)
    -> ``sgd.predict`` -> ``mlb.inverse_transform`` (binarized target back
    to readable tags).

    Returns the value produced by ``mlb.inverse_transform``.
    """
    features = BoW(cleaning(clean_balise(text)))
    binarized = sgd.predict(features)
    return mlb.inverse_transform(binarized)

In [94]:
cleaning("i am an engineering professional who like use python and java")

'engineering professional like use python java'

# 2. Mise en page

In [None]:
# Page title:
st.title("Keyword prediction tool Stackoverflow ") 


# User inputs:
# NOTE(review): Title_input is collected but never passed to the model --
# confirm whether the title should be part of the prediction input.
Title_input = st.text_input("Write the title of your request below")
input_body_utilisateurs = st.text_input("Enter the content of your request below ")

# Model answer:
# Only run the model once the user has actually typed something, so the page
# does not show a prediction for an empty request on first load.
reponse = applying(input_body_utilisateurs) if input_body_utilisateurs.strip() else None
if reponse is not None:
    st.text(reponse)

### Adding an image:
# The logo was opened but never rendered; st.image puts it on the page.
image = Image.open('logo.png')
st.image(image)



 ## TEST ZONE

##### Testons d'abord les fonctions de preprocessing et de bag of words :


In [99]:
text_prep = cleaning("i am an engineering professional who like use python and java")
text_prep
#ok pour preprocessing

'engineering professional like use python java'

In [100]:
BoW(text_prep)
#ok pour la transformation en BoW

Unnamed: 0,ability,able,abort,aborted,absolute,absolutely,abstract,abstraction,abuse,accept,...,yield,york,young,youtube,zed,zero,zip,zombie,zone,zoom
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


##### Testons maintenant notre fonction applying qui, à partir du BoW créé, doit nous renvoyer une prédiction de tags

### Testons sur différentes phrases :

In [101]:
test = "I use to like coding with python, java or c++"
test2 = 'This is a test. I can program in c++, java and R'
test3 = "expert python could someone explain problem like collect supervisor http made different script print output bash php collect output without problem python work sh bin php php sleep echo test sh bin bash sleep echo item done sh test print output bin import time import import range time sleep print test write test print test supervisor file bash program command home user sh home user log php program command home user sh home user log python program command home user sh home user log thank much help driving crazy"
test4 = 'question whether use guarantee visibility field respect synchronized example following class field need declared volatile synchronized used class private double public synchronized void method double temp temp temp example using however volatile field necessary class b private final lock new private volatile double public void method lock lock try double temp temp temp finally lock know using volatile anyway likely impose performance would still like code correctly'
# Stored example loaded from disk; a later cell decodes it with
# mlb.inverse_transform. The `with` block closes the file handle after loading.
with open("ex_prediction", "rb") as fh:
    test5 = pickle.load(fh)
test6 = "new webpack struggling convert css minify file structure public css j j css css css map main css main css map main j j j like dev webpack mode development public j j output public j j build webpack mode production public j j output public j j found thing like cant make work webpack j module export module rule test use option minimize true dont know path included file path output advice"

In [102]:
test

'I use to like coding with python, java or c++'

In [103]:
applying(test)

[()]

In [104]:
test2

'This is a test. I can program in c++, java and R'

In [105]:
applying(test2)

[()]

In [106]:
test3

'expert python could someone explain problem like collect supervisor http made different script print output bash php collect output without problem python work sh bin php php sleep echo test sh bin bash sleep echo item done sh test print output bin import time import import range time sleep print test write test print test supervisor file bash program command home user sh home user log php program command home user sh home user log python program command home user sh home user log thank much help driving crazy'

In [107]:
applying(test3)

[('python',)]

In [108]:
test4

'question whether use guarantee visibility field respect synchronized example following class field need declared volatile synchronized used class private double public synchronized void method double temp temp temp example using however volatile field necessary class b private final lock new private volatile double public void method lock lock try double temp temp temp finally lock know using volatile anyway likely impose performance would still like code correctly'

In [109]:
applying(test4)

[()]

In [79]:
#test 5 correspond a la valeur sgd
mlb.inverse_transform(test5)

[('  c++',
  '  css',
  '  fonts',
  '  java-native-interface',
  '  loops',
  '  parallel-processing',
  '  sass',
  '  webpack',
  'javascript')]

In [112]:
test6 = "new webpack struggling convert css minify file structure public css j j css css css map main css main css map main j j j like dev webpack mode development public j j output public j j build webpack mode production public j j output public j j found thing like cant make work webpack j module export module rule test use option minimize true dont know path included file path output advice"

In [113]:
applying(test6)

[('  c++',
  '  css',
  '  fonts',
  '  java-native-interface',
  '  loops',
  '  parallel-processing',
  '  sass',
  '  webpack',
  'javascript')]