In [55]:
import pandas as pd
import numpy as np

#Prétraitement
from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

# Fetch the NLTK resources this notebook relies on (stop-word list and WordNet)
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)

#Extraction de features
#Bag of words
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
#Word2Vec
from gensim.models import Word2Vec
#BERT
#from transformers import BertTokenizer, BertModel
#import torch
#USE
import tensorflow_hub as hub

# Suppress every Python warning for the remainder of the session
import warnings
warnings.filterwarnings(action='ignore')

# Remove pandas' display limits on both columns and rows
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

# CSS override so scrollable output areas take their full height (meant for plots)
from IPython.display import display, HTML
output_css = "<style>.output_scroll { height: auto !important; }</style>"
display(HTML(output_css))


# Load the question dataset and preview the first rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column, presumably an index
# that was written to the CSV -- consider index_col=0 if nothing relies on it.
DATA_PATH = "./filtered_questions.csv"
df = pd.read_csv(DATA_PATH, sep=',', encoding='utf-8')
df.head(5)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\maudt\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\maudt\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!





Unnamed: 0,title,body,link,view_count,score,tags
0,Why is processing a sorted array faster than p...,"<p>In this C++ code, sorting the data (<em>bef...",https://stackoverflow.com/questions/11227809/w...,1894410,27300,"java,c++,performance,cpu-architecture,branch-p..."
1,How do I delete a Git branch locally and remot...,<p>Failed Attempts to Delete a Remote Branch:<...,https://stackoverflow.com/questions/2003505/ho...,11766932,20374,"git,version-control,git-branch,git-push,git-re..."
2,What is the &#39;--&gt;&#39; operator in C/C++?,"<p>After reading <a href=""http://groups.google...",https://stackoverflow.com/questions/1642028/wh...,1018216,10170,"c++,c,operators,code-formatting,standards-comp..."
3,How do I force &quot;git pull&quot; to overwri...,<p>How do I force an overwrite of local files ...,https://stackoverflow.com/questions/1125968/ho...,8726770,9730,"git,version-control,overwrite,git-pull,git-fetch"
4,What and where are the stack and heap?,<ul>\n<li>What are the stack and heap?</li>\n<...,https://stackoverflow.com/questions/79923/what...,1950792,9450,"data-structures,memory-management,heap-memory,..."


## Prétraitement de texte

In [57]:
# Lazily filled cache: the stop-word set and the lemmatizer are identical for
# every call, so they are built once instead of once per row.
_CLEAN_TEXT_RESOURCES = {}


def _get_cleaning_resources():
    """Return the (stop-word set, lemmatizer) pair, creating it on first use."""
    if not _CLEAN_TEXT_RESOURCES:
        _CLEAN_TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _CLEAN_TEXT_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _CLEAN_TEXT_RESOURCES['stop_words'], _CLEAN_TEXT_RESOURCES['lemmatizer']


def clean_text(text):
    """Normalize a raw (possibly HTML) snippet into a string of lemmatized words.

    Processing steps, in order:
      1. strip HTML markup with BeautifulSoup's "html.parser";
      2. replace every character that is not an ASCII letter with a space
         (digits and symbols are dropped, so e.g. "C++" becomes "c");
      3. lowercase and split on whitespace;
      4. remove English stop words;
      5. lemmatize each remaining word with WordNetLemmatizer.

    Parameters
    ----------
    text : str
        Raw text to clean. Missing values (None or float NaN, as produced by
        pandas for empty CSV fields) are accepted and yield an empty string.

    Returns
    -------
    str
        The cleaned words joined by single spaces ('' when nothing remains).
    """
    # Missing values would otherwise make BeautifulSoup raise a TypeError
    if text is None or (isinstance(text, float) and text != text):
        return ''

    stop_words, lemmatizer = _get_cleaning_resources()

    text = BeautifulSoup(text, "html.parser").get_text()  # drop HTML tags
    text = re.sub(r'[^a-zA-Z]', ' ', text)  # keep ASCII letters only
    words = text.lower().split()
    return ' '.join(lemmatizer.lemmatize(w) for w in words if w not in stop_words)

# Clean the 'title' and 'body' columns
df['clean_title'] = df['title'].apply(clean_text)
df['clean_body'] = df['body'].apply(clean_text)


def _split_tags(value):
    """Return the tags as a list.

    Accepts the raw comma-separated string as well as an already-split value,
    so this cell can be re-run without raising AttributeError on lists
    (df['tags'] is overwritten in place below).
    """
    if isinstance(value, str):
        return value.split(',')
    return value


# Turn the comma-separated tag string into a list of tags
df['tags'] = df['tags'].apply(_split_tags)

# One-hot encode the tag lists (one column per distinct tag)
mlb = MultiLabelBinarizer()
df_tags = mlb.fit_transform(df['tags'])

# Hold out 20% of the questions for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    df[['clean_title', 'clean_body']],
    df_tags,
    test_size=0.2,
    random_state=42,
)

# Preview the cleaned titles
df['clean_title'].head()


0    processing sorted array faster processing unso...
1                   delete git branch locally remotely
2                                         operator c c
3                  force git pull overwrite local file
4                                           stack heap
Name: clean_title, dtype: object

In [59]:
# Preview the first five cleaned question bodies
df['clean_body'].head(n=5)

0    c code sorting data timed region make primary ...
1    failed attempt delete remote branch git branch...
2    reading hidden feature dark corner c stl comp ...
3    force overwrite local file git pull local repo...
4    stack heap located physically computer memory ...
Name: clean_body, dtype: object

## Extraction de Features

### Bag of Words

In [61]:
# Each question is represented by its cleaned title and body joined together
train_text = X_train['clean_title'] + " " + X_train['clean_body']
test_text = X_test['clean_title'] + " " + X_test['clean_body']

# Raw term counts.
# The vectorizer is fitted on the same title+body text that it transforms.
# Fitting on titles alone would restrict the vocabulary to title words and
# silently discard every word that only occurs in a body.
count_vect = CountVectorizer()
X_train_count = count_vect.fit_transform(train_text)
X_test_count = count_vect.transform(test_text)

# TF-IDF weights.
# Fitted on the title+body training text as well, so the IDF statistics are
# computed on the same kind of documents they are applied to.
tfidf_vect = TfidfVectorizer()
X_train_tfidf = tfidf_vect.fit_transform(train_text)
X_test_tfidf = tfidf_vect.transform(test_text)

### Word2Vec

In [63]:
# Build the Word2Vec corpus: one list of tokens per training question (title + body)
train_documents = X_train['clean_title'] + " " + X_train['clean_body']
sentences = [document.split() for document in train_documents]

# Train 100-dimensional embeddings on the training corpus only.
# NOTE(review): no seed is passed and several workers are used, so the learned
# vectors presumably differ between runs -- confirm whether that matters.
word2vec_model = Word2Vec(
    sentences,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

# Average the Word2Vec embeddings of a text
def get_w2v_features(text, model):
    """Return the mean embedding of the words of ``text`` known to ``model``.

    Parameters
    ----------
    text : str
        Whitespace-separated words.
    model
        Object exposing ``wv`` (supports ``word in wv`` and ``wv[word]``) and
        ``vector_size``.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors of the known words, or a zero vector
        of length ``model.vector_size`` when no word is known.
    """
    known_vectors = []
    for token in text.split():
        if token in model.wv:
            known_vectors.append(model.wv[token])

    if not known_vectors:
        return np.zeros(model.vector_size)
    return np.mean(known_vectors, axis=0)

# Embed every training and test question (title + body) with the trained model
train_documents = X_train['clean_title'] + " " + X_train['clean_body']
test_documents = X_test['clean_title'] + " " + X_test['clean_body']

X_train_w2v = np.array(
    [get_w2v_features(document, word2vec_model) for document in train_documents]
)
X_test_w2v = np.array(
    [get_w2v_features(document, word2vec_model) for document in test_documents]
)

### BERT

In [None]:
# The transformers/torch imports in the first cell are commented out, so the
# names used by the BERT cells are imported here; without this the cell fails
# with NameError on a fresh kernel.
import torch
from transformers import BertTokenizer, BertModel

# Load the pretrained BERT tokenizer and encoder
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

# Mean-pooled BERT embedding of a text
def get_bert_features(text, tokenizer, model):
    """Encode ``text`` and return the mean of the model's last hidden states.

    Parameters
    ----------
    text : str
        Text to embed.
    tokenizer
        Callable producing the model inputs; called with
        ``return_tensors='pt'``, ``truncation=True`` and ``padding=True``.
    model
        Callable taking the tokenizer output as keyword arguments and
        returning an object with a ``last_hidden_state`` tensor.

    Returns
    -------
    numpy.ndarray
        ``last_hidden_state`` averaged over dimension 1, converted to NumPy
        (presumably shape (1, hidden_size) for a single text -- TODO confirm).
    """
    encoded = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    # Inference only: no gradients are needed for feature extraction
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    pooled = hidden_states.mean(dim=1)
    return pooled.numpy()

# Embed every training and test question (title + body) with BERT, one text at a time
train_documents = X_train['clean_title'] + " " + X_train['clean_body']
test_documents = X_test['clean_title'] + " " + X_test['clean_body']

X_train_bert = np.array(
    [get_bert_features(document, tokenizer, bert_model).flatten() for document in train_documents]
)
X_test_bert = np.array(
    [get_bert_features(document, tokenizer, bert_model).flatten() for document in test_documents]
)

### USE

In [65]:
# Load the Universal Sentence Encoder from TensorFlow Hub
USE_MODEL_URL = "https://tfhub.dev/google/universal-sentence-encoder/4"
use_model = hub.load(USE_MODEL_URL)

# Sentence embedding of a single text
def get_use_features(text, model):
    """Embed ``text`` with ``model`` and return the result as a flat array.

    Parameters
    ----------
    text : str
        Text to embed; it is passed to the model as a one-element list.
    model
        Callable accepting a list of texts and returning an object with a
        ``numpy()`` method.

    Returns
    -------
    numpy.ndarray
        The model output converted with ``numpy()`` and flattened to 1-D.
    """
    single_item_batch = [text]
    embeddings = model(single_item_batch)
    return embeddings.numpy().flatten()

# Embed every training and test question (title + body) with USE, one text at a time
train_documents = X_train['clean_title'] + " " + X_train['clean_body']
test_documents = X_test['clean_title'] + " " + X_test['clean_body']

X_train_use = np.array(
    [get_use_features(document, use_model) for document in train_documents]
)
X_test_use = np.array(
    [get_use_features(document, use_model) for document in test_documents]
)











