# construction du modèle

## Importation des modules

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from joblib import dump
from utils import clean_text
from utils import evaluate_model
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

## Définition des fonctions

## Importation des données et brève exploration

In [2]:
data= pd.read_csv(r"QueryResults.csv")  
data.head()

Unnamed: 0,Title,Body,Tags,Id,Score,ViewCount,FavoriteCount,AnswerCount
0,How to convert Decimal to Double in C#?,<p>I want to assign the decimal variable &quot...,<c#><floating-point><type-conversion><double><...,4,807,77338,0.0,13
1,Calculate relative time in C#,<p>Given a specific <code>DateTime</code> valu...,<c#><datetime><time><datediff><relative-time-s...,11,1654,203176,0.0,42
2,Determine a user's timezone,<p>Is there a standard way for a web server to...,<html><browser><timezone><user-agent><timezone...,13,719,308410,0.0,27
3,What is the fastest way to get the value of π?,<p>I'm looking for the fastest way to obtain t...,<performance><algorithm><language-agnostic><un...,19,352,70669,0.0,23
4,How to use the C socket API in C++ on z/OS,<p>I'm having issues getting the C sockets API...,<c++><c><sockets><mainframe><zos>,25,176,16473,0.0,9


In [3]:
data.fillna('', inplace=True)

  data.fillna('', inplace=True)


## Nettoyage

In [4]:
data['Title'] = data['Title'].apply(clean_text)
data['Body'] = data['Body'].apply(clean_text)

  text = BeautifulSoup(text, "html.parser").get_text()


In [5]:
tags, nested_tags = [], []
# recuperer top k des tags importants
for i in range(len(data)):
  tags.append(data["Tags"].iloc[i].split('|'))
for tag in tags:
  for subtag in tag:
    if subtag !="":
      nested_tags.append(subtag)

df_tags =  pd.DataFrame(nested_tags).rename(columns={0:"tags"})
df_tags = df_tags["tags"].value_counts().reset_index().sort_values("count", ascending=False)
real_tags = df_tags.head(30)["tags"].tolist()

In [6]:
# transformer le contenu de tag en list de simple tag
data['filtered_tags'] = data['Tags'].str.split("|")

In [7]:
# strip les virgules au debut des tags
def strip_and_filter_commas(tag_list):
    return [tag.strip(',') for tag in tag_list if tag.strip(',')]
data['filtered_tags'] = data['filtered_tags'].apply(strip_and_filter_commas)

In [8]:
# Vérifier si chaque ligne de 'filtered_tags' contient au moins un élément de 'real_tags'
data['contains_real_tag'] = data['filtered_tags'].apply(lambda x: any(tag in x for tag in real_tags))

In [9]:
# garder les tags pertinents et remplacer par others si impertinents
allowed_tags = real_tags

def filter_tags(tag_list):
    if any(tag in allowed_tags for tag in tag_list):
        return [tag for tag in tag_list if tag in allowed_tags]
    else:
        return ["Others"]   

	
data['filtered_tags'] = data['filtered_tags'].apply(filter_tags)

## Encodage

In [10]:
mlb = MultiLabelBinarizer()
mlb.fit(data['filtered_tags'])
y = mlb.transform(data['filtered_tags'])

## Vectorisation

In [11]:
X = data[['Title', 'Body']].apply(lambda x: ' '.join(x), axis=1)
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(X)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Entrainement du modèle

In [13]:
model = LogisticRegression(random_state=42, max_iter = 2000)
pipeline = Pipeline([
      ('classifier', OneVsRestClassifier(model))
  ])
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X)

## Evaluation du modèle

In [14]:
evaluate_model(pipeline, X_test, y_test, mlb)

Accuracy: 0.9928
Precision: 0.9900838458713146
Recall: 0.9937
F1-score: 0.9918655315839647


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Enregistrement du modèle et des fonctions

In [15]:
dump(pipeline, 'model.classifier')
dump(vectorizer, 'model_vectorizer')
dump(mlb, 'mlb_encoder')

['mlb_encoder']

## Test

In [16]:
# Exemple pour tester avec de nouvelles données

new_data = pd.DataFrame({
    'Title': ['django update model field based another field'],
    'Body': ['new django python want something used often java consider following model relevant class class item model model name model charfield model decimalfield default def self return self name class saledetail model model item model foreignkey item deposit model foreignkey deposit quantity model positiveintegerfield model decimalfield sale model foreignkey sale def self value self none self value want time item added saledetail update saledetail item saledetail new set used java pojos include logic setter method tried using python property encapsulate item property django update field directly hood would break automatic functionallity also tried subclassing foreignkey accept callback function find way call method container class want provide default want include logic view logic since conceptually think logic model server side use case would update total sale detail sale would like calculate user decides save sale save signal would work thanks']})

new_data['Title'] = new_data['Title'].apply(lambda x: clean_text(x))
new_data['Body'] = new_data['Body'].apply(lambda x: clean_text(x))

new_data_combined = new_data['Title'] + ' ' + new_data['Body']

new_data_combined = new_data[['Title', 'Body']].apply(lambda x: ' '.join(x), axis=1)
# Vectorisation des nouvelles données
new_X = vectorizer.transform(new_data_combined)

# Prédictions pour les nouvelles données
result = pipeline.predict(new_X)
# Transformation des prédictions en tags originaux
result = mlb.inverse_transform(result)
result

[('Others',)]

In [17]:
result

[('Others',)]