In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

from tqdm import tqdm
import os

In [7]:
def regression_metrics(true_labels, predicted_labels, model_name):
    mse = mean_squared_error(true_labels, predicted_labels)
    mae = mean_absolute_error(true_labels, predicted_labels)
    rmse = np.sqrt(mse)
    r2 = r2_score(true_labels, predicted_labels)

    print(f"Results for {model_name}:")
    print(f"Mean Absolute Error (MAE): {mae:.4f}")
    print(f"Mean Squared Error (MSE): {mse:.4f}")
    print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
    print(f"R^2 Score: {r2:.4f}")

In [8]:
dataset = pd.read_csv("./dataset.csv")
dataset_filtered = dataset.dropna(subset=['Chiffrage TMA JH'])

dataset_filtered.head()

Unnamed: 0,Clé,Résumé,Projet,Responsable,État,Versions corrigées,Epic Link,Lot de commande,Créateur,Type de ticket,...,Référent Développement,Date de Livraison estimée,Date de Livraison réelle,Référent étude TMA,Chiffrage TMA JH,Versions affectées,Description,Description US,Critères d’acceptation,Résumé du ticket
4,SIG-13948,FP-968 - Décommissionnement des appels sortant...,FE968 - Process recouvrement clients particuli...,Gildas MOEVI,En étude,SIG V24.5,PE-2071,Non_Défini,Gildas MOEVI,Story,...,,,,Nassima BENGUERNANE,1,SIG V24.5,,{panel:bgColor=#e3fcef}\n*Contexte* : \n\nLe c...,||*Cas d'utilisation*||\n|{panel:bgColor=#e3fc...,*Contexte*\r\nLors du processus de recouvremen...
14,SIG-13909,[FP867 - Remplacement EJB] [Etude] Découpage m...,FP1085 - Petits sujets techniques SIG S1 2024,Sylvain EYDIEUX,Prêt,,PE-2110,,Emile MIQUEU,Etude,...,,2023-12-22,2023-12-22,Sylvain EYDIEUX,2,,,{panel:bgColor=#e3fcef}\n*Contexte* : Dans le ...,||*Cas d'utilisation*||\n|{panel:bgColor=#e3fc...,
15,SIG-13908,[FP867 - Remplacement EJB] [Etude] Découpage m...,FP1085 - Petits sujets techniques SIG S1 2024,Sylvain EYDIEUX,Prêt,,PE-2110,,Emile MIQUEU,Etude,...,,2023-12-22,2023-12-22,Sylvain EYDIEUX,6,,,{panel:bgColor=#e3fcef}\n*Contexte* : Dans le ...,||*Cas d'utilisation*||\n|{panel:bgColor=#e3fc...,
17,SIG-13900,[Archivage et suppression des données SIG] Eta...,FE1000 - Archivage/suppression des données cli...,Tatiana KARSENTI (absente semaine du 25/03),Prêt,,PE-2105,Non_Défini,Amine ABDOU,Etude,...,,,,Jean-Michel FORHAN,7,,L’objectif de cette étude est de :\n\n* Mettre...,{panel:bgColor=#e3fcef}\n*Contexte* : \n\nL'ét...,||Cas d'utilisation||\r\n|{panel:bgColor=#e3fc...,
18,SIG-13889,FP1054 - Gestion des dates de campagne,FP1054 - Evolutions iR C24,Pierre Niclas,En recette,SIG V24.4,PE-2101,Non_Défini,Gildas MOEVI,Story,...,Ayoub ELJADID,2024-03-13,2024-03-13,Nassima BENGUERNANE,11,SIG V24.4,,{panel:bgColor=#e3fcef}\n*Contexte* : Dans le ...,||*Cas d'utilisation*||\n|{panel:bgColor=#e3fc...,*Contexte*\r\n\r\nPermettre à la RBB et aux éq...


In [9]:
import spacy
from spacy.tokenizer import Tokenizer

nlp = spacy.load("fr_core_news_sm")
nlp.tokenizer = Tokenizer(nlp.vocab)

OSError: [E050] Can't find model 'fr_core_news_sm'. It doesn't seem to be a Python package or a valid path to a data directory.

In [74]:
dataset_filtered['entry'] = dataset_filtered['Projet'] + " " + dataset_filtered['Critères d’acceptation'] + " " + dataset_filtered['Description US']

documents = []
import multiprocessing

max_processes = multiprocessing.cpu_count()

for doc in nlp.pipe(dataset_filtered['entry'].astype(str), n_process=max_processes - 1, batch_size=1000):
    documents.append(' '.join([tok.lemma_ for tok in doc if not tok.is_stop and tok.is_alpha]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_filtered['entry'] = dataset_filtered['Projet'] + " " + dataset_filtered['Critères d’acceptation'] + " " + dataset_filtered['Description US']


In [75]:
cv = CountVectorizer(max_features=1500)
x_texts = cv.fit_transform(documents).toarray()
dataset_filtered['Chiffrage TMA JH'] = dataset_filtered['Chiffrage TMA JH'].str.replace(',', '.')

# Conversion des valeurs en flottants
y_chiffrage = dataset_filtered["Chiffrage TMA JH"].astype(float).values
print(y_chiffrage)

[ 1.  2.  6. ...  7. 21.  8.]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_filtered['Chiffrage TMA JH'] = dataset_filtered['Chiffrage TMA JH'].str.replace(',', '.')


In [76]:
train_texts, test_texts, train_labels, test_labels = train_test_split(x_texts, y_chiffrage, test_size=0.3)


In [77]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(train_texts, train_labels)

# Prédiction sur l'ensemble de test
y_pred = regressor.predict(test_texts)
regression_metrics(test_labels,y_pred,'LinearRegression')

Results for LinearRegression:
Mean Absolute Error (MAE): 23628740716786.5000
Mean Squared Error (MSE): 3521658428457642829469974528.0000
Root Mean Squared Error (RMSE): 59343562653902.4297
R^2 Score: -47035309496524216677171200.0000


In [78]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np




In [79]:
from sklearn.linear_model import Ridge

# Créer et entraîner le modèle de régression Ridge
ridge_regressor = Ridge()
ridge_regressor.fit(train_texts, train_labels)

# Prédiction sur l'ensemble de test
y_pred = ridge_regressor.predict(test_texts)
regression_metrics(test_labels,y_pred,'LinearRegression')

Results for LinearRegression:
Mean Absolute Error (MAE): 6.5257
Mean Squared Error (MSE): 98.3541
Root Mean Squared Error (RMSE): 9.9174
R^2 Score: -0.3136


In [80]:
from sklearn.ensemble import RandomForestRegressor

# Créer et entraîner le modèle de forêt aléatoire pour la régression
rf_regressor = RandomForestRegressor()
rf_regressor.fit(train_texts, train_labels)

# Prédiction sur l'ensemble de test
y_pred = rf_regressor.predict(test_texts)
regression_metrics(test_labels,y_pred,'LinearRegression')


Results for LinearRegression:
Mean Absolute Error (MAE): 5.1108
Mean Squared Error (MSE): 69.6380
Root Mean Squared Error (RMSE): 8.3449
R^2 Score: 0.0699


On passe au model de IA


In [81]:
dataset_filtered = dataset.dropna(subset=['Chiffrage TMA JH'])

dataset_filtered['combined_text'] = dataset_filtered['Projet'] + " " + dataset_filtered['Critères d’acceptation'] + " " + dataset_filtered['Description US']
X = dataset_filtered['combined_text'].values

y_beforeParse= dataset_filtered['Chiffrage TMA JH'].str.replace(',', '.')
y = y_beforeParse.astype(float).values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_filtered['combined_text'] = dataset_filtered['Projet'] + " " + dataset_filtered['Critères d’acceptation'] + " " + dataset_filtered['Description US']


In [82]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


In [83]:
from transformers import BertTokenizer, BertModel, BertForSequenceClassification
import torch
from torch import nn

# Initialize the tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=1)  # Change for regression


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [84]:
from torch.utils.data import Dataset, DataLoader
import torch

class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        inputs = self.tokenizer(text, max_length=512, truncation=True, padding='max_length', return_tensors="pt")
        inputs = {key: val.squeeze() for key, val in inputs.items()}  # Remove the batch dimension added by return_tensors
        label = torch.tensor(self.labels[idx], dtype=torch.float)
        return inputs, label


In [85]:
from transformers import BertForSequenceClassification, BertTokenizer
import torch.nn as nn

class BertForRegression(BertForSequenceClassification):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = 1  # This is for regression, indicating a single continuous output
        self.classifier = nn.Linear(config.hidden_size, 1)  # Ensure this outputs a single value

    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None):
        outputs = self.bert(input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids,
                            position_ids=position_ids,
                            head_mask=head_mask,
                            inputs_embeds=inputs_embeds)
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        return logits  # Directly return logits without squeezing here



In [86]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForRegression.from_pretrained('bert-base-uncased')

# Assume X and y are your dataset's features and labels
dataset = TextDataset(X_train,y_train, tokenizer)
loader = DataLoader(dataset, batch_size=8, shuffle=True)


Some weights of BertForRegression were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [87]:
from torch.optim import AdamW
from torch.nn import MSELoss

if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f'GPU: {torch.cuda.get_device_name(0)}')
else:
    device = torch.device("cpu")
    print('GPU not available, using CPU instead.')

optimizer = AdamW(model.parameters(), lr=5e-5)
loss_function = MSELoss()

model = model.to(device)
model.train()

for epoch in range(3):  # Assuming 3 epochs for demonstration
    for batch in loader:
        # Unpack the batch
        inputs, labels = batch

        # Move inputs (a dictionary of tensors) to the GPU
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Move labels to the GPU
        labels = labels.to(device)

        optimizer.zero_grad()

        # Forward pass and get logits directly. Ensure model's output matches the loss function's expectation
        outputs = model(**inputs)
        logits = outputs.squeeze(-1)  # Adjust this line based on your model's specific output structure

        # Calculate loss
        loss = loss_function(logits, labels)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch + 1} finished.")


GPU: Tesla T4
Epoch 1 finished.
Epoch 2 finished.
Epoch 3 finished.


In [88]:
model_path = "bert_for_regression.pth"
torch.save(model.state_dict(), model_path)

# If you're using a tokenizer, save it too
tokenizer_path = "bert_tokenizer"
tokenizer.save_pretrained(tokenizer_path)

('bert_tokenizer/tokenizer_config.json',
 'bert_tokenizer/special_tokens_map.json',
 'bert_tokenizer/vocab.txt',
 'bert_tokenizer/added_tokens.json')

In [89]:
model.eval()
test_dataset = TextDataset(X_test, y_test, tokenizer)  # Replace X_test, y_test with your test data and labels
test_loader = DataLoader(test_dataset, batch_size=8)

In [90]:
test_dataset = TextDataset(X_test, y_test, tokenizer)  # Replace X_test, y_test with your test data and labels
test_loader = DataLoader(test_dataset, batch_size=8)


In [92]:
# Collect all predictions and true labels
all_predictions = []
all_labels = []

with torch.no_grad():
    for batch in test_loader:
        inputs, labels = batch
        inputs = {k: v.to(device) for k, v in inputs.items()}
        labels = labels.to(device)

        outputs = model(**inputs).squeeze(-1)
        all_predictions.extend(outputs.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Calculate MSE
regression_metrics(all_labels, all_predictions, "BERT for Regression")

print(f"Mean Squared Error on Test Set: {mse}")

Results for BERT for Regression:
Mean Absolute Error (MAE): 5.2955
Mean Squared Error (MSE): 67.4701
Root Mean Squared Error (RMSE): 8.2140
R^2 Score: -0.0010
Mean Squared Error on Test Set: 67.47013854980469
