**Load data**

In [None]:
# Tab-separated input files, resolved relative to the notebook's working directory.
train_file_path, test_file_path = 'train.txt', 'test.txt'

def load_data(file_path):
    """Load tab-separated sentence pairs and their similarity labels from a text file.

    Each non-blank line must hold exactly three tab-separated fields:
    ``phrase1<TAB>phrase2<TAB>similarity``. Leading and trailing whitespace is
    stripped from every line before it is split.

    Args:
        file_path: Path of the text file to read (decoded as UTF-8).

    Returns:
        A 3-tuple ``(phrases1, phrases2, similarities)`` of equal-length tuples
        of strings. The similarity labels are returned as read (strings), not
        converted to numbers. Three empty tuples are returned when the file
        holds no data lines.

    Raises:
        ValueError: If a non-blank line does not contain exactly three fields.
    """
    rows = []
    # An explicit encoding keeps decoding independent of the platform default.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line_number, line in enumerate(file, start=1):
            stripped = line.strip()
            if not stripped:
                # Blank lines (e.g. a trailing newline at end of file) carry no data.
                continue
            fields = stripped.split('\t')
            if len(fields) != 3:
                # Fail loudly here: zip(*rows) would otherwise truncate every
                # row to the shortest one and hide the malformed line.
                raise ValueError(
                    f"{file_path}: line {line_number} has {len(fields)} "
                    f"tab-separated field(s), expected 3"
                )
            rows.append(fields)

    if not rows:
        return (), (), ()

    phrases1, phrases2, similarities = zip(*rows)
    return phrases1, phrases2, similarities

# Load the training and test splits
train_phrases1, train_phrases2, train_similarities = load_data(train_file_path)
test_phrases1, test_phrases2, test_similarities = load_data(test_file_path)
import pandas as pd

# The similarity labels are read from text, so cast them to float explicitly;
# otherwise the column is stored with object dtype and every downstream
# consumer has to coerce the strings implicitly.
df = pd.DataFrame(
    {'Phrase1': train_phrases1, 'Phrase2': train_phrases2, 'Similarity': train_similarities}
).astype({'Similarity': float})
test_data = pd.DataFrame(
    {'Phrase1': test_phrases1, 'Phrase2': test_phrases2, 'Similarity': test_similarities}
).astype({'Similarity': float})

**BERT**

In [None]:
from transformers import BertTokenizer, BertModel
# Name the checkpoint once so the tokenizer vocabulary and the model weights
# are always loaded from the same source.
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertModel.from_pretrained(BERT_CHECKPOINT)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

**Generate Embeddings**

In [None]:
import torch
import torch.nn.functional as F
from sklearn.metrics.pairwise import cosine_similarity

def get_sentence_embeddings(tokenizer, model, s1, s2, pooling=None):
    """Encode two sentences with `model` and return one embedding per sentence.

    Args:
        tokenizer: Object whose ``encode(text, return_tensors='pt', truncation=True)``
            returns a ``(1, seq_len)`` tensor of token ids.
        model: Callable that maps the token ids to an output exposing
            ``last_hidden_state``.
        s1: First sentence.
        s2: Second sentence.
        pooling: How to reduce the per-token hidden states.
            ``None`` (default) keeps the original behaviour and returns the
            full per-token matrix with the batch dimension squeezed out;
            ``'mean'`` averages over the token axis; ``'cls'`` returns the
            hidden state of the first token.

    Returns:
        A tuple ``(embeddings_s1, embeddings_s2)`` of tensors. With the default
        ``pooling=None`` the two tensors generally have a different number of
        rows, because each sentence is tokenized on its own without padding.

    Raises:
        ValueError: If `pooling` is not ``None``, ``'mean'`` or ``'cls'``.
    """
    if pooling not in (None, 'mean', 'cls'):
        raise ValueError(f"pooling must be None, 'mean' or 'cls', got {pooling!r}")

    def _embed(sentence):
        # truncation=True caps the sequence at the tokenizer's configured
        # maximum length, so an over-long sentence is shortened instead of
        # being fed to the model at a length it was not built for.
        token_ids = tokenizer.encode(sentence, return_tensors='pt', truncation=True)

        # Inference only: no gradients are needed for feature extraction.
        with torch.no_grad():
            output = model(token_ids)

        # Drop the leading batch dimension (a single sentence per call).
        hidden = output.last_hidden_state.squeeze(0)

        if pooling == 'mean':
            return hidden.mean(dim=0)
        if pooling == 'cls':
            return hidden[0]
        return hidden

    return _embed(s1), _embed(s2)


In [None]:
# Embed every training pair, one row at a time. get_sentence_embeddings returns
# a 2-tuple per row; result_type='expand' spreads that tuple over the two new
# columns 'Embeddings1' and 'Embeddings2'.
# NOTE(review): this runs the model twice per row with no batching, so it is
# presumably the slowest cell of the notebook — consider caching the result.
df[['Embeddings1', 'Embeddings2']] = df.apply(lambda row: get_sentence_embeddings(tokenizer, model, row['Phrase1'], row['Phrase2']), axis=1, result_type='expand')

**Calculate cosine similarity, Euclidean distance, and Jaccard similarity**

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import jaccard_score

# Function to calculate cosine similarity
def calculate_cosine_similarity(embedding1, embedding2):
    """Return the cosine similarity of two embeddings after flattening them.

    Each input is flattened to a single row. The shorter row is then
    right-padded with zeros to the length of the longer one (nothing is ever
    truncated), and the similarity of the two rows is computed.

    Args:
        embedding1: First embedding; must support ``reshape`` and ``shape``.
        embedding2: Second embedding; must support ``reshape`` and ``shape``.

    Returns:
        The scalar similarity taken from the 1x1 result matrix.

    NOTE(review): for per-token matrices of different lengths the zero padding
    makes the score depend on the sentence lengths — confirm this is intended.
    """
    row1 = embedding1.reshape(1, -1)
    row2 = embedding2.reshape(1, -1)

    # Both rows must have the same width before they can be compared.
    width = max(row1.shape[1], row2.shape[1])

    def _right_pad(row):
        missing = width - row.shape[1]
        return np.pad(row, ((0, 0), (0, missing)), mode='constant')

    scores = cosine_similarity(_right_pad(row1), _right_pad(row2))

    # A single pair was compared, so the score is the only matrix entry.
    return scores[0][0]

# Function to calculate Euclidean distance
def calculate_euclidean_distance(vec1, vec2):
    """Return the norm of ``vec1 - vec2`` over their common leading length.

    Both inputs are cut to the length of the shorter one (along the first
    axis) before the difference is taken, so trailing entries of the longer
    input do not contribute to the result.

    Args:
        vec1: First array-like supporting ``len``, slicing and subtraction.
        vec2: Second array-like supporting ``len``, slicing and subtraction.

    Returns:
        ``np.linalg.norm`` of the element-wise difference.
    """
    shared_len = min(len(vec1), len(vec2))
    difference = vec1[:shared_len] - vec2[:shared_len]
    return np.linalg.norm(difference)

# Function to calculate Jaccard similarity
def calculate_jaccard_similarity(embeddings_s1, embeddings_s2):
    """Return the Jaccard similarity ``|A & B| / |A | B|`` of two item collections.

    Items are compared by value. Array-like items (tensor rows, NumPy rows,
    nested lists) are first converted to nested tuples of Python numbers.
    Without that step, tensor items are hashed by object identity, so two
    separately computed tensors never share an element and the score is
    always 0; NumPy rows are not hashable at all.

    Args:
        embeddings_s1: Iterable of items for the first sentence (e.g. a 2-D
            tensor/array whose rows are the items, or a list of tokens).
        embeddings_s2: Iterable of items for the second sentence.

    Returns:
        The ratio of shared to total distinct items, or 0.0 when both
        collections are empty.

    NOTE(review): value equality on floating-point vectors is exact, so
    contextual embeddings will only match when they are bit-identical —
    confirm whether token ids would be a better input for this feature.
    """
    def _freeze(item):
        # Tensors, arrays and NumPy scalars expose tolist(); plain tokens
        # (str, int, ...) do not and are used as they are.
        if hasattr(item, 'tolist'):
            item = item.tolist()
        if isinstance(item, (list, tuple)):
            return tuple(_freeze(part) for part in item)
        return item

    set_s1 = {_freeze(item) for item in embeddings_s1}
    set_s2 = {_freeze(item) for item in embeddings_s2}

    union = len(set_s1 | set_s2)
    if union == 0:
        return 0.0
    return len(set_s1 & set_s2) / union


In [None]:
# Add one scalar feature column per metric, each computed row by row from the
# stored pair of embeddings.
for _column, _metric in (
    ('Cosine_Embeddings', calculate_cosine_similarity),
    ('Euclidean_Embeddings', calculate_euclidean_distance),
    ('Jaccard_Embeddings', calculate_jaccard_similarity),
):
    # Bind the metric as a default argument so the lambda uses this iteration's function.
    df[_column] = df.apply(
        lambda row, _metric=_metric: _metric(row['Embeddings1'], row['Embeddings2']),
        axis=1,
    )


**Test Data**

In [None]:
# Print the row count plus per-column non-null counts and dtypes of the test
# frame as a sanity check before features are computed.
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Phrase1     250 non-null    object
 1   Phrase2     250 non-null    object
 2   Similarity  250 non-null    object
dtypes: object(3)
memory usage: 6.0+ KB


In [None]:
# Embed every test pair, one row at a time. get_sentence_embeddings returns a
# 2-tuple per row; result_type='expand' spreads that tuple over the two new
# columns 'Embeddings1' and 'Embeddings2'.
test_data[['Embeddings1', 'Embeddings2']] = test_data.apply(lambda row: get_sentence_embeddings(tokenizer, model, row['Phrase1'], row['Phrase2']), axis=1, result_type='expand')

# Add one scalar feature column per metric to the test frame, each computed
# row by row from the stored pair of embeddings.
for _column, _metric in (
    ('Cosine_Embeddings', calculate_cosine_similarity),
    ('Euclidean_Embeddings', calculate_euclidean_distance),
    ('Jaccard_Embeddings', calculate_jaccard_similarity),
):
    # Bind the metric as a default argument so the lambda uses this iteration's function.
    test_data[_column] = test_data.apply(
        lambda row, _metric=_metric: _metric(row['Embeddings1'], row['Embeddings2']),
        axis=1,
    )



**Models**

In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import ElasticNet, Ridge, Lasso,LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Assumes 'df' and 'test_data' already hold the three similarity feature columns.

FEATURE_COLUMNS = ['Cosine_Embeddings', 'Euclidean_Embeddings', 'Jaccard_Embeddings']

# Features and target for both splits. The target is cast to float explicitly
# so the regressors and metrics never receive the labels as strings.
X_train = df[FEATURE_COLUMNS]
y_train = df['Similarity'].astype(float)

X_test = test_data[FEATURE_COLUMNS]
y_test = test_data['Similarity'].astype(float)


def fit_and_report(label, model, X_train, y_train, X_test, y_test):
    """Fit `model` on the training split, score it on the test split, print both metrics.

    Args:
        label: Model name used as the prefix of the two printed lines.
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: Training features.
        y_train: Training target.
        X_test: Test features.
        y_test: Test target.

    Returns:
        A tuple ``(predictions, mse, r2)`` for the test split.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    mse = mean_squared_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)
    print(f"{label} Testing Mean Squared Error:", mse)
    print(f"{label} Testing R-squared:", r2)
    return predictions, mse, r2


# Linear regression
lr_model = LinearRegression()
y_pred_test_lr, mse_test_lr, r2_test_lr = fit_and_report(
    "Linear Regression", lr_model, X_train, y_train, X_test, y_test
)

# ElasticNet regression
elasticnet_model = ElasticNet(alpha=0.1, l1_ratio=0.5)  # You can adjust alpha and l1_ratio as needed
elasticnet_y_pred, elasticnet_mse, elasticnet_r2 = fit_and_report(
    "ElasticNet", elasticnet_model, X_train, y_train, X_test, y_test
)

# Ridge regression
ridge_model = Ridge(alpha=0.1)  # You can adjust alpha as needed
ridge_y_pred, ridge_mse, ridge_r2 = fit_and_report(
    "Ridge", ridge_model, X_train, y_train, X_test, y_test
)

# Lasso regression
lasso_model = Lasso(alpha=0.1)  # You can adjust alpha as needed
lasso_y_pred, lasso_mse, lasso_r2 = fit_and_report(
    "Lasso", lasso_model, X_train, y_train, X_test, y_test
)


Linear Regression Testing Mean Squared Error: 2.036263387225339
Linear Regression Testing R-squared: 0.0048385636732088955
ElasticNet Testing Mean Squared Error: 2.30434091245691
ElasticNet Testing R-squared: 0.13712726320820212
Ridge Testing Mean Squared Error: 2.0362981161106344
Ridge Testing R-squared: 0.004855701398827961
Lasso Testing Mean Squared Error: 2.3616546960858136
Lasso Testing R-squared: 0.16540999931279798
