**Load data**

In [1]:
train_file_path = 'train.txt'
test_file_path = 'test.txt'


def load_data(file_path):
    with open(file_path, 'r') as file:
        lines = file.readlines()
    pairs = [line.strip().split('\t') for line in lines]
    phrases1, phrases2, similarities = zip(*pairs)
    return phrases1, phrases2, similarities

train_phrases1, train_phrases2, train_similarities = load_data(train_file_path)
test_phrases1, test_phrases2, test_similarities = load_data(test_file_path)
import pandas as pd
df = pd.DataFrame({'Phrase1': train_phrases1, 'Phrase2': train_phrases2, 'Similarity': train_similarities})
test_data = pd.DataFrame({'Phrase1': test_phrases1, 'Phrase2': test_phrases2, 'Similarity': test_similarities})

In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13365 entries, 0 to 13364
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Phrase1     13365 non-null  object
 1   Phrase2     13365 non-null  object
 2   Similarity  13365 non-null  object
dtypes: object(3)
memory usage: 313.4+ KB


**Co-Occurence**

In [3]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [4]:
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import euclidean
from sklearn.metrics import jaccard_score

def calculate_similarity_co_occurrence(sentence1, sentence2):
    # Tokenize sentences
    X_list = word_tokenize(sentence1)
    Y_list = word_tokenize(sentence2)

    # Remove stopwords
    sw = stopwords.words('english')
    X_list_no_sw = [w.lower() for w in X_list if not w.lower() in sw]
    Y_list_no_sw = [w.lower() for w in Y_list if not w.lower() in sw]

    # Lemmatize words
    lemmatizer = WordNetLemmatizer()
    X_lem_set = {lemmatizer.lemmatize(w.lower()) for w in X_list_no_sw}
    Y_lem_set = {lemmatizer.lemmatize(w.lower()) for w in Y_list_no_sw}

    # Calculate cosine similarity
    l1 = [1 if w in X_lem_set else 0 for w in (X_lem_set.union(Y_lem_set))]
    l2 = [1 if w in Y_lem_set else 0 for w in (X_lem_set.union(Y_lem_set))]
    cosine_sim = cosine_similarity(np.array(l1).reshape(1, -1), np.array(l2).reshape(1, -1))

    # Calculate Euclidean distance
    euclidean_dist = euclidean(np.array(l1), np.array(l2))

    # Calculate Jaccard similarity
    jaccard_sim = jaccard_score(l1, l2)

    return cosine_sim[0][0], euclidean_dist, jaccard_sim


**TF-IDF**

In [5]:
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from scipy.spatial.distance import euclidean
from sklearn.metrics import jaccard_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
def lemmatize_sentence(sentence):
    # Initialize the WordNet lemmatizer
    lemmatizer = WordNetLemmatizer()

    # Tokenize the sentence into words
    words = word_tokenize(sentence)

    # Lemmatize each word in the sentence
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]

    # Join the lemmatized words back into a sentence
    #lemmatized_sentence = ' '.join(lemmatized_words)

    return lemmatized_words
def calculate_similarity_TF_IDF(sentence1, sentence2) :
    list_sentences=[sentence1, sentence2]
    tfidf_vectorizer = TfidfVectorizer(tokenizer=lemmatize_sentence, stop_words='english')
    tfidf_matrix = tfidf_vectorizer.fit_transform(list_sentences)
    feature_names = tfidf_vectorizer.get_feature_names_out()
    tfidf_df = tfidf_matrix.toarray()
    cosine_sim = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])
    vec1 = tfidf_matrix[0].toarray().flatten() if hasattr(tfidf_matrix[0], 'toarray') else tfidf_matrix[0].flatten()
    vec2 = tfidf_matrix[1].toarray().flatten() if hasattr(tfidf_matrix[1], 'toarray') else tfidf_matrix[1].flatten()
    intersection = np.sum(np.minimum(vec1, vec2))
    union = np.sum(np.maximum(vec1, vec2))
    jaccard_sim = intersection / union
    euclidean_dist = euclidean(tfidf_matrix[0].toarray().flatten(), tfidf_matrix[1].toarray().flatten())
    return cosine_sim, euclidean_dist, jaccard_sim




In [6]:
df[['Cosine Co-occurrence', 'Euclidean Co-occurrence', 'Jaccard Co-occurrence']] = df.apply(lambda x: calculate_similarity_co_occurrence(x['Phrase1'], x['Phrase2']), axis=1, result_type='expand')
df[['Cosine TF-IDF', 'Euclidean TF-IDF', 'Jaccard TF-IDF']] = df.apply(lambda x: calculate_similarity_TF_IDF(x['Phrase1'], x['Phrase2']), axis=1, result_type='expand')



In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13365 entries, 0 to 13364
Data columns (total 9 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Phrase1                  13365 non-null  object 
 1   Phrase2                  13365 non-null  object 
 2   Similarity               13365 non-null  object 
 3   Cosine Co-occurrence     13365 non-null  float64
 4   Euclidean Co-occurrence  13365 non-null  float64
 5   Jaccard Co-occurrence    13365 non-null  float64
 6   Cosine TF-IDF            13365 non-null  object 
 7   Euclidean TF-IDF         13365 non-null  float64
 8   Jaccard TF-IDF           13365 non-null  float64
dtypes: float64(5), object(4)
memory usage: 939.9+ KB


**Test Data**

In [8]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Phrase1     250 non-null    object
 1   Phrase2     250 non-null    object
 2   Similarity  250 non-null    object
dtypes: object(3)
memory usage: 6.0+ KB


In [9]:
test_data[['Cosine Co-occurrence', 'Euclidean Co-occurrence', 'Jaccard Co-occurrence']] = test_data.apply(lambda x: calculate_similarity_co_occurrence(x['Phrase1'], x['Phrase2']), axis=1, result_type='expand')
test_data[['Cosine TF-IDF', 'Euclidean TF-IDF', 'Jaccard TF-IDF']] = test_data.apply(lambda x: calculate_similarity_TF_IDF(x['Phrase1'], x['Phrase2']), axis=1, result_type='expand')



In [10]:
X_train = df[['Cosine Co-occurrence', 'Euclidean Co-occurrence', 'Jaccard Co-occurrence',
              'Euclidean TF-IDF', 'Jaccard TF-IDF']]
y_train = df['Similarity']

X_test = test_data[['Cosine Co-occurrence', 'Euclidean Co-occurrence', 'Jaccard Co-occurrence',
                    'Euclidean TF-IDF', 'Jaccard TF-IDF']]
y_test = test_data['Similarity']

**LinearRegression**

In [11]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

y_pred_test_lr = lr_model.predict(X_test)
mse_test_lr = mean_squared_error(y_test, y_pred_test_lr)

print("Linear Regression Testing Mean Squared Error:", mse_test_lr)


Linear Regression Testing Mean Squared Error: 0.9909606450181514


**RandomForestRegressor**

In [12]:
from sklearn.ensemble import RandomForestRegressor

rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)


y_pred_test_rf = rf_model.predict(X_test)
mse_test_rf = mean_squared_error(y_test, y_pred_test_rf)

print("Random Forest Testing Mean Squared Error:", mse_test_rf)



Random Forest Testing Mean Squared Error: 1.1910818757046437


**GradientBoostingRegressor**

In [13]:
from sklearn.ensemble import GradientBoostingRegressor


gb_model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
gb_model.fit(X_train, y_train)

y_pred_test_gb = gb_model.predict(X_test)
mse_test_gb = mean_squared_error(y_test, y_pred_test_gb)


print("Gradient Boosting Testing Mean Squared Error:", mse_test_gb)



Gradient Boosting Testing Mean Squared Error: 0.9558532147046316
