---
# <ins>**TASK 2**</ins>

In [1]:
import pandas as pd
import json
import nltk
import matplotlib.pyplot as plt
import re
import numpy as np
import optuna

from gensim.models import Word2Vec
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from tqdm.auto import tqdm

In [None]:
# nltk.download('punkt')

# **Utils Functions**

## 1. <ins>Original functions from starter_notebooks</ins>

In [2]:
def load_json_data(file_path):
    """
    Load JSON data from a file.

    Parameters:
    file_path (str): Path to the JSON file.

    Returns:
    dict: JSON data loaded from the file.
    """
    with open(file_path, "r") as file:
        contents = json.load(file)
    
    return contents


def create_tfidf_matrix(citing_dataset, nonciting_dataset, vectorizer=TfidfVectorizer()):
    """
    Creates TF-IDF matrix for the given citing and non-citing datasets based on the specified text column.

    Parameters:
    citing_dataset (json)): DataFrame containing citing patents.
    nonciting_dataset (json): DataFrame containing non-citing patents.
    vectorizer (TfidfVectorizer, optional): TfidfVectorizer object for vectorizing text data.
                                             Defaults to TfidfVectorizer().

    Returns:
    tuple: A tuple containing TF-IDF matrices for citing and non-citing patents respectively.
           (tfidf_matrix_citing, tfidf_matrix_nonciting)
    """
    all_text = [patent['text'] for patent in citing_dataset + nonciting_dataset]

    # Vectorizing descriptions
    # print("Vectorizing descriptions...")
    # tfidf_matrix = vectorizer.fit_transform(tqdm(all_text, desc="TF-IDF"))
    tfidf_matrix = vectorizer.fit_transform(all_text)

    # Since we're interested in similarities between citing and cited patents,
    # we need to split the TF-IDF matrix back into two parts
    split_index = len(citing_dataset)
    tfidf_matrix_citing = tfidf_matrix[:split_index]
    tfidf_matrix_nonciting = tfidf_matrix[split_index:]

    # Size of vocabulary
    # print("Size of vocabulary:", len(vectorizer.vocabulary_))

    return tfidf_matrix_citing, tfidf_matrix_nonciting

def create_word2vec_matrix(citing_dataset, nonciting_dataset, word2vec_model):
    """
    Creates Word2Vec matrix for the given citing and non-citing datasets based on the specified Word2Vec model.

    Parameters:
    citing_dataset (list of str): List of citing patents' text.
    nonciting_dataset (list of str): List of non-citing patents' text.
    word2vec_model (Word2Vec): Pre-trained Word2Vec model.

    Returns:
    tuple: A tuple containing Word2Vec matrices for citing and non-citing patents respectively.
           (word2vec_matrix_citing, word2vec_matrix_nonciting)
    """
    def get_average_vector(text, model):
        words = text.split()
        vectors = [model.wv.get_vector(word) for word in words if word in model.wv]
        if vectors:
            return np.mean(vectors, axis=0)
        else:
            # If no words in the document are found in the model vocabulary,
            # return a zero vector of the same dimensionality
            return np.zeros(model.vector_size)

    # Convert text data to Word2Vec vectors
    word2vec_matrix_citing = np.array([get_average_vector(patent["text"], word2vec_model) for patent in citing_dataset])
    word2vec_matrix_nonciting = np.array([get_average_vector(patent["text"], word2vec_model) for patent in nonciting_dataset])

    return word2vec_matrix_citing, word2vec_matrix_nonciting

def get_mapping_dict(mapping_df):
    """
    Creates dictionary of citing ids to non-citing id based on given dataframe (which is based on providedjson)

    Parameters:
    mapping_df (DataFrame): DataFrame containing mapping between citing and cited patents
    Returns:
    dict: dictionary of unique citing patent ids to list of cited patent ids
    """
    mapping_dict = {}

    for _, row in mapping_df.iterrows():
        key = row[0]  # Value from column 0
        value = row[2]  # Value from column 2
        if key in mapping_dict:
            mapping_dict[key].append(value)
        else:
            mapping_dict[key] = [value]

    return mapping_dict

def create_corpus(corpus, text_type):
    """
    Extracts text data from a corpus based on the specified text type.

    Parameters:
    corpus (list): List of dictionaries representing patent documents.
    text_type (str): Type of text to extract ('title', 'abstract', 'claim1', 'claims', 'description', 'fulltext').

    Returns:
    list: List of dictionaries with 'id' and 'text' keys representing each document in the corpus.
    """

    app_ids = [doc['Application_Number'] + doc['Application_Category'] for doc in corpus]

    cnt = 0 # count the number of documents without text
    texts = []  # list of texts
    ids_to_remove = []  # list of ids of documents without text, to remove them from the corpus

    if text_type == 'title':
        for doc in corpus:
            try:
                texts.append(doc['Content']['title'])
            except: # if the document does not have a title
                ids_to_remove.append(doc['Application_Number']+doc['Application_Category'])
                cnt += 1
        if cnt > 0:
            print(f"Number of documents without title: {cnt}")

    elif text_type == 'abstract':
        for doc in corpus:
            try:
                texts.append(doc['Content']['pa01'])
            except: # if the document does not have an abstract
                ids_to_remove.append(doc['Application_Number']+doc['Application_Category'])
                cnt += 1
        if cnt > 0:
            print(f"Number of documents without abstract: {cnt}")

    elif text_type == 'claim1':
        for doc in corpus:
            try:
                texts.append(doc['Content']['c-en-0001'])
            except: # if the document does not have claim 1
                ids_to_remove.append(doc['Application_Number']+doc['Application_Category'])
                cnt += 1

        if cnt > 0:
            print(f"Number of documents without claim 1: {cnt}")

    elif text_type == 'claims':
        # all the values with the key starting with 'c-en-', each element in the final list is a list of claims
        for doc in corpus:
            doc_claims = []
            for key in doc['Content'].keys():
                if key.startswith('c-en-'):
                    doc_claims.append(doc['Content'][key])
            if len(doc_claims) == 0:    # if the document does not have any claims
                ids_to_remove.append(doc['Application_Number']+doc['Application_Category'])
                cnt += 1
            else:
                doc_text_string = ' '.join(doc_claims)
                texts.append(doc_text_string)
        if cnt > 0:
            print(f"Number of documents without claims: {cnt}")

    elif text_type == 'description':
        # all the values with the key starting with 'p'
        for doc in corpus:
            doc_text = []
            for key in doc['Content'].keys():
                if key.startswith('p'):
                    doc_text.append(doc['Content'][key])
            if len(doc_text) == 0:  # if the document does not have any description
                ids_to_remove.append(doc['Application_Number']+doc['Application_Category'])
                cnt += 1
            else:
                doc_text_string = ' '.join(doc_text)
                texts.append(doc_text_string)
        if cnt > 0:
            print(f"Number of documents without description: {cnt}")

    elif text_type == 'fulltext':
        for doc in corpus:
            doc_text = list(doc['Content'].values())
            doc_text_string = ' '.join(doc_text)
            texts.append(doc_text_string)
        if cnt > 0:
            print(f"Number of documents without any text: {cnt}")

    else:
        raise ValueError("Invalid text type")

    if len(ids_to_remove) > 0:
        print(f"Removing {len(ids_to_remove)} documents without required text")
        for id_ in ids_to_remove[::-1]:
            idx = app_ids.index(id_)
            del app_ids[idx]

    # Create a list of dictionaries with app_ids and texts
    corpus_data = [{'id': app_id, 'text': text} for app_id, text in zip(app_ids, texts)]

    return corpus_data


def get_true_and_predicted(citing_to_cited_dict, recommendations_dict):
    """
    Get the true and predicted labels for the metrics calculation.

    Parameters:
    citing_to_cited_dict : dict of str : list of str
        Mapping between citing patents and the list of their cited patents
    recommendations_dict : dict of str : list of str
        Mapping between citing patents and the sorted list of recommended patents

    Returns:
    list of list
        True relevant items for each recommendation list.
    list of list
        Predicted recommended items for each recommendation list.
    int
        Number of patents not in the citation mapping
    """
    # Initialize lists to store true labels and predicted labels
    true_labels = []
    predicted_labels = []
    not_in_citation_mapping = 0

    # Iterate over the items in both dictionaries
    for citing_id in recommendations_dict.keys():
        # Check if the citing_id is present in both dictionaries
        if citing_id in citing_to_cited_dict:
            # If yes, append the recommended items from both dictionaries to the respective lists
            true_labels.append(citing_to_cited_dict[citing_id])
            predicted_labels.append(recommendations_dict[citing_id])
        else:
            not_in_citation_mapping += 1

    return true_labels, predicted_labels, not_in_citation_mapping



def mean_recall_at_k(true_labels, predicted_labels, k=10):
    """
    Calculate the mean Recall@k for a list of recommendations.

    Parameters:
    true_labels : list of list
        True relevant items for each recommendation list.
    predicted_labels : list of list
        Predicted recommended items for each recommendation list.
    k : int
        Number of recommendations to consider.

    Returns:
    float
        Mean Recall@k value.
    """
    recalls_at_k = []

    for true, pred in zip(true_labels, predicted_labels):
        # Calculate Recall@k for each recommendation list
        true_set = set(true)
        k = min(k, len(pred))
        relevant_count = sum(1 for item in pred[:k] if item in true_set)
        recalls_at_k.append(relevant_count / len(true_set))

    # Calculate the mean Recall@k
    mean_recall = sum(recalls_at_k) / len(recalls_at_k)

    return mean_recall

def mean_inv_ranking(true_labels, predicted_labels):
    """
    Calculate the mean of lists of the mean inverse rank of true relevant items
    in the lists of sorted recommended items.

    Parameters:
    true_labels : list of list
        True relevant items for each recommendation list.
    predicted_labels : list of list
        Predicted recommended items for each recommendation list.

    Returns:
    float
        Mean of lists of the mean inverse rank of true relevant items.
    """
    mean_ranks = []

    for true, pred in zip(true_labels, predicted_labels):
        # Calculate the inverse rank of true relevant items
        # in the recommendation list
        ranks = []
        for item in true:
            try:
                rank = 1 / (pred.index(item) + 1)
            except ValueError:
                rank = 0  # If item not found, assign 0
            ranks.append(rank)

        # Calculate the mean inverse rank of true relevant items
        # in the recommendation list
        mean_rank = sum(ranks) / len(ranks)
        mean_ranks.append(mean_rank)

    # Calculate the mean of the mean inverse ranks across all recommendation lists
    mean_of_mean_ranks = sum(mean_ranks) / len(mean_ranks)

    return mean_of_mean_ranks


def mean_ranking(true_labels, predicted_labels):
    """
    Calculate the mean of lists of the mean rank of true relevant items
    in the lists of sorted recommended items.

    Parameters:
    true_labels : list of list
        True relevant items for each recommendation list.
    predicted_labels : list of list
        Predicted recommended items for each recommendation list.

    Returns:
    float
        Mean of lists of the mean rank of true relevant items.
    """
    mean_ranks = []

    for true, pred in zip(true_labels, predicted_labels):
        # Calculate the rank of true relevant items
        # in the recommendation list
        ranks = []
        for item in true:
            try:
                rank = pred.index(item) + 1
            except ValueError:
                rank = len(pred)  # If item not found, assign the length of the list
            ranks.append(rank)

        # Calculate the mean rank of true relevant items
        # in the recommendation list
        mean_rank = sum(ranks) / len(ranks)
        mean_ranks.append(mean_rank)

    # Calculate the mean of the mean ranks across all recommendation lists
    mean_of_mean_ranks = sum(mean_ranks) / len(mean_ranks)

    return mean_of_mean_ranks



def mean_average_precision(true_labels, predicted_labels, k=10):
    """
    Calculate the mean Average Precision for a list of recommendations.

    Parameters:
    true_labels : list of list
        True relevant items for each recommendation list.
    predicted_labels : list of list
        Predicted recommended items for each recommendation list.
    k : int
        Number of recommendations to consider.

    Returns:
    float
        Mean Average Precision value.
    """
    average_precisions = []

    for true, pred in zip(true_labels, predicted_labels):
        # Calculate Average Precision for each recommendation list
        true_set = set(true)
        precision_at_k = []
        relevant_count = 0
        for i, item in enumerate(pred[:k]):
            if item in true_set:
                relevant_count += 1
                precision_at_k.append(relevant_count / (i + 1))
        average_precision = sum(precision_at_k) / len(true_set)
        average_precisions.append(average_precision)

    # Calculate the mean Average Precision
    mean_average_precision = sum(average_precisions) / len(average_precisions)

    return mean_average_precision

def top_k_ranks(citing, cited, cosine_similarities, k=10):
    # Create a dictionary to store the top k ranks for each citing patent
    top_k_ranks = {}
    for i, content_id in enumerate(citing):
        top_k_ranks[content_id['id']] = [cited[j]['id'] for j in np.argsort(cosine_similarities[i])[::-1][:k]]
    return top_k_ranks

def get_true_and_predicted_task_2(citation_to_paragraph_dict, recommendations_dict):

    """
    Get the true and predicted labels for the metrics calculation.

    Parameters:
    citation_to_paragraph_dict :
        dict of str : list of str
        Mapping between the concatenation of citing and cited id and the list of their paragraphs
    recommendations_dict :
        dict of str : list of str
        Mapping between the concatenation of citing and cited id and the sorted list of recommended paragraphs

    Returns:
    list of list
        True relevant items for each recommendation list.
    list of list
        Predicted recommended items sorted for each recommendation list.
    int
        Number of pairs of citing and cited patents not in the citation mapping
    """
    # Initialize lists to store true labels and predicted labels
    true_labels = []
    predicted_labels = []
    not_in_citation_mapping = 0

    # Iterate over the items in both dictionaries
    for id in citation_to_paragraph_dict.keys():
        # Check if the pair of citing_id and cited_id is present in both dictionaries
        true_labels.append(citation_to_paragraph_dict[id])
        if id in recommendations_dict:
            predicted_labels.append(recommendations_dict[id])
        else:
            predicted_labels.append([])
            not_in_citation_mapping += 1

    return true_labels, predicted_labels, not_in_citation_mapping


## 2. <ins>Customs functions</ins>

In [9]:
def clean_text(text, remove_stopwords=True, stem=False):
    """
    Clean the text data by converting to lowercase, removing stopwords, stemming, and removing special characters.

    Parameters:
    text (str): Text data to clean.
    remove_stopwords (bool, optional): Whether to remove stopwords. Defaults to True.
    stem (bool, optional): Whether to perform stemming. Defaults to False.

    Returns:
    str: Cleaned text data.
    """
    text = text.lower()
    
    if remove_stopwords:
        stop_words = set(stopwords.words('english'))
        text = ' '.join([word for word in text.split() if word not in stop_words])

    text = re.sub(r'\(([0-9])+\)', '', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^a-z0-9\s]', '', text)
    
    if stem:
        ps = PorterStemmer()
        text = ' '.join([ps.stem(word) for word in text.split()])

    return text

def clean_corpus(corpus, verbose=True, stem=False):
    """
    Clean the text data in the corpus by converting to lowercase, removing stopwords, stemming, and removing special characters.

    Parameters:
    corpus (list): List of dictionaries representing patent documents.
    verbose (bool, optional): Whether to show progress bar. Defaults to True.
    stem (bool, optional): Whether to perform stemming. Defaults to False.

    Returns:
    list: List of dictionaries with 'id' and 'text' keys representing each document in the corpus.
    """
    if verbose:
        with ProcessPoolExecutor() as executor:
            corpus_clean = list(tqdm(executor.map(clean_text, [patent['text'] for patent in corpus]), total=len(corpus), desc="Cleaning"))
        
        return [{'id': patent['id'], 'text': text} for patent, text in zip(corpus, corpus_clean)]

    else:
        return [{'id': patent['id'], 'text': clean_text(patent['text'], stem=stem)} for patent in corpus]

def split_text_w2v(cited_corpus, citing_corpus):
    """
    Split text data into sentences for Word2Vec training.

    Parameters:
    cited_corpus (dict): Dictionary containing cited patent paragraphs.
    citing_paragraphs (dict): Dictionary containing citing patent paragraphs.

    Returns:
    list: List of lists of sentences for Word2Vec training.
    """
    sentences = []

    for k in cited_corpus.keys():
        for paragraph in cited_corpus[k]:
            sentences.append(nltk.sent_tokenize(paragraph["text"]))

        sentences.append(nltk.sent_tokenize(citing_corpus[k][0]["text"]))

    return sentences

def get_y_true(mapping_dataset_df):
    """
    Get the true labels for the citation mapping dataset.

    Parameters:
    mapping_dataset_df (DataFrame): DataFrame containing the citation mapping dataset.

    Returns:
    dict: Dictionary of citing and cited patent pairs to the citation label.
    """
    y_true = {}
    for i, rows in mapping_dataset_df.iterrows():
        key = rows[0] + "_" + rows[2]
        y_true[key] = rows[3]

    return y_true

def create_pairs(data, n_cited=10):
    """
    Create pairs of citing and cited patents from the given dataset.

    Parameters:
    data (dict): Dictionary containing the dataset.
    n_cidted (int, optional): Number of cited patents to consider. Defaults to 10.

    Returns:
    list: List of tuples representing citing and cited patent pairs.
    """
    pairs = []

    for key in data.keys():
        for i, value in enumerate(data[key]):
            pairs.append((key, value))

            if i == n_cited - 1:
                break

    return pairs

def get_cited_corpus(val):
    """
    Get the cited paragraphs from the given patent document.

    Parameters:
    val (dict): Dictionary containing the patent document.

    Returns:
    list: List of cited paragraphs.
    """
    ids = []
    paragraphs = []

    for k in val.keys():
        if k.startswith('p') or k.startswith('c'):
            paragraphs.append(val[k])
            ids.append(k)

    return ids, paragraphs

def get_citing_corpus(val):
    """
    Get the citing paragraphs from the given patent document.

    Parameters:
    val (dict): Dictionary containing the patent document.

    Returns:
    str: Concatenated citing paragraphs.
    """
    paragraphs = []

    for k in val.keys():
        paragraphs.append(val[k])

    return " ".join(paragraphs)

def get_citing_paragraphs(pairs, df):
    """
    Get the citing paragraphs from the given citing patent dataset.

    Parameters:
    pairs (list): List of citing and cited patent pairs.

    Returns:
    list: List of citing patent ids.
    """
    citing_ids = np.unique([e[0] for e in pairs]).tolist()
    all_citing_paragraphs = []

    for citing in tqdm(citing_ids, desc="get citing paragraphs"):
        citing_text = df[df['Application_Number'] == citing[:-2]]["Content"].values[0]
        descriptions = get_citing_corpus(citing_text)
        all_citing_paragraphs += [descriptions]

    return citing_ids, all_citing_paragraphs

def get_cited_paragraphs(pairs, nonciting_dataset_df):
    """
    Get the cited paragraphs from the given cited patent dataset.

    Parameters:
    pairs (list): List of citing and cited patent pairs.
    nonciting_dataset_df (DataFrame): DataFrame containing the non-citing patent dataset.

    Returns:
    list: List of cited patent ids.
    """
    all_cited_ids = []
    all_cited_paragraphs = []
    all_type_of_text = []

    for citing, cited in tqdm(pairs, desc="get cited paragraphs"):
        cited_text = nonciting_dataset_df[nonciting_dataset_df['Application_Number'] == cited[:-2]]['Content'].values[0]
        cited_ids, cited_paragraphs = get_cited_corpus(cited_text)
        all_cited_paragraphs += [cited_paragraphs]
        all_cited_ids += [cited]
        all_type_of_text += [cited_ids]


    return all_cited_ids, all_cited_paragraphs, all_type_of_text

def map_ids_to_text(ids, texts):
    """
    Map patent ids to their corresponding text data.

    Parameters:
    ids (list): List of patent ids.
    texts (list): List of text data.

    Returns:
    list: List of dictionaries containing patent ids and text data.
    """
    mapping = []

    for i, id in enumerate(ids):
        mapping.append({"id": id, "text": texts[i]})

    return mapping

def ind_citing(citing_dataset, citing_id):
    """
    Get the index of the citing patent in the citing dataset.

    Parameters:
    citing_dataset (list): List of citing patents.
    citing_id (str): Citing patent id.

    Returns:
    int: Index of the citing patent in the citing dataset.
    """
    for i, citing in enumerate(citing_dataset):
        if citing['id'] == citing_id:
            return i

    return -1

def predict_pipeline(train_pairs, corpus_citing, corpus_cited, create_matrix, vectorizer, k=100):
    """
    Predict citing patents

    Parameters:
    train_pairs (list): List of citing and cited patent pairs.
    corpus_citing (list): List of citing patent paragraphs.
    corpus_cited (list): List of cited patent paragraphs.
    create_matrix (function): Function to create the matrix for the model.
    vectorizer (TfidfVectorizer): TF-IDF vectorizer.
    k (int, optional): Number of recommendations to consider. Defaults to 100.

    Returns:
    dict: Dictionary of citing patent ids to the sorted list of recommended cited patent ids.
    """
    pred = {}

    for citing, cited in tqdm(train_pairs, total=len(train_pairs)):
        key = citing + "_" + cited
        corpus_cited_sub = corpus_cited[key]
        corpus_citing_sub = corpus_citing[key]

        matrix_citing, matrix_cited = create_matrix(corpus_citing_sub, corpus_cited_sub, vectorizer)
        cosine_similarities = linear_kernel(matrix_citing, matrix_cited)
        top_k_ranks_dict = top_k_ranks(corpus_citing_sub, corpus_cited_sub, cosine_similarities, k=k)

        pred[key] = top_k_ranks_dict[citing]

    return pred

# **I. Prepare data**

<ins>Explaination of the data preparation steps:</ins>
1. For each citing patent, we concatenate all text data into one string. This includes the title, abstract, claims and paragraphs.
2. For each cited patent corresponding to the citing patent, we store paragraphs and claims to later perform a ranking.
3. We clean the text data by removing `special characters`, `digits`, and `stopwords`.
4. We perform `stemming` on the text data, to reduce words to their root form.

After the data preparation steps, we obtain two dictionaries:
1. `citing_dict`: A dictionary where the key is the patent number and the value is the concatenated text data.
2. `cited_dict`: A dictionary where the key is the patent number and the value contains paragraphs and claims.

In [4]:
json_citing_train = load_json_data("./datasets/Content_JSONs/Citing_2020_Cleaned_Content_12k/Citing_Train_Test/citing_TRAIN.json")
json_citing_test = load_json_data("./datasets/Content_JSONs/Citing_2020_Cleaned_Content_12k/Citing_Train_Test/citing_TEST.json")

json_nonciting = load_json_data("./datasets/Content_JSONs/Cited_2020_Uncited_2010-2019_Cleaned_Content_22k/CLEANED_CONTENT_DATASET_cited_patents_by_2020_uncited_2010-2019.json")
json_citing_to_cited = load_json_data("./datasets/Citation_JSONs/Citation_Train.json")
json_citing_id = load_json_data("./Citing_ID_List_Test.json")

In [5]:
citing_dataset_df = pd.DataFrame(json_citing_train)
nonciting_dataset_df = pd.DataFrame(json_nonciting)
mapping_dataset_df = pd.DataFrame(json_citing_to_cited)

train_mapping_dict = get_mapping_dict(mapping_dataset_df)

In [10]:
# Get all train pairs (id_citing, id_cited)
train_pairs = create_pairs(train_mapping_dict)

# Get paragraphs of citing and cited
citing_ids, citing_paragraphs = get_citing_paragraphs(train_pairs, citing_dataset_df)
cited_ids, cited_corpus, cited_type = get_cited_paragraphs(train_pairs, nonciting_dataset_df)

# Create the citing corpus
citing_corpus = map_ids_to_text(citing_ids, citing_paragraphs)

# Clean the citing corpus
citing_corpus = clean_corpus(citing_corpus, stem=True)

# Create the citing and cited corpus dictionaries
# Key: id_citing + "_" + id_cited
corpus_citing_sub_dict = {}
corpus_cited_sub_dict = {}

for i, (citing, cited) in tqdm(enumerate(train_pairs), total=len(train_pairs), desc="Creating Corpus Dictionaries"):
    key = citing + "_" + cited

    corpus_cited_sub = map_ids_to_text(cited_type[i], cited_corpus[i])
    corpus_citing_sub = [citing_corpus[ind_citing(citing_corpus, citing)]]

    # Clean the cited corpus
    corpus_cited_sub = clean_corpus(corpus_cited_sub, verbose=False, stem=True)

    # Add citing and cited corpus to the dictionaries
    corpus_citing_sub_dict[key] = corpus_citing_sub
    corpus_cited_sub_dict[key] = corpus_cited_sub

get citing paragraphs:   0%|          | 0/6831 [00:00<?, ?it/s]

get cited paragraphs:   0%|          | 0/8594 [00:00<?, ?it/s]

Cleaning:   0%|          | 0/6831 [00:00<?, ?it/s]

Creating Corpus Dictionaries:   0%|          | 0/8594 [00:00<?, ?it/s]

In [6]:
# Store the citing and cited corpus dictionaries to save time
# with open("datasets/corpus_citing_sub_dict.json", "w") as file:
#     json.dump(corpus_citing_sub_dict, file)

# with open("datasets/corpus_cited_sub_dict.json", "w") as file:
#     json.dump(corpus_cited_sub_dict, file)

# Load the citing and cited corpus dictionaries
with open("datasets/corpus_citing_sub_dict.json", "r") as file:
    corpus_citing_sub_dict = json.load(file)

with open("datasets/corpus_cited_sub_dict.json", "r") as file:
    corpus_cited_sub_dict = json.load(file)

In [11]:
# Get the true labels for the citation mapping dataset
y_true = get_y_true(mapping_dataset_df)

# Create pairs of citing and cited patents
train_pairs = create_pairs(train_mapping_dict)

# **II. Basic Count Vectorizer**

In [12]:
# 1m30
# Predicting using CountVectorizer
vectorizer = CountVectorizer(max_features=10000)
y_pred = predict_pipeline(train_pairs, corpus_citing_sub_dict, corpus_cited_sub_dict, create_tfidf_matrix, vectorizer)

# Calculate metrics
y_true = get_y_true(mapping_dataset_df)
true_labels, predicted_labels, not_in_citation_mapping = get_true_and_predicted_task_2(y_true, y_pred)
print("Mean average precision (k=10): ", mean_average_precision(true_labels, predicted_labels, k=10))
print("Mean average precision (k=100): ", mean_average_precision(true_labels, predicted_labels, k=100))

  0%|          | 0/8594 [00:00<?, ?it/s]

Mean average precision (k=10):  0.11951776095505955
Mean average precision (k=100):  0.31461492981372485


In [16]:
# 1m30
# Predicting using CountVectorizer
vectorizer = CountVectorizer(binary=True)
y_pred = predict_pipeline(train_pairs, corpus_citing_sub_dict, corpus_cited_sub_dict, create_tfidf_matrix, vectorizer)

# Calculate metrics
y_true = get_y_true(mapping_dataset_df)
true_labels, predicted_labels, not_in_citation_mapping = get_true_and_predicted_task_2(y_true, y_pred)
print("Mean average precision (k=10): ", mean_average_precision(true_labels, predicted_labels, k=10))
print("Mean average precision (k=100): ", mean_average_precision(true_labels, predicted_labels, k=100))

  0%|          | 0/8594 [00:00<?, ?it/s]

Mean average precision (k=10):  0.14377675730094247
Mean average precision (k=100):  0.34109617938791215


# **III. Basic TF-IDF**

In [26]:
# Predicting using TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=10000)
y_pred = predict_pipeline(train_pairs, corpus_citing_sub_dict, corpus_cited_sub_dict, create_tfidf_matrix, vectorizer)

# Calculate metrics
y_true = get_y_true(mapping_dataset_df)
true_labels, predicted_labels, not_in_citation_mapping = get_true_and_predicted_task_2(y_true, y_pred)
print("Mean average precision (k=10): ", mean_average_precision(true_labels, predicted_labels, k=10))
print("Mean average precision (k=100): ", mean_average_precision(true_labels, predicted_labels, k=100))

  0%|          | 0/8594 [00:00<?, ?it/s]

Mean average precision (k=10):  0.08909157880725094
Mean average precision (k=100):  0.2749577513529771


# **IV. Word2Vec**

In [28]:
# Fit the Word2Vec model
sentences = split_text_w2v(cited_corpus, citing_paragraphs)
w2v = Word2Vec(sentences=sentences, vector_size=50, min_count=10, workers=4)
w2v.save("word2vec.model")

# Predict using Word2Vec
# w2v = Word2Vec.load("word2vec.model")
y_pred = predict_pipeline(train_pairs, corpus_citing_sub_dict, corpus_cited_sub_dict, create_word2vec_matrix, w2v)

# Calculate metrics
y_true = get_y_true(mapping_dataset_df)
true_labels, predicted_labels, not_in_citation_mapping = get_true_and_predicted_task_2(y_true, y_pred)
print("Mean average precision (k=10): ", mean_average_precision(true_labels, predicted_labels, k=10))
print("Mean average precision (k=100): ", mean_average_precision(true_labels, predicted_labels, k=100))

# **V. Fine-tune TF-IDF with Optuna**

In [35]:
# Objective function for Optuna
def objective(trial):
    # Define the hyperparameters to optimize
    sublinear_tf = trial.suggest_categorical("sublinear_tf", [True, False])
    norm = trial.suggest_categorical("norm", ['l1', 'l2'])
    max_features = trial.suggest_int("max_features", 1000, 10000)
    stop_words = trial.suggest_categorical("stop_words", ['english', None])
    n_gram_lower = trial.suggest_int("n_gram_lower", 1, 3)
    n_gram_upper = trial.suggest_int("n_gram_upper", 1, 3)

    if n_gram_upper < n_gram_lower:
        n_gram_upper = n_gram_lower

    # Create the vectorizer with the hyperparameters
    vectorizer = TfidfVectorizer(sublinear_tf=sublinear_tf, 
                                 norm=norm, 
                                 max_features=max_features, 
                                 stop_words=stop_words, 
                                 ngram_range=(n_gram_lower, n_gram_upper))

    # Predict using the pipeline
    y_pred = predict_pipeline(train_pairs, corpus_citing_sub_dict, corpus_cited_sub_dict, vectorizer)
    true_labels, predicted_labels, _ = get_true_and_predicted_task_2(y_true, y_pred)
    map_10 = mean_average_precision(true_labels, predicted_labels, k=10)
    
    return map_10

# Create the study and optimize the objective function
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=10)

# Print the best hyperparameters and the best MAP@10
print("Best hyperparameters:", study.best_params)
print("Best MAP@10:", study.best_value)


[I 2024-04-22 15:14:10,707] A new study created in memory with name: no-name-78ca1beb-1108-44f9-b684-73d3d8171df8


TF-IDF Predicting:   0%|          | 0/8594 [00:00<?, ?it/s]

[I 2024-04-22 15:19:57,690] Trial 0 finished with value: 0.06386463802374108 and parameters: {'sublinear_tf': False, 'norm': 'l1', 'max_features': 6626, 'stop_words': 'english', 'n_gram_lower': 2, 'n_gram_upper': 3}. Best is trial 0 with value: 0.06386463802374108.


TF-IDF Predicting:   0%|          | 0/8594 [00:00<?, ?it/s]

[I 2024-04-22 15:23:36,289] Trial 1 finished with value: 0.05476813803532983 and parameters: {'sublinear_tf': False, 'norm': 'l1', 'max_features': 8773, 'stop_words': None, 'n_gram_lower': 3, 'n_gram_upper': 2}. Best is trial 0 with value: 0.06386463802374108.


TF-IDF Predicting:   0%|          | 0/8594 [00:00<?, ?it/s]

[I 2024-04-22 15:30:04,921] Trial 2 finished with value: 0.06259411026390539 and parameters: {'sublinear_tf': False, 'norm': 'l1', 'max_features': 7412, 'stop_words': 'english', 'n_gram_lower': 1, 'n_gram_upper': 3}. Best is trial 0 with value: 0.06386463802374108.


TF-IDF Predicting:   0%|          | 0/8594 [00:00<?, ?it/s]

[I 2024-04-22 15:33:38,399] Trial 3 finished with value: 0.0598311855738742 and parameters: {'sublinear_tf': True, 'norm': 'l2', 'max_features': 5898, 'stop_words': None, 'n_gram_lower': 3, 'n_gram_upper': 2}. Best is trial 0 with value: 0.06386463802374108.


TF-IDF Predicting:   0%|          | 0/8594 [00:00<?, ?it/s]

[I 2024-04-22 15:37:07,875] Trial 4 finished with value: 0.04932945627562682 and parameters: {'sublinear_tf': True, 'norm': 'l1', 'max_features': 6839, 'stop_words': None, 'n_gram_lower': 3, 'n_gram_upper': 2}. Best is trial 0 with value: 0.06386463802374108.


TF-IDF Predicting:   0%|          | 0/8594 [00:00<?, ?it/s]

[I 2024-04-22 15:40:57,981] Trial 5 finished with value: 0.05812304382293742 and parameters: {'sublinear_tf': False, 'norm': 'l1', 'max_features': 9142, 'stop_words': None, 'n_gram_lower': 1, 'n_gram_upper': 2}. Best is trial 0 with value: 0.06386463802374108.


TF-IDF Predicting:   0%|          | 0/8594 [00:00<?, ?it/s]

[I 2024-04-22 15:43:58,222] Trial 6 finished with value: 0.047296627783998 and parameters: {'sublinear_tf': True, 'norm': 'l1', 'max_features': 5966, 'stop_words': None, 'n_gram_lower': 2, 'n_gram_upper': 2}. Best is trial 0 with value: 0.06386463802374108.


TF-IDF Predicting:   0%|          | 0/8594 [00:00<?, ?it/s]

[I 2024-04-22 15:47:32,158] Trial 7 finished with value: 0.05966976983726978 and parameters: {'sublinear_tf': True, 'norm': 'l2', 'max_features': 7052, 'stop_words': None, 'n_gram_lower': 3, 'n_gram_upper': 2}. Best is trial 0 with value: 0.06386463802374108.


TF-IDF Predicting:   0%|          | 0/8594 [00:00<?, ?it/s]

[I 2024-04-22 15:53:49,496] Trial 8 finished with value: 0.11279120534194183 and parameters: {'sublinear_tf': True, 'norm': 'l2', 'max_features': 4603, 'stop_words': 'english', 'n_gram_lower': 1, 'n_gram_upper': 3}. Best is trial 8 with value: 0.11279120534194183.


TF-IDF Predicting:   0%|          | 0/8594 [00:00<?, ?it/s]

[I 2024-04-22 15:56:40,274] Trial 9 finished with value: 0.08202457071281724 and parameters: {'sublinear_tf': True, 'norm': 'l2', 'max_features': 7806, 'stop_words': 'english', 'n_gram_lower': 2, 'n_gram_upper': 2}. Best is trial 8 with value: 0.11279120534194183.


Best hyperparameters: {'sublinear_tf': True, 'norm': 'l2', 'max_features': 4603, 'stop_words': 'english', 'n_gram_lower': 1, 'n_gram_upper': 3}
Best MAP@10: 0.11279120534194183


# **VI. Predictions on test**

In [26]:
# Predicting using TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=10000)
y_pred = predict_pipeline(train_pairs, corpus_citing_sub_dict, corpus_cited_sub_dict, create_tfidf_matrix, vectorizer)

# Calculate metrics
y_true = get_y_true(mapping_dataset_df)
true_labels, predicted_labels, not_in_citation_mapping = get_true_and_predicted_task_2(y_true, y_pred)
print("Mean average precision (k=10): ", mean_average_precision(true_labels, predicted_labels, k=10))
print("Mean average precision (k=100): ", mean_average_precision(true_labels, predicted_labels, k=100))

  0%|          | 0/8594 [00:00<?, ?it/s]

Mean average precision (k=10):  0.08909157880725094
Mean average precision (k=100):  0.2749577513529771


In [15]:
# Predicting using TfidfVectorizer
vectorizer = TfidfVectorizer(binary=True)
y_pred = predict_pipeline(train_pairs, corpus_citing_sub_dict, corpus_cited_sub_dict, create_tfidf_matrix, vectorizer)

# Calculate metrics
y_true = get_y_true(mapping_dataset_df)
true_labels, predicted_labels, not_in_citation_mapping = get_true_and_predicted_task_2(y_true, y_pred)
print("Mean average precision (k=10): ", mean_average_precision(true_labels, predicted_labels, k=10))
print("Mean average precision (k=100): ", mean_average_precision(true_labels, predicted_labels, k=100))

  0%|          | 0/8594 [00:00<?, ?it/s]

Mean average precision (k=10):  0.12540232509610746
Mean average precision (k=100):  0.31538537701093666


# **IV. Word2Vec**

In [25]:
# Fit the Word2Vec model
# sentences = split_text_w2v(corpus_cited_sub_dict, corpus_citing_sub_dict)
# w2v = Word2Vec(sentences=sentences, vector_size=50, min_count=10, workers=4)
# w2v.save("word2vec.model")

# Predict using Word2Vec
w2v = Word2Vec.load("word2vec.model")
y_pred = predict_pipeline(train_pairs, corpus_citing_sub_dict, corpus_cited_sub_dict, create_word2vec_matrix, w2v)

# Calculate metrics
y_true = get_y_true(mapping_dataset_df)
true_labels, predicted_labels, not_in_citation_mapping = get_true_and_predicted_task_2(y_true, y_pred)
print("Mean average precision (k=10) : ", mean_average_precision(true_labels, predicted_labels, k=10))
print("Mean average precision (k=100): ", mean_average_precision(true_labels, predicted_labels, k=100))

  0%|          | 0/8594 [00:00<?, ?it/s]

Mean average precision (k=10):  0.03848086624093492
Mean average precision (k=100):  0.1938707256086945


# **V. Fine-tune TF-IDF with Optuna**

In [28]:
# Objective function for Optuna
def objective(trial):
    # Define the hyperparameters to optimize
    sublinear_tf = trial.suggest_categorical("sublinear_tf", [True, False])
    norm = trial.suggest_categorical("norm", ['l1', 'l2'])
    max_features = trial.suggest_int("max_features", 1000, 10000)
    n_gram_lower = trial.suggest_int("n_gram_lower", 1, 3)
    n_gram_upper = trial.suggest_int("n_gram_upper", 1, 3)
    binary = trial.suggest_categorical("binary", [True, False])

    if n_gram_upper < n_gram_lower:
        n_gram_upper = n_gram_lower

    # Create the vectorizer with the hyperparameters
    vectorizer = TfidfVectorizer(sublinear_tf=sublinear_tf, 
                                 norm=norm,
                                binary=binary,
                                 max_features=max_features, 
                                 ngram_range=(n_gram_lower, n_gram_upper))

    # Predict using the pipeline
    y_pred = predict_pipeline(train_pairs, corpus_citing_sub_dict, corpus_cited_sub_dict, create_tfidf_matrix, vectorizer)
    true_labels, predicted_labels, _ = get_true_and_predicted_task_2(y_true, y_pred)
    map_10 = mean_average_precision(true_labels, predicted_labels, k=10)

    return map_10

# Create the study and optimize the objective function
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=10)

# Print the best hyperparameters and the best MAP@10
print("Best hyperparameters:", study.best_params)
print("Best Mean Average Precision (k=10):", study.best_value)


[I 2024-04-22 17:53:44,581] A new study created in memory with name: no-name-44d0043d-897b-41d8-ad7b-6baba654a552


  0%|          | 0/8594 [00:00<?, ?it/s]

[I 2024-04-22 17:56:54,671] Trial 0 finished with value: 0.05176587036313196 and parameters: {'sublinear_tf': True, 'norm': 'l1', 'max_features': 9387, 'n_gram_lower': 2, 'n_gram_upper': 2}. Best is trial 0 with value: 0.05176587036313196.


  0%|          | 0/8594 [00:00<?, ?it/s]

[I 2024-04-22 18:00:09,839] Trial 1 finished with value: 0.07348436911327604 and parameters: {'sublinear_tf': True, 'norm': 'l2', 'max_features': 1039, 'n_gram_lower': 2, 'n_gram_upper': 1}. Best is trial 1 with value: 0.07348436911327604.


  0%|          | 0/8594 [00:00<?, ?it/s]

[I 2024-04-22 18:04:11,291] Trial 2 finished with value: 0.05504875562242186 and parameters: {'sublinear_tf': True, 'norm': 'l2', 'max_features': 2316, 'n_gram_lower': 3, 'n_gram_upper': 1}. Best is trial 1 with value: 0.07348436911327604.


  0%|          | 0/8594 [00:00<?, ?it/s]

[I 2024-04-22 18:05:53,521] Trial 3 finished with value: 0.12321246696699874 and parameters: {'sublinear_tf': True, 'norm': 'l2', 'max_features': 8303, 'n_gram_lower': 1, 'n_gram_upper': 1}. Best is trial 3 with value: 0.12321246696699874.


  0%|          | 0/8594 [00:00<?, ?it/s]

[I 2024-04-22 18:09:59,312] Trial 4 finished with value: 0.055080564546617805 and parameters: {'sublinear_tf': True, 'norm': 'l2', 'max_features': 1819, 'n_gram_lower': 3, 'n_gram_upper': 2}. Best is trial 3 with value: 0.12321246696699874.


  0%|          | 0/8594 [00:00<?, ?it/s]

# **VI. Predictions on test**

In [6]:
test_mapping_dict = load_json_data("predictions/task1/prediction1_k100_449.json")

In [16]:
# Get all train pairs (id_citing, id_cited)
test_pairs = create_pairs(test_mapping_dict, 20)
test_citing_dataset_df = pd.DataFrame(json_citing_test)

# Get paragraphs of citing and cited
citing_ids, citing_paragraphs = get_citing_paragraphs(test_pairs, test_citing_dataset_df)
cited_ids, cited_corpus, cited_type = get_cited_paragraphs(test_pairs, nonciting_dataset_df)

# Create the citing corpus
citing_corpus = map_ids_to_text(citing_ids, citing_paragraphs)

# Clean the citing corpus
citing_corpus = clean_corpus(citing_corpus, stem=True)

# Create the citing and cited corpus dictionaries
# Key: id_citing + "_" + id_cited
corpus_citing_sub_dict = {}
corpus_cited_sub_dict = {}

for i, (citing, cited) in tqdm(enumerate(test_pairs), total=len(test_pairs), desc="Creating Corpus Dictionaries"):
    key = citing + "_" + cited

    corpus_cited_sub = map_ids_to_text(cited_type[i], cited_corpus[i])
    corpus_citing_sub = [citing_corpus[ind_citing(citing_corpus, citing)]]

    # Clean the cited corpus
    corpus_cited_sub = clean_corpus(corpus_cited_sub, verbose=False, stem=True)

    # Add citing and cited corpus to the dictionaries
    corpus_citing_sub_dict[key] = corpus_citing_sub
    corpus_cited_sub_dict[key] = corpus_cited_sub

get citing paragraphs:   0%|          | 0/1000 [00:00<?, ?it/s]

get cited paragraphs:   0%|          | 0/20000 [00:00<?, ?it/s]

Cleaning:   0%|          | 0/1000 [00:00<?, ?it/s]

Creating Corpus Dictionaries:   0%|          | 0/20000 [00:00<?, ?it/s]

In [21]:
# Store the citing and cited corpus dictionaries to save time
# with open("datasets/corpus_citing_sub_dict_test.json", "w") as file:
#     json.dump(corpus_citing_sub_dict, file)

with open("datasets/corpus_cited_sub_dict_test.json", "w") as file:
    json.dump(corpus_cited_sub_dict, file)

# Load the citing and cited corpus dictionaries
# with open("datasets/corpus_citing_sub_dict.json", "r") as file:
#     corpus_citing_sub_dict = json.load(file)

# with open("datasets/corpus_cited_sub_dict.json", "r") as file:
#     corpus_cited_sub_dict = json.load(file)

In [22]:
# Predicting using CountVectorizer
vectorizer = CountVectorizer(binary=True)
y_pred = predict_pipeline(test_pairs, corpus_citing_sub_dict, corpus_cited_sub_dict, create_tfidf_matrix, vectorizer)

# Store the result
filename = "prediction2_CountVec_Bin.json"
with open(f"predictions/task2/{filename}", 'w') as f:
    json.dump(y_pred, f)

  0%|          | 0/20000 [00:00<?, ?it/s]

In [None]:
# Predicting using TfidfVectorizer
vectorizer = TfidfVectorizer(sublinear_tf=True, stop_words='english', max_features=10000)
y_pred = predict_pipeline(test_pairs, corpus_citing_sub_dict, corpus_cited_sub_dict, create_tfidf_matrix, vectorizer)

# Store the result
filename = "prediction2_.json"
with open(f"predictions/task2/{filename}", 'w') as f:
    json.dump(y_pred, f)