---
# <ins>**TASK 1**</ins>

In [4]:
import pandas as pd
import json
import nltk
import matplotlib.pyplot as plt
import re
import numpy as np
import optuna
import torch

from transformers import BertTokenizer, BertModel
from gensim.models import Word2Vec
from concurrent.futures import ProcessPoolExecutor
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from tqdm.auto import tqdm

In [14]:
# nltk.download('punkt')

# **Utils Functions**

## 1. <ins>Original functions from starter_notebooks</ins>

In [5]:
def load_json_data(file_path):
    """
    Read a JSON file and return its parsed contents.

    Parameters:
    file_path (str): Path of the JSON file to read.

    Returns:
    The Python object produced by json.load (typically a list or a dict).
    """
    with open(file_path, "r") as handle:
        return json.load(handle)


def create_tfidf_matrix(citing_dataset, nonciting_dataset, vectorizer=None):
    """
    Fit a vectorizer on the citing and non-citing texts together and return one matrix per dataset.

    Both datasets are vectorized in a single fit so that they share the same vocabulary
    (and, for TF-IDF, the same document frequencies).

    Parameters:
    citing_dataset (list of dict): Citing patents; each item must have a 'text' key.
    nonciting_dataset (list of dict): Non-citing patents; each item must have a 'text' key.
    vectorizer (optional): Object exposing fit_transform(texts), e.g. a TfidfVectorizer.
                           When None, a fresh TfidfVectorizer() is created for this call
                           (a default instance built in the signature would be shared
                           between every call that relies on the default).

    Returns:
    tuple: (tfidf_matrix_citing, tfidf_matrix_nonciting), the rows of the fitted matrix
           belonging to the citing and to the non-citing patents respectively.
    """
    if vectorizer is None:
        vectorizer = TfidfVectorizer()

    citing_texts = [patent['text'] for patent in citing_dataset]
    nonciting_texts = [patent['text'] for patent in nonciting_dataset]

    # Vectorize everything at once, citing documents first
    tfidf_matrix = vectorizer.fit_transform(citing_texts + nonciting_texts)

    # The first len(citing_texts) rows are the citing patents, the rest the non-citing ones
    split_index = len(citing_texts)
    tfidf_matrix_citing = tfidf_matrix[:split_index]
    tfidf_matrix_nonciting = tfidf_matrix[split_index:]

    return tfidf_matrix_citing, tfidf_matrix_nonciting



def get_mapping_dict(mapping_df):
    """
    Group the cited patent ids by citing patent id.

    Parameters:
    mapping_df (DataFrame): Citation mapping; column 0 holds the citing patent id
                            and column 2 the cited patent id of each citation.

    Returns:
    dict: citing patent id -> list of cited patent ids, in row order.
    """
    citations = {}

    for _, record in mapping_df.iterrows():
        citing_id, cited_id = record[0], record[2]
        # setdefault creates the list the first time a citing id is seen
        citations.setdefault(citing_id, []).append(cited_id)

    return citations

def create_corpus(corpus, text_type):
    """
    Extracts text data from a corpus based on the specified text type.

    Each document's id and text are collected together, so a document that lacks the
    requested text is simply skipped. (Removing ids afterwards by value would drop the
    wrong entry, and misalign ids and texts, whenever two documents share an id.)

    Parameters:
    corpus (list): List of dictionaries representing patent documents. Each needs the keys
                   'Application_Number', 'Application_Category' and 'Content' (a dict of texts).
    text_type (str): Type of text to extract ('title', 'abstract', 'claim1', 'claims', 'description', 'fulltext').

    Returns:
    list: List of dictionaries with 'id' and 'text' keys, one per document that has the
          requested text, in corpus order.

    Raises:
    ValueError: If text_type is not one of the supported values.
    """
    # text_type -> (key inside doc['Content'], label used in the "missing" message)
    single_fields = {
        'title': ('title', 'title'),
        'abstract': ('pa01', 'abstract'),
        'claim1': ('c-en-0001', 'claim 1'),
    }
    # text_type -> (key prefix inside doc['Content'], label used in the "missing" message)
    prefixed_fields = {
        'claims': ('c-en-', 'claims'),
        'description': ('p', 'description'),
    }

    if text_type in single_fields:
        field, label = single_fields[text_type]
    elif text_type in prefixed_fields:
        field, label = prefixed_fields[text_type]
    elif text_type == 'fulltext':
        field, label = None, 'any text'
    else:
        raise ValueError("Invalid text type")

    corpus_data = []
    missing = 0  # number of documents without the requested text

    for doc in corpus:
        app_id = doc['Application_Number'] + doc['Application_Category']

        if text_type in single_fields:
            try:
                text = doc['Content'][field]
            except (KeyError, TypeError):
                # no 'Content', 'Content' is not a mapping, or the field is absent
                missing += 1
                continue
        elif text_type in prefixed_fields:
            # every value whose key starts with the prefix, joined in dict order
            parts = [value for key, value in doc['Content'].items() if key.startswith(field)]
            if not parts:
                missing += 1
                continue
            text = ' '.join(parts)
        else:
            # fulltext: every piece of content, joined in dict order
            text = ' '.join(doc['Content'].values())

        corpus_data.append({'id': app_id, 'text': text})

    if missing > 0:
        print(f"Number of documents without {label}: {missing}")
        print(f"Removing {missing} documents without required text")

    return corpus_data


def get_true_and_predicted(citing_to_cited_dict, recommendations_dict):
    """
    Align ground-truth citations with recommendations for metric computation.

    Only citing ids that appear in both dictionaries are kept; the two returned
    lists are index-aligned and follow the iteration order of recommendations_dict.

    Parameters:
    citing_to_cited_dict : dict of str : list of str
        Citing patent id -> list of the patents it really cites.
    recommendations_dict : dict of str : list of str
        Citing patent id -> ranked list of recommended patents.

    Returns:
    list of list
        True relevant items, one list per kept citing patent.
    list of list
        Recommended items, one list per kept citing patent.
    int
        Number of recommended citing patents absent from the citation mapping.
    """
    matched = [
        (citing_to_cited_dict[citing_id], recommended)
        for citing_id, recommended in recommendations_dict.items()
        if citing_id in citing_to_cited_dict
    ]

    true_labels = [truth for truth, _ in matched]
    predicted_labels = [recommended for _, recommended in matched]
    # Whatever was not matched is, by construction, missing from the citation mapping
    not_in_citation_mapping = len(recommendations_dict) - len(matched)

    return true_labels, predicted_labels, not_in_citation_mapping



def mean_recall_at_k(true_labels, predicted_labels, k=10):
    """
    Calculate the mean Recall@k for a list of recommendations.

    Parameters:
    true_labels : list of list
        True relevant items for each recommendation list. Every list must be
        non-empty (recall is undefined otherwise and a ZeroDivisionError is raised).
    predicted_labels : list of list
        Predicted recommended items for each recommendation list.
    k : int
        Number of recommendations to consider.

    Returns:
    float
        Mean Recall@k value, or 0.0 when there is nothing to evaluate.
    """
    recalls_at_k = []

    for true, pred in zip(true_labels, predicted_labels):
        true_set = set(true)
        # Slicing already copes with lists shorter than k. k must NOT be rebound here:
        # shrinking it for one short list would silently lower the cutoff for every
        # query that follows.
        relevant_count = sum(1 for item in pred[:k] if item in true_set)
        recalls_at_k.append(relevant_count / len(true_set))

    if not recalls_at_k:
        return 0.0

    return sum(recalls_at_k) / len(recalls_at_k)

def mean_inv_ranking(true_labels, predicted_labels):
    """
    Average, over all queries, of the mean reciprocal rank of the true relevant items.

    For each query, every relevant item scores 1 / rank (rank starting at 1) in the
    recommendation list, or 0 when it was not recommended; these scores are averaged
    per query, then across queries.

    Parameters:
    true_labels : list of list
        True relevant items for each recommendation list.
    predicted_labels : list of list
        Predicted recommended items for each recommendation list.

    Returns:
    float
        Mean over queries of the mean reciprocal rank of the relevant items.
    """
    def reciprocal_rank(item, ranking):
        # list.index raises ValueError when the item was never recommended
        try:
            return 1 / (ranking.index(item) + 1)
        except ValueError:
            return 0

    per_query_scores = []
    for relevant, ranking in zip(true_labels, predicted_labels):
        reciprocal_ranks = [reciprocal_rank(item, ranking) for item in relevant]
        per_query_scores.append(sum(reciprocal_ranks) / len(reciprocal_ranks))

    return sum(per_query_scores) / len(per_query_scores)


def mean_ranking(true_labels, predicted_labels):
    """
    Average, over all queries, of the mean rank of the true relevant items.

    For each query, every relevant item contributes its 1-based position in the
    recommendation list; an item that was not recommended is charged the length of
    that list. Positions are averaged per query, then across queries.

    Parameters:
    true_labels : list of list
        True relevant items for each recommendation list.
    predicted_labels : list of list
        Predicted recommended items for each recommendation list.

    Returns:
    float
        Mean over queries of the mean rank of the relevant items.
    """
    per_query_means = []

    for relevant, ranking in zip(true_labels, predicted_labels):
        # Penalty rank for relevant items missing from the recommendations
        fallback_rank = len(ranking)
        positions = [
            ranking.index(item) + 1 if item in ranking else fallback_rank
            for item in relevant
        ]
        per_query_means.append(sum(positions) / len(positions))

    return sum(per_query_means) / len(per_query_means)



def mean_average_precision(true_labels, predicted_labels, k=10):
    """
    Mean Average Precision over the top-k recommendations of every query.

    For each query, the precision is taken at every position (within the first k)
    that holds a relevant item; the sum of these precisions is divided by the
    number of distinct relevant items, and the result is averaged across queries.

    Parameters:
    true_labels : list of list
        True relevant items for each recommendation list.
    predicted_labels : list of list
        Predicted recommended items for each recommendation list.
    k : int
        Number of recommendations to consider.

    Returns:
    float
        Mean Average Precision value.
    """
    per_query_ap = []

    for relevant, ranking in zip(true_labels, predicted_labels):
        relevant_set = set(relevant)
        precisions = []
        for position, item in enumerate(ranking[:k], start=1):
            if item in relevant_set:
                # len(precisions) + 1 is the number of hits seen so far, this one included
                precisions.append((len(precisions) + 1) / position)
        per_query_ap.append(sum(precisions) / len(relevant_set))

    return sum(per_query_ap) / len(per_query_ap)

def top_k_ranks(citing, cited, cosine_similarities, k=10):
    """
    Rank the cited candidates of every citing patent by decreasing similarity.

    Parameters:
    citing (list of dict): Citing patents, each with an 'id' key; item i matches row i
                           of cosine_similarities.
    cited (list of dict): Candidate patents, each with an 'id' key; item j matches
                          column j of cosine_similarities.
    cosine_similarities: Row-indexable score matrix (citing x cited).
    k (int, optional): Number of candidates to keep per citing patent. Defaults to 10.

    Returns:
    dict: citing id -> list of the k candidate ids with the highest scores, best first.
    """
    ranking_by_citing_id = {}

    for row_index, citing_doc in enumerate(citing):
        # argsort is ascending, so reverse it before taking the first k columns
        best_columns = np.argsort(cosine_similarities[row_index])[::-1][:k]
        ranking_by_citing_id[citing_doc['id']] = [cited[column]['id'] for column in best_columns]

    return ranking_by_citing_id


## 2. <ins>BM25</ins>

In [6]:
# FIXME(review): this cell contains only the bare name `d`, which is not defined
# anywhere in the visible notebook, so evaluating it raises NameError. The BM25
# helper (`create_bm25_matrix`) referenced by the BM25 experiment is presumably
# meant to be defined here — confirm and restore it.
d

## 3. <ins>Custom functions</ins>

In [7]:
# Stop-word sets already loaded from NLTK, keyed by language name
_STOP_WORDS_CACHE = {}


def _get_stop_words(language='english'):
    """Return the NLTK stop-word set for ``language``, loading it only on first use."""
    if language not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE[language] = frozenset(stopwords.words(language))
    return _STOP_WORDS_CACHE[language]


def clean_text(text, remove_stopwords=True, remove_digits=False, stem=False, lemmatize=False):
    """
    Clean the text data by converting to lowercase, removing stopwords and special characters,
    and optionally stemming or lemmatizing.

    Steps, in order: lowercase; drop English stop words (whitespace-separated tokens,
    matched before punctuation is stripped); drop parenthesised numbers such as "(12)";
    collapse runs of whitespace into one space; strip every character that is not a
    lowercase letter, whitespace or (unless remove_digits) a digit; stem or lemmatize.

    Parameters:
    text (str): Text data to clean.
    remove_stopwords (bool, optional): Whether to remove stopwords. Defaults to True.
    remove_digits (bool, optional): Whether to strip digits as well as special characters.
                                    Defaults to False.
    stem (bool, optional): Whether to perform stemming. Defaults to False.
    lemmatize (bool, optional): Whether to perform lemmatization. Ignored when stem is True.
                                Defaults to False.

    Returns:
    str: Cleaned text data.
    """
    text = text.lower()

    if remove_stopwords:
        # The stop-word set is cached: reloading it from NLTK for every document
        # is loop-invariant work when cleaning a whole corpus.
        stop_words = _get_stop_words('english')
        text = ' '.join([word for word in text.split()
                        if word not in stop_words])

    # Remove (1), (2), etc patterns
    text = re.sub(r'\(([0-9])+\)', '', text)

    # Remove extra whitespaces
    text = re.sub(r'\s+', ' ', text)

    # Remove special characters
    if not remove_digits:
        text = re.sub(r'[^a-z0-9\s]', '', text)

    # Keep only alphabetic characters
    else:
        text = re.sub(r'[^a-z\s]', '', text)

    # Stemming takes precedence over lemmatization when both are requested
    if stem:
        text = stem_text(text)

    elif lemmatize:
        text = lemmatize_text(text)

    return text


def clean_corpus(corpus, verbose=True, remove_digits=False, stem=False, lemmatize=False):
    """
    Clean the 'text' of every document of the corpus in place with clean_text.

    Parameters:
    corpus (list): List of dictionaries with a 'text' key representing patent documents.
    verbose (bool, optional): Whether to show a progress bar. Defaults to True.
    remove_digits (bool, optional): Forwarded to clean_text. Defaults to False.
    stem (bool, optional): Whether to perform stemming. Defaults to False.
    lemmatize (bool, optional): Whether to perform lemmatization. Defaults to False.

    Returns:
    None: the dictionaries of `corpus` are modified in place.
    """
    # Wrap the corpus in a progress bar only when asked to
    patents = tqdm(corpus, desc="Cleaning") if verbose else corpus

    for patent in patents:
        patent['text'] = clean_text(patent['text'], stem=stem, lemmatize=lemmatize, remove_digits=remove_digits)


def stem_text(text):
    """
    Stem every whitespace-separated word of the text with NLTK's PorterStemmer.

    Parameters:
    text (str): Text data to perform stemming on.

    Returns:
    str: The stemmed words joined by single spaces.
    """
    stemmer = PorterStemmer()
    stemmed_words = map(stemmer.stem, text.split())
    return ' '.join(stemmed_words)

def lemmatize_text(text):
    """
    Lemmatize every whitespace-separated word of the text with NLTK's WordNetLemmatizer.

    Parameters:
    text (str): Text data to perform lemmatization on.

    Returns:
    str: The lemmatized words joined by single spaces.
    """
    wordnet_lemmatizer = nltk.WordNetLemmatizer()
    lemmas = map(wordnet_lemmatizer.lemmatize, text.split())
    return ' '.join(lemmas)

def create_word2vec_matrix(citing_dataset, nonciting_dataset, word2vec_model):
    """
    Embed every patent as the average of the Word2Vec vectors of its known words.

    Parameters:
    citing_dataset (list of dict): Citing patents; each item must have a 'text' key.
    nonciting_dataset (list of dict): Non-citing patents; each item must have a 'text' key.
    word2vec_model (Word2Vec): Trained model; its `wv` vectors and `vector_size` are used.

    Returns:
    tuple: (word2vec_matrix_citing, word2vec_matrix_nonciting), one row per patent.
           A patent with no word in the model vocabulary gets an all-zero row.
    """
    def embed(document_text):
        known_vectors = [
            word2vec_model.wv.get_vector(token)
            for token in document_text.split()
            if token in word2vec_model.wv
        ]
        if not known_vectors:
            # Nothing in vocabulary: fall back to a zero vector of the model's size
            return np.zeros(word2vec_model.vector_size)
        return np.mean(known_vectors, axis=0)

    def embed_all(dataset):
        return np.array([embed(patent["text"]) for patent in dataset])

    return embed_all(citing_dataset), embed_all(nonciting_dataset)

def create_bert_matrix(citing_dataset, nonciting_dataset, model_name='bert-base-uncased'):
    """
    Creates BERT matrices for the given citing and non-citing datasets with a pre-trained BERT model.

    Each document is truncated to 512 tokens and represented by the final hidden state
    of its [CLS] token.

    Parameters:
    citing_dataset (list): Citing patents, either dictionaries with a 'text' key (the
                           format predict_pipeline_task1 passes) or plain strings.
    nonciting_dataset (list): Non-citing patents, in the same format.
    model_name (str, optional): Name of the pre-trained BERT model. Defaults to 'bert-base-uncased'.

    Returns:
    tuple: (bert_matrix_citing, bert_matrix_nonciting), two 2-D numpy arrays with one
           row per document, directly usable by pairwise similarity functions.
    """
    # Load pre-trained BERT tokenizer and model
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertModel.from_pretrained(model_name)

    # Put the model in evaluation mode
    model.eval()

    def _text_of(item):
        # The pipeline passes {'id': ..., 'text': ...} dictionaries; also accept raw strings
        return item['text'] if isinstance(item, dict) else item

    def _encode(item):
        # Calling the tokenizer (rather than .encode) also returns the attention mask.
        # No padding is requested: with one sequence per forward pass it is unnecessary,
        # and padding without a mask lets the pad tokens influence the [CLS] vector.
        encoded = tokenizer(_text_of(item), add_special_tokens=True, truncation=True,
                            max_length=512, return_tensors='pt')
        with torch.no_grad():
            outputs = model(**encoded)
        # Take the output of the [CLS] token as the document representation: shape (1, hidden)
        return outputs[0][:, 0, :].numpy()

    def _encode_all(dataset):
        rows = [_encode(item) for item in dataset]
        if not rows:
            return np.zeros((0, model.config.hidden_size), dtype=np.float32)
        # Stack the (1, hidden) rows into a single (n_documents, hidden) matrix
        return np.vstack(rows)

    bert_matrix_citing = _encode_all(citing_dataset)
    bert_matrix_nonciting = _encode_all(nonciting_dataset)

    return bert_matrix_citing, bert_matrix_nonciting

def predict_pipeline_task1(corpus_citing, corpus_nonciting, vectorizer, create_matrix=create_tfidf_matrix, cleaning_data=False, k=100, similarity=None):
    """
    Pipeline for predicting the top k ranks for each citing patent based on the pairwise similarity
    between the vectors of citing and non-citing patents.

    Parameters:
    corpus_citing (list): List of dictionaries representing citing patents.
    corpus_nonciting (list): List of dictionaries representing non-citing patents.
    vectorizer: Third argument forwarded to create_matrix, e.g. a TfidfVectorizer, a trained
                Word2Vec model or a BERT model name, depending on create_matrix.
    create_matrix (function, optional): Function building the two matrices from
                (corpus_citing, corpus_nonciting, vectorizer). Defaults to create_tfidf_matrix.
    cleaning_data (bool, optional): Whether to clean the text data (in place). Defaults to False.
    k (int, optional): Number of recommendations to consider. Defaults to 100.
    similarity (function, optional): Pairwise scoring function called as
                similarity(vec_citing, vec_nonciting) and returning a (citing x non-citing)
                score matrix. Defaults to sklearn's linear_kernel, i.e. the dot product.
                The dot product equals the cosine similarity only when the rows are
                L2-normalised (the TfidfVectorizer default); pass a cosine function for
                unnormalised embeddings.

    Returns:
    dict: Dictionary containing the top k ranks for each citing patent.
    """
    if similarity is None:
        similarity = linear_kernel

    if cleaning_data:
        print("Cleaning data...")
        clean_corpus(corpus_citing)
        clean_corpus(corpus_nonciting)

    vec_citing, vec_nonciting = create_matrix(corpus_citing, corpus_nonciting, vectorizer)
    similarity_scores = similarity(vec_citing, vec_nonciting)
    top_k_ranks_dict = top_k_ranks(corpus_citing, corpus_nonciting, similarity_scores, k=k)

    return top_k_ranks_dict

def show_results(predicted_ranking, mapping_dict, k=100):
    """
    Print the evaluation metrics of a predicted ranking.

    Reports the mean Recall at 10, 20, 50 and 100 and the Mean Average Precision at k.

    Parameters:
    predicted_ranking (dict): Citing id -> ranked list of recommended ids.
    mapping_dict (dict): Citing id -> list of truly cited ids.
    k (int, optional): Cutoff used for the Mean Average Precision. Defaults to 100.
    """
    print("\n")

    true_labels, predicted_labels, unmapped_count = get_true_and_predicted(mapping_dict, predicted_ranking)
    if unmapped_count > 0:
        print(f"Number of patents not in the citation mapping: {unmapped_count}")

    # Compute every metric before printing any of them
    recall_by_cutoff = [
        (cutoff, mean_recall_at_k(true_labels, predicted_labels, cutoff))
        for cutoff in (10, 20, 50, 100)
    ]
    mean_avg_precision = mean_average_precision(true_labels, predicted_labels, k)

    for cutoff, recall in recall_by_cutoff:
        # The cutoff is left-aligned on 3 characters so that the colons line up
        print(f"Mean Recal@{cutoff:<3}: {recall:.4f}")
    print(f"Mean Average Precision: {mean_avg_precision:.4f}")

In [6]:
# Citing patents: train and test splits
json_citing_train = load_json_data("./datasets/Content_JSONs/Citing_2020_Cleaned_Content_12k/Citing_Train_Test/citing_TRAIN.json")
json_citing_test = load_json_data("./datasets/Content_JSONs/Citing_2020_Cleaned_Content_12k/Citing_Train_Test/citing_TEST.json")

# Candidate pool of non-citing patents, the train citation mapping, and a list of test citing ids
# NOTE(review): json_citing_id is loaded from the notebook directory rather than ./datasets
# and is not used in the visible cells — confirm it is still needed.
json_nonciting = load_json_data("./datasets/Content_JSONs/Cited_2020_Uncited_2010-2019_Cleaned_Content_22k/CLEANED_CONTENT_DATASET_cited_patents_by_2020_uncited_2010-2019.json")
json_citing_to_cited = load_json_data("./datasets/Citation_JSONs/Citation_Train.json")
json_citing_id = load_json_data("./Citing_ID_List_Test.json")

In [7]:
# DataFrame views of the raw JSON; only mapping_dataset_df is consumed below
citing_dataset_df = pd.DataFrame(json_citing_train)
nonciting_dataset_df = pd.DataFrame(json_nonciting)
mapping_dataset_df = pd.DataFrame(json_citing_to_cited)

# Ground truth used by every evaluation: citing id -> list of cited ids
mapping_dict = get_mapping_dict(mapping_dataset_df)

In [None]:
# Preview a handful of entries instead of rendering the whole citation mapping
dict(list(mapping_dict.items())[:5])

# **I. Basic TFIDF**

In [None]:
# 3m (author's timing note, presumably wall-clock runtime of the cell)
# Baseline: TF-IDF on the raw full text of both corpora, no cleaning step
print("Basic TF-IDF matrix")
print("Full text for citing and non-citing patents not cleaned.")

corpus_citing_train = create_corpus(json_citing_train, "fulltext")
corpus_nonciting = create_corpus(json_nonciting, "fulltext")

# Vocabulary capped at 10000 terms, English stop words dropped, sublinear term frequency
vectorizer = TfidfVectorizer(stop_words='english', max_features=10000, sublinear_tf=True)
predicted_ranking = predict_pipeline_task1(corpus_citing_train, corpus_nonciting, vectorizer)

show_results(predicted_ranking, mapping_dict)

Basic TF-IDF matrix
Full text for citing and non-citing patents not cleaned.


Mean Recal@10 : 0.6651
Mean Recal@20 : 0.7456
Mean Recal@50 : 0.8320
Mean Recal@100: 0.8932
Mean Average Precision: 0.4571


In [32]:
# 1m40s (author's timing note, presumably wall-clock runtime of the cell)
# Variant: compare the claims of the citing patents with the description of the candidates
print("Basic TF-IDF matrix")
print("Claims for citing and description for non-citing patents not cleaned.")

# create_corpus drops documents that have no claims / no description
corpus_citing_train = create_corpus(json_citing_train, "claims")
corpus_nonciting = create_corpus(json_nonciting, "description")

vectorizer = TfidfVectorizer(stop_words='english', max_features=10000, sublinear_tf=True)
predicted_ranking = predict_pipeline_task1(corpus_citing_train, corpus_nonciting, vectorizer)

show_results(predicted_ranking, mapping_dict)

Basic TF-IDF matrix
Claims for citing and description for non-citing patents not cleaned.




Mean Recal@10 : 0.6712
Mean Recal@20 : 0.7483
Mean Recal@50 : 0.8460
Mean Recal@100: 0.9002
Mean Average Precision: 0.4518


In [34]:
# 30m (author's timing note, presumably wall-clock runtime of the cell)
# Keep digits
# Variant: full text of both corpora, cleaned and stemmed
print("Basic TF-IDF matrix")
# The label names the corpora actually built below (full text on both sides)
print("Full text for citing and non-citing patents stemmed.")

corpus_citing_train = create_corpus(json_citing_train, "fulltext")
corpus_nonciting = create_corpus(json_nonciting, "fulltext")

# Cleaning happens in place on the freshly built corpora
clean_corpus(corpus_citing_train, verbose=True, stem=True)
clean_corpus(corpus_nonciting, verbose=True, stem=True)

vectorizer = TfidfVectorizer(stop_words='english', max_features=10000, sublinear_tf=True)
predicted_ranking = predict_pipeline_task1(corpus_citing_train, corpus_nonciting, vectorizer)

show_results(predicted_ranking, mapping_dict)

Basic TF-IDF matrix
Full text for citing and description for non-citing patents stemmed.


Cleaning:   0%|          | 0/6831 [00:00<?, ?it/s]

Cleaning:   0%|          | 0/16837 [00:00<?, ?it/s]



Mean Recal@10 : 0.6651
Mean Recal@20 : 0.7479
Mean Recal@50 : 0.8345
Mean Recal@100: 0.8883
Mean Average Precision: 0.4516


In [48]:
# 30m (author's timing note, presumably wall-clock runtime of the cell)
# Remove digits
# Variant: full text of both corpora, cleaned, stemmed and stripped of digits
print("Basic TF-IDF matrix")
# The label names the corpora actually built below (full text on both sides)
print("Full text for citing and non-citing patents stemmed and digits removed.")

corpus_citing_train = create_corpus(json_citing_train, "fulltext")
corpus_nonciting = create_corpus(json_nonciting, "fulltext")

# Cleaning happens in place on the freshly built corpora
clean_corpus(corpus_citing_train, verbose=True, stem=True, remove_digits=True)
clean_corpus(corpus_nonciting, verbose=True, stem=True, remove_digits=True)

vectorizer = TfidfVectorizer(stop_words='english', max_features=10000, sublinear_tf=True)
predicted_ranking = predict_pipeline_task1(corpus_citing_train, corpus_nonciting, vectorizer)

show_results(predicted_ranking, mapping_dict)

Basic TF-IDF matrix
Full text for citing and description for non-citing patents stemmed and digits removed.


Cleaning:   0%|          | 0/6831 [00:00<?, ?it/s]

Cleaning:   0%|          | 0/16837 [00:00<?, ?it/s]



Mean Recal@10 : 0.6633
Mean Recal@20 : 0.7457
Mean Recal@50 : 0.8326
Mean Recal@100: 0.8858
Mean Average Precision: 0.4493


In [41]:
# 4m (author's timing note, presumably wall-clock runtime of the cell)
# Variant: full text of both corpora, cleaned with the clean_corpus defaults (digits kept, no stemming)
print("Basic TF-IDF matrix")
print("Full text for citing and non-citing patents cleaned.")

corpus_citing_train = create_corpus(json_citing_train, "fulltext")
corpus_nonciting = create_corpus(json_nonciting, "fulltext")

# Cleaning happens in place; verbose=False hides the progress bars
clean_corpus(corpus_citing_train, verbose=False)
clean_corpus(corpus_nonciting, verbose=False)

vectorizer = TfidfVectorizer(stop_words='english', max_features=10000, sublinear_tf=True)
predicted_ranking = predict_pipeline_task1(corpus_citing_train, corpus_nonciting, vectorizer)

show_results(predicted_ranking, mapping_dict)

Basic TF-IDF matrix
Full text for citing and non-citing patents not cleaned.


Mean Recal@10 : 0.6592
Mean Recal@20 : 0.7442
Mean Recal@50 : 0.8300
Mean Recal@100: 0.8901
Mean Average Precision: 0.4528


In [37]:
# 3m (author's timing note, presumably wall-clock runtime of the cell)
# Remove digits in the text
# Variant: full text of both corpora, cleaned and stripped of digits (no stemming)
print("Basic TF-IDF matrix")
print("Full text for citing and non-citing patents cleaned and digits removed.")

corpus_citing_train = create_corpus(json_citing_train, "fulltext")
corpus_nonciting = create_corpus(json_nonciting, "fulltext")

# Cleaning happens in place; verbose=False hides the progress bars
clean_corpus(corpus_citing_train, verbose=False, remove_digits=True)
clean_corpus(corpus_nonciting, verbose=False, remove_digits=True)

vectorizer = TfidfVectorizer(stop_words='english', max_features=10000, sublinear_tf=True)
predicted_ranking = predict_pipeline_task1(corpus_citing_train, corpus_nonciting, vectorizer)

show_results(predicted_ranking, mapping_dict)

Basic TF-IDF matrix
Full text for citing and non-citing patents not cleaned.


Mean Recal@10 : 0.6864
Mean Recal@20 : 0.7703
Mean Recal@50 : 0.8541
Mean Recal@100: 0.9067
Mean Average Precision: 0.4778


In [66]:
# 2m20 (author's timing note, presumably wall-clock runtime of the cell)
# Remove digits in the text
# Variant: claims (citing) vs description (candidates), cleaned and stripped of digits
print("Basic TF-IDF matrix")
print("Claims for citing and description for non-citing patents cleaned and digits removed.")

corpus_citing_train = create_corpus(json_citing_train, "claims")
corpus_nonciting = create_corpus(json_nonciting, "description")

# Cleaning happens in place; verbose=False hides the progress bars
clean_corpus(corpus_citing_train, verbose=False, remove_digits=True)
clean_corpus(corpus_nonciting, verbose=False, remove_digits=True)

vectorizer = TfidfVectorizer(stop_words='english', max_features=10000, sublinear_tf=True)
predicted_ranking = predict_pipeline_task1(corpus_citing_train, corpus_nonciting, vectorizer)

show_results(predicted_ranking, mapping_dict)

Basic TF-IDF matrix
Claims for citing and description for non-citing patents cleaned and digits removed.


Mean Recal@10 : 0.6856
Mean Recal@20 : 0.7644
Mean Recal@50 : 0.8579
Mean Recal@100: 0.9104
Mean Average Precision: 0.4636


In [67]:
# 3m (author's timing note, presumably wall-clock runtime of the cell)
# Remove digits in the text
# Variant: claims (citing) vs description (candidates), cleaned, stemmed and stripped of digits
print("Basic TF-IDF matrix")
# The label mentions stemming so this run can be told apart from the unstemmed one
print("Claims for citing and description for non-citing patents cleaned, stemmed and digits removed.")

corpus_citing_train = create_corpus(json_citing_train, "claims")
corpus_nonciting = create_corpus(json_nonciting, "description")

# Cleaning happens in place; verbose=False hides the progress bars
clean_corpus(corpus_citing_train, verbose=False, remove_digits=True, stem=True)
clean_corpus(corpus_nonciting, verbose=False, remove_digits=True, stem=True)

vectorizer = TfidfVectorizer(stop_words='english', max_features=10000, sublinear_tf=True)
predicted_ranking = predict_pipeline_task1(corpus_citing_train, corpus_nonciting, vectorizer)

show_results(predicted_ranking, mapping_dict)

Basic TF-IDF matrix
Claims for citing and description for non-citing patents cleaned and digits removed.


Mean Recal@10 : 0.6938
Mean Recal@20 : 0.7754
Mean Recal@50 : 0.8654
Mean Recal@100: 0.9148
Mean Average Precision: 0.4688


In [71]:
# Store the citing and cited corpus dictionaries to save time.
# The dump below is kept commented out; uncomment it to (re)create the cache files
# from the corpora currently in memory.
# with open("datasets/corpus/task1/citing_train_fulltext_stemmed.json", "w") as file:
#     json.dump(corpus_citing_train, file)

# with open("datasets/corpus/task1/nonciting_fulltext_stemmed.json", "w") as file:
#     json.dump(corpus_nonciting, file)

# Reload the cached corpora; this overwrites the in-memory corpus_citing_train / corpus_nonciting.
# NOTE(review): the file names say "fulltext_stemmed" — confirm they were written from the
# stemmed full-text corpora and not from the claims/description corpora of the previous cell.
with open("datasets/corpus/task1/citing_train_fulltext_stemmed.json", "r") as file:
    corpus_citing_train = json.load(file)

with open("datasets/corpus/task1/nonciting_fulltext_stemmed.json", "r") as file:
    corpus_nonciting = json.load(file)

In [76]:
# Second cleaning pass (in place) on the corpora loaded from the cache, this time stripping digits.
# stem is left at its default (False) here.
clean_corpus(corpus_citing_train, remove_digits=True)
clean_corpus(corpus_nonciting, remove_digits=True)

Cleaning:   0%|          | 0/6831 [00:00<?, ?it/s]

Cleaning:   0%|          | 0/16837 [00:00<?, ?it/s]

In [77]:
# TF-IDF evaluation of whatever corpus_citing_train / corpus_nonciting currently hold
# (here: the cached corpora after the digit-removal pass of the previous cell)
vectorizer = TfidfVectorizer(stop_words='english', max_features=10000, sublinear_tf=True)
predicted_ranking = predict_pipeline_task1(corpus_citing_train, corpus_nonciting, vectorizer)

show_results(predicted_ranking, mapping_dict)



Mean Recal@10 : 0.7057
Mean Recal@20 : 0.7824
Mean Recal@50 : 0.8645
Mean Recal@100: 0.9120
Mean Average Precision: 0.4844


In [8]:
# Rebuild the full-text corpora from the raw JSON, then clean them in place
# (digits removed, stemmed). No evaluation is run in this cell.
corpus_citing_train = create_corpus(json_citing_train, "fulltext")
corpus_nonciting = create_corpus(json_nonciting, "fulltext")

clean_corpus(corpus_citing_train, remove_digits=True, stem=True)
clean_corpus(corpus_nonciting, remove_digits=True, stem=True)


Cleaning:   0%|          | 0/6831 [00:00<?, ?it/s]

Cleaning:   0%|          | 0/16837 [00:00<?, ?it/s]

# **II. Word2Vec**

In [7]:
# Optionally load cached, stemmed full-text corpora instead of rebuilding them (disabled).
# NOTE(review): both commented blocks read the same file, so the citing and the non-citing
# corpus would be identical if this were re-enabled — confirm the intended paths.
# with open("datasets/predicted_ranking/task1/fulltext_stemmed.json", "r") as file:
#     corpus_citing_train = json.load(file)

# with open("datasets/predicted_ranking/task1/fulltext_stemmed.json", "r") as file:
#     corpus_nonciting = json.load(file)

# Active path: raw (uncleaned, unstemmed) full text of both corpora
corpus_citing_train = create_corpus(json_citing_train, "fulltext")
corpus_nonciting = create_corpus(json_nonciting, "fulltext")

In [8]:
# Training data for Word2Vec: one token list per document, obtained by whitespace
# splitting, over the citing and the non-citing corpora together
w2v_sentences = []

# NOTE: despite its name, `corpus` is a single document dictionary here
for corpus in corpus_citing_train + corpus_nonciting:
    w2v_sentences.append(corpus['text'].split())

In [11]:
# 20m- (author's timing note, presumably wall-clock runtime of the cell)
print("Basic Word2Vec")
# The label names the corpora actually used (full text on both sides)
print("Full text for citing and non-citing patents.")

# NOTE(review): predict_pipeline_task1 scores with linear_kernel (a dot product) by default;
# the averaged Word2Vec vectors are not L2-normalised, so these scores are not cosine
# similarities — worth checking when interpreting the metrics of this run.
vectorizer = Word2Vec(sentences=w2v_sentences, workers=4)
predicted_ranking = predict_pipeline_task1(corpus_citing_train, corpus_nonciting, vectorizer, create_matrix=create_word2vec_matrix)

# Report the metrics once (the call used to be duplicated)
show_results(predicted_ranking, mapping_dict)



Mean Recal@10 : 0.0203
Mean Recal@20 : 0.0388
Mean Recal@50 : 0.0715
Mean Recal@100: 0.1201
Mean Average Precision: 0.0100


# **III. Word2Vec Optimization**

In [None]:
def objective(trial):
    """
    Optuna objective: train Word2Vec with the trial's hyper-parameters and score it.

    Every value registered through trial.suggest_* is forwarded to Word2Vec via
    trial.params, so enabling one of the commented suggestions below is enough to
    add it to the search space.

    Parameters:
    trial (optuna.Trial): Trial used to sample the hyper-parameters.

    Returns:
    float: Mean Recall@100 of the resulting ranking (the study maximizes it).
    """
    trial.suggest_int("vector_size", 100, 300)
    # trial.suggest_int("window", 1, 10)
    # trial.suggest_int("min_count", 1, 10)
    # trial.suggest_float("alpha", 0.025, 0.1)
    # trial.suggest_float("min_alpha", 0.0001, 0.01)
    # trial.suggest_float("sample", 0, 1e-5)

    w2v_model = Word2Vec(**trial.params, sentences=w2v_sentences, workers=4)
    ranking = predict_pipeline_task1(
        corpus_citing_train,
        corpus_nonciting,
        w2v_model,
        create_matrix=create_word2vec_matrix,
    )

    relevant, recommended, _ = get_true_and_predicted(mapping_dict, ranking)
    return mean_recall_at_k(relevant, recommended, 100)

# Search for the hyper-parameters maximizing the objective (Mean Recall@100).
# Each of the 100 trials trains a Word2Vec model and scores the full train corpus.
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)

# Best hyper-parameters found and the score they reached
print(study.best_params)
print(study.best_value)

# **IV. BM25**

In [3]:
# FIXME(review): create_bm25_matrix is not defined anywhere in the visible notebook, so this
# cell cannot run as is (the recorded output is a NameError from a fresh kernel). A
# TfidfVectorizer is passed as the vectorizer — confirm that is what the BM25 helper expects.
print("BM25")
print("Claims for citing and description for non-citing patents.")

corpus_citing_train = create_corpus(json_citing_train, "claims")
corpus_nonciting = create_corpus(json_nonciting, "description")

vectorizer = TfidfVectorizer(stop_words='english', max_features=10000, sublinear_tf=True)
predicted_ranking = predict_pipeline_task1(corpus_citing_train, corpus_nonciting, vectorizer, create_matrix=create_bm25_matrix)

show_results(predicted_ranking, mapping_dict)

BM25
Claims for citing and description for non-citing patents.


NameError: name 'create_corpus' is not defined

# **V. BERT**

In [None]:
# -m- (runtime not recorded)
print("BERT")
# The label names the corpora actually built below (full text on both sides)
print("Full text for citing and non-citing patents.")

corpus_citing_train = create_corpus(json_citing_train, "fulltext")
corpus_nonciting = create_corpus(json_nonciting, "fulltext")

# The model name takes the place of the vectorizer: the pipeline forwards it to
# create_bert_matrix as its third argument
model_name = 'bert-base-uncased'
predicted_ranking = predict_pipeline_task1(corpus_citing_train, corpus_nonciting, model_name, create_matrix=create_bert_matrix)

show_results(predicted_ranking, mapping_dict)

# **Make predictions**

In [63]:
# 2m2 - score mAP 0.471 (author's note: presumably runtime and the score obtained for this submission)
# Remove digits in the text
# Final predictions: same recipe as the cleaned, digit-free full-text TF-IDF experiment,
# applied to the TEST citing patents
print("Basic TF-IDF matrix")
print("Full text for citing and non-citing patents cleaned and remove digits.")

corpus_citing_test = create_corpus(json_citing_test, "fulltext")
corpus_nonciting = create_corpus(json_nonciting, "fulltext")

# Cleaning happens in place; verbose=False hides the progress bars
clean_corpus(corpus_citing_test, verbose=False, remove_digits=True)
clean_corpus(corpus_nonciting, verbose=False, remove_digits=True)

# Default k=100: each test citing patent gets its 100 best candidates
vectorizer = TfidfVectorizer(stop_words='english', max_features=10000, sublinear_tf=True)
predicted_ranking = predict_pipeline_task1(corpus_citing_test, corpus_nonciting, vectorizer)


Basic TF-IDF matrix
Full text for citing and non-citing patents cleaned and remove digits.


In [64]:
# Persist the ranking (citing id -> ordered list of candidate ids) as JSON.
# The predictions/task1 directory must already exist.
with open('predictions/task1/prediction1_fulltext_cleaned_nodigitnew.json', 'w') as f:
    json.dump(predicted_ranking, f)