In [1]:
import pandas as pd
import json
import nltk
import matplotlib.pyplot as plt
import re
import numpy as np

from concurrent.futures import ProcessPoolExecutor
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from tqdm.auto import tqdm

In [2]:
def load_json_data(file_path):
    with open(file_path, "r") as file:
        contents = json.load(file)
    return contents


def create_tfidf_matrix(citing_dataset, nonciting_dataset, vectorizer=TfidfVectorizer()):
    """
    Creates TF-IDF matrix for the given citing and non-citing datasets based on the specified text column.

    Parameters:
    citing_dataset (json)): DataFrame containing citing patents.
    nonciting_dataset (json): DataFrame containing non-citing patents.
    vectorizer (TfidfVectorizer, optional): TfidfVectorizer object for vectorizing text data.
                                             Defaults to TfidfVectorizer().

    Returns:
    tuple: A tuple containing TF-IDF matrices for citing and non-citing patents respectively.
           (tfidf_matrix_citing, tfidf_matrix_nonciting)
    """
    all_text = [patent['text'] for patent in citing_dataset + nonciting_dataset]

    # Vectorizing descriptions
    print("Vectorizing descriptions...")
    # tfidf_matrix = vectorizer.fit_transform(tqdm(all_text, desc="TF-IDF"))
    tfidf_matrix = vectorizer.fit_transform(all_text)

    # Since we're interested in similarities between citing and cited patents,
    # we need to split the TF-IDF matrix back into two parts
    split_index = len(citing_dataset)
    tfidf_matrix_citing = tfidf_matrix[:split_index]
    tfidf_matrix_nonciting = tfidf_matrix[split_index:]

    # Size of vocabulary
    print("Size of vocabulary:", len(vectorizer.vocabulary_))

    return tfidf_matrix_citing, tfidf_matrix_nonciting



def get_mapping_dict(mapping_df):
    """
    Creates dictionary of citing ids to non-citing id based on given dataframe (which is based on providedjson)

    Parameters:
    mapping_df (DataFrame): DataFrame containing mapping between citing and cited patents
    Returns:
    dict: dictionary of unique citing patent ids to list of cited patent ids
    """
    mapping_dict = {}

    for _, row in mapping_df.iterrows():
        key = row[0]  # Value from column 0
        value = row[2]  # Value from column 2
        if key in mapping_dict:
            mapping_dict[key].append(value)
        else:
            mapping_dict[key] = [value]

    return mapping_dict

def create_corpus(corpus, text_type):
    """
    Extracts text data from a corpus based on the specified text type.

    Parameters:
    corpus (list): List of dictionaries representing patent documents.
    text_type (str): Type of text to extract ('title', 'abstract', 'claim1', 'claims', 'description', 'fulltext').

    Returns:
    list: List of dictionaries with 'id' and 'text' keys representing each document in the corpus.
    """

    app_ids = [doc['Application_Number'] + doc['Application_Category'] for doc in corpus]

    cnt = 0 # count the number of documents without text
    texts = []  # list of texts
    ids_to_remove = []  # list of ids of documents without text, to remove them from the corpus

    if text_type == 'title':
        for doc in corpus:
            try:
                texts.append(doc['Content']['title'])
            except: # if the document does not have a title
                ids_to_remove.append(doc['Application_Number']+doc['Application_Category'])
                cnt += 1
        if cnt > 0:
            print(f"Number of documents without title: {cnt}")

    elif text_type == 'abstract':
        for doc in corpus:
            try:
                texts.append(doc['Content']['pa01'])
            except: # if the document does not have an abstract
                ids_to_remove.append(doc['Application_Number']+doc['Application_Category'])
                cnt += 1
        if cnt > 0:
            print(f"Number of documents without abstract: {cnt}")

    elif text_type == 'claim1':
        for doc in corpus:
            try:
                texts.append(doc['Content']['c-en-0001'])
            except: # if the document does not have claim 1
                ids_to_remove.append(doc['Application_Number']+doc['Application_Category'])
                cnt += 1

        if cnt > 0:
            print(f"Number of documents without claim 1: {cnt}")

    elif text_type == 'claims':
        # all the values with the key starting with 'c-en-', each element in the final list is a list of claims
        for doc in corpus:
            doc_claims = []
            for key in doc['Content'].keys():
                if key.startswith('c-en-'):
                    doc_claims.append(doc['Content'][key])
            if len(doc_claims) == 0:    # if the document does not have any claims
                ids_to_remove.append(doc['Application_Number']+doc['Application_Category'])
                cnt += 1
            else:
                doc_text_string = ' '.join(doc_claims)
                texts.append(doc_text_string)
        if cnt > 0:
            print(f"Number of documents without claims: {cnt}")

    elif text_type == 'description':
        # all the values with the key starting with 'p'
        for doc in corpus:
            doc_text = []
            for key in doc['Content'].keys():
                if key.startswith('p'):
                    doc_text.append(doc['Content'][key])
            if len(doc_text) == 0:  # if the document does not have any description
                ids_to_remove.append(doc['Application_Number']+doc['Application_Category'])
                cnt += 1
            else:
                doc_text_string = ' '.join(doc_text)
                texts.append(doc_text_string)
        if cnt > 0:
            print(f"Number of documents without description: {cnt}")

    elif text_type == 'fulltext':
        for doc in corpus:
            doc_text = list(doc['Content'].values())
            doc_text_string = ' '.join(doc_text)
            texts.append(doc_text_string)
        if cnt > 0:
            print(f"Number of documents without any text: {cnt}")

    else:
        raise ValueError("Invalid text type")

    if len(ids_to_remove) > 0:
        print(f"Removing {len(ids_to_remove)} documents without required text")
        for id_ in ids_to_remove[::-1]:
            idx = app_ids.index(id_)
            del app_ids[idx]

    # Create a list of dictionaries with app_ids and texts
    corpus_data = [{'id': app_id, 'text': text} for app_id, text in zip(app_ids, texts)]

    return corpus_data


def get_true_and_predicted(citing_to_cited_dict, recommendations_dict):
    """
    Get the true and predicted labels for the metrics calculation.

    Parameters:
    citing_to_cited_dict : dict of str : list of str
        Mapping between citing patents and the list of their cited patents
    recommendations_dict : dict of str : list of str
        Mapping between citing patents and the sorted list of recommended patents

    Returns:
    list of list
        True relevant items for each recommendation list.
    list of list
        Predicted recommended items for each recommendation list.
    int
        Number of patents not in the citation mapping
    """
    # Initialize lists to store true labels and predicted labels
    true_labels = []
    predicted_labels = []
    not_in_citation_mapping = 0

    # Iterate over the items in both dictionaries
    for citing_id in recommendations_dict.keys():
        # Check if the citing_id is present in both dictionaries
        if citing_id in citing_to_cited_dict:
            # If yes, append the recommended items from both dictionaries to the respective lists
            true_labels.append(citing_to_cited_dict[citing_id])
            predicted_labels.append(recommendations_dict[citing_id])
        else:
            not_in_citation_mapping += 1

    return true_labels, predicted_labels, not_in_citation_mapping



def mean_recall_at_k(true_labels, predicted_labels, k=10):
    """
    Calculate the mean Recall@k for a list of recommendations.

    Parameters:
    true_labels : list of list
        True relevant items for each recommendation list.
    predicted_labels : list of list
        Predicted recommended items for each recommendation list.
    k : int
        Number of recommendations to consider.

    Returns:
    float
        Mean Recall@k value.
    """
    recalls_at_k = []

    for true, pred in zip(true_labels, predicted_labels):
        # Calculate Recall@k for each recommendation list
        true_set = set(true)
        k = min(k, len(pred))
        relevant_count = sum(1 for item in pred[:k] if item in true_set)
        recalls_at_k.append(relevant_count / len(true_set))

    # Calculate the mean Recall@k
    mean_recall = sum(recalls_at_k) / len(recalls_at_k)

    return mean_recall

def mean_inv_ranking(true_labels, predicted_labels):
    """
    Calculate the mean of lists of the mean inverse rank of true relevant items
    in the lists of sorted recommended items.

    Parameters:
    true_labels : list of list
        True relevant items for each recommendation list.
    predicted_labels : list of list
        Predicted recommended items for each recommendation list.

    Returns:
    float
        Mean of lists of the mean inverse rank of true relevant items.
    """
    mean_ranks = []

    for true, pred in zip(true_labels, predicted_labels):
        # Calculate the inverse rank of true relevant items
        # in the recommendation list
        ranks = []
        for item in true:
            try:
                rank = 1 / (pred.index(item) + 1)
            except ValueError:
                rank = 0  # If item not found, assign 0
            ranks.append(rank)

        # Calculate the mean inverse rank of true relevant items
        # in the recommendation list
        mean_rank = sum(ranks) / len(ranks)
        mean_ranks.append(mean_rank)

    # Calculate the mean of the mean inverse ranks across all recommendation lists
    mean_of_mean_ranks = sum(mean_ranks) / len(mean_ranks)

    return mean_of_mean_ranks


def mean_ranking(true_labels, predicted_labels):
    """
    Calculate the mean of lists of the mean rank of true relevant items
    in the lists of sorted recommended items.

    Parameters:
    true_labels : list of list
        True relevant items for each recommendation list.
    predicted_labels : list of list
        Predicted recommended items for each recommendation list.

    Returns:
    float
        Mean of lists of the mean rank of true relevant items.
    """
    mean_ranks = []

    for true, pred in zip(true_labels, predicted_labels):
        # Calculate the rank of true relevant items
        # in the recommendation list
        ranks = []
        for item in true:
            try:
                rank = pred.index(item) + 1
            except ValueError:
                rank = len(pred)  # If item not found, assign the length of the list
            ranks.append(rank)

        # Calculate the mean rank of true relevant items
        # in the recommendation list
        mean_rank = sum(ranks) / len(ranks)
        mean_ranks.append(mean_rank)

    # Calculate the mean of the mean ranks across all recommendation lists
    mean_of_mean_ranks = sum(mean_ranks) / len(mean_ranks)

    return mean_of_mean_ranks



def mean_average_precision(true_labels, predicted_labels, k=10):
    """
    Calculate the mean Average Precision for a list of recommendations.

    Parameters:
    true_labels : list of list
        True relevant items for each recommendation list.
    predicted_labels : list of list
        Predicted recommended items for each recommendation list.
    k : int
        Number of recommendations to consider.

    Returns:
    float
        Mean Average Precision value.
    """
    average_precisions = []

    for true, pred in zip(true_labels, predicted_labels):
        # Calculate Average Precision for each recommendation list
        true_set = set(true)
        precision_at_k = []
        relevant_count = 0
        for i, item in enumerate(pred[:k]):
            if item in true_set:
                relevant_count += 1
                precision_at_k.append(relevant_count / (i + 1))
        average_precision = sum(precision_at_k) / len(true_set)
        average_precisions.append(average_precision)

    # Calculate the mean Average Precision
    mean_average_precision = sum(average_precisions) / len(average_precisions)

    return mean_average_precision

def top_k_ranks(citing, cited, cosine_similarities, k=10):
    # Create a dictionary to store the top k ranks for each citing patent
    top_k_ranks = {}
    for i, content_id in enumerate(citing):
        top_k_ranks[content_id['id']] = [cited[j]['id'] for j in np.argsort(cosine_similarities[i])[::-1][:k]]
    return top_k_ranks


In [3]:
json_citing_train = load_json_data("./datasets/Content_JSONs/Citing_2020_Cleaned_Content_12k/Citing_Train_Test/citing_TRAIN.json")
json_citing_test = load_json_data("./datasets/Content_JSONs/Citing_2020_Cleaned_Content_12k/Citing_Train_Test/citing_TEST.json")

json_nonciting = load_json_data("./datasets/Content_JSONs/Cited_2020_Uncited_2010-2019_Cleaned_Content_22k/CLEANED_CONTENT_DATASET_cited_patents_by_2020_uncited_2010-2019.json")
json_citing_to_cited = load_json_data("./datasets/Citation_JSONs/Citation_Train.json") # Citing ids are unique
json_citing_id = load_json_data("./Citing_ID_List_Test.json")

In [4]:
citing_dataset_df = pd.DataFrame(json_citing_train)
citing_dataset_df_test = pd.DataFrame(json_citing_test)
nonciting_dataset_df = pd.DataFrame(json_nonciting)
mapping_dataset_df = pd.DataFrame(json_citing_to_cited)

mapping_dict = get_mapping_dict(mapping_dataset_df)

In [5]:
mapping_dataset_df.columns = ['citing_id', "citing_claims" ,'cited_id', 'paragraphs', 'category']
mapping_dataset_df["category"].value_counts(normalize=True)

A    0.62404
X    0.37596
Name: category, dtype: float64

In [6]:
new_citing = citing_dataset_df
new_citing["citing_id"] = citing_dataset_df["Application_Number"] + citing_dataset_df["Application_Category"]
new_citing = new_citing.drop(columns=["Application_Number", "Application_Category", "Application_Date"])

new_citing_test = citing_dataset_df_test
new_citing_test["citing_id"] = citing_dataset_df_test["Application_Number"] + citing_dataset_df_test["Application_Category"]
new_citing_test = new_citing_test.drop(columns=["Application_Number", "Application_Category", "Application_Date"])


In [7]:
new_nonciting = nonciting_dataset_df
new_nonciting["citing_id"] = nonciting_dataset_df["Application_Number"] + nonciting_dataset_df["Application_Category"]
new_nonciting = new_nonciting.drop(columns=["Application_Number", "Application_Category", "Application_Date"])
new_nonciting

Unnamed: 0,Content,citing_id
0,"{'title': 'VEHICLE WITH LOCKABLE TILT SYSTEM',...",2019772B1
1,"{'title': 'UNIVERSAL JOINT', 'c-en-0001': 'A u...",2136094B1
2,"{'title': 'Apparatus, method and computer prog...",2340701A2
3,{'title': 'Method and apparatus for forming a ...,2338661A1
4,{'title': 'Heat exchanger mounting assembly in...,2339144A2
...,...,...
16832,"{'title': 'Synchronization of a split audio, v...",1995910B1
16833,{'title': 'Fluid handling unit and fluid handl...,1997557B1
16834,"{'title': 'Oil pump system for vehicle', 'c-en...",1992846B1
16835,{'title': 'Apparatus for construction of quasi...,1981195B1


In [8]:
mapping_dataset_df

Unnamed: 0,citing_id,citing_claims,cited_id,paragraphs,category
0,3712070A1,[c-en-0004],3354576A1,"[p0024, p0027, c-en-0012, c-en-0013]",A
1,3675165A1,"[c-en-0001, c-en-0002, c-en-0003, c-en-0004, c...",3336831A2,"[p0045, p0046, p0047, p0048, p0049, p0050, p00...",A
2,3599626A1,"[c-en-0002, c-en-0003, c-en-0004, c-en-0005, c...",2453448A1,"[p0029, p0030]",A
3,3705201A1,"[c-en-0001, c-en-0002, c-en-0004, c-en-0006, c...",2468433A2,"[p0011, p0012, p0013, p0014, p0015, p0016, p00...",X
4,3628210A1,"[c-en-0001, c-en-0002, c-en-0003, c-en-0004, c...",3369366A1,[pa01],A
...,...,...,...,...,...
8589,3623977A1,"[c-en-0008, c-en-0009, c-en-0010, c-en-0011, c...",2518981A1,"[p0021, p0022, p0023, p0024, p0025, p0026, p00...",A
8590,3721843A1,"[c-en-0001, c-en-0002, c-en-0003, c-en-0004, c...",3213727A1,"[p0015, p0016, p0017, p0018, p0019, p0020, p00...",X
8591,3708263A1,"[c-en-0001, c-en-0002, c-en-0003, c-en-0004, c...",3217171A1,"[pa01, p0010, p0014, p0003, p0009, p0016]",A
8592,3588557A1,"[c-en-0001, c-en-0002, c-en-0003, c-en-0004, c...",2988328A1,"[p0047, p0012]",A


In [15]:
def create_tfidf_matrix(citing_dataset, nonciting_dataset, vectorizer=TfidfVectorizer()):
    """
    Creates TF-IDF matrix for the given citing and non-citing datasets based on the specified text column.

    Parameters:
    citing_dataset (json)): DataFrame containing citing patents.
    nonciting_dataset (json): DataFrame containing non-citing patents.
    vectorizer (TfidfVectorizer, optional): TfidfVectorizer object for vectorizing text data.
                                             Defaults to TfidfVectorizer().

    Returns:
    tuple: A tuple containing TF-IDF matrices for citing and non-citing patents respectively.
           (tfidf_matrix_citing, tfidf_matrix_nonciting)
    """
    all_text = [patent['text'] for patent in citing_dataset + nonciting_dataset]

    # Vectorizing descriptions
    print("Vectorizing descriptions...")
    # tfidf_matrix = vectorizer.fit_transform(tqdm(all_text, desc="TF-IDF"))
    tfidf_matrix = vectorizer.fit_transform(all_text)

    # Since we're interested in similarities between citing and cited patents,
    # we need to split the TF-IDF matrix back into two parts
    split_index = len(citing_dataset)
    tfidf_matrix_citing = tfidf_matrix[:split_index]
    tfidf_matrix_nonciting = tfidf_matrix[split_index:]

    # Size of vocabulary
    print("Size of vocabulary:", len(vectorizer.vocabulary_))

    return tfidf_matrix_citing, tfidf_matrix_nonciting


def clean_text(text, remove_stopwords=True):
    text = text.lower()
    
    if remove_stopwords:
        stop_words = set(stopwords.words('english'))
        text = ' '.join([word for word in text.split() if word not in stop_words])

    text = re.sub(r'\(([0-9])+\)', '', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^a-z0-9\s]', '', text)

    stemmer = PorterStemmer()
    text = ' '.join([stemmer.stem(word) for word in text.split()])
    
    return text

def clean_text_2(text, remove_stopwords=True):
    text = text.lower()
    
    if remove_stopwords:
        stop_words = set(stopwords.words('english'))
        text = ' '.join([word for word in text.split() if word not in stop_words])

    text = re.sub(r'\(([0-9])+\)', '', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^a-z\s]', '', text)
    
    return text

def clean_corpus(corpus):
    for patent in tqdm(corpus, desc="Cleaning corpus"):
        patent['text'] = clean_text(patent['text'])

def clean_corpus_2(corpus):
    for patent in tqdm(corpus, desc="Cleaning corpus"):
        patent['text'] = clean_text_2(patent['text'])

def train_pipeline(corpus_citing, corpus_nonciting, cleaning_data=True):
    if cleaning_data:
        print("Cleaning data...")
        clean_corpus(corpus_citing)
        clean_corpus(corpus_nonciting)

    print("Creating TF-IDF matrix...")
    vectorizer = TfidfVectorizer(stop_words="english", max_features=10000, sublinear_tf=True)
    #vectorizer = CountVectorizer(max_features=10000, binary=True)
    tfidf_citing, tfidf_nonciting = create_tfidf_matrix(corpus_citing, corpus_nonciting, vectorizer)
    
    print("Calculating cosine similarities...")
    cosine_similarities = linear_kernel(tfidf_citing, tfidf_nonciting)

    return tfidf_citing, tfidf_nonciting, cosine_similarities

def get_true_and_predicted_task_3(citation_to_type_dict, prediction_dict):

    """
    Get the true and predicted labels for the metrics calculation.

    Parameters:
    citation_to_type_dict :
        dict of str : str
        Mapping between the concatenation of citing and cited id and their type
    prediction_dict :
        dict of str : list of str
        Mapping between the concatenation of citing and cited id and their type

    Returns:
    list of str
        True type for each citation pair.
    list of str
        Predicted type for each citation pair.
    int
        Number of pairs of citing and cited patents not in the citation mapping
    """
    # Initialize lists to store true labels and predicted labels
    true_labels = []
    predicted_labels = []
    not_in_citation_mapping = 0

    # Iterate over the items in both dictionaries
    for id in citation_to_type_dict.keys():
        # Check if the pair of citing_id and cited_id is present in both dictionaries
        true_labels.append(citation_to_type_dict[id])
        if id in prediction_dict:
            predicted_labels.append(prediction_dict[id])
        else:
            predicted_labels.append("missing_value")
            not_in_citation_mapping += 1

    return true_labels, predicted_labels, not_in_citation_mapping

In [10]:
# citing_corpus = create_corpus(json_citing_train, 'fulltext')
# nonciting_corpus = create_corpus(json_nonciting, 'fulltext')

# clean_corpus(citing_corpus)
# clean_corpus(nonciting_corpus)

# with open("datasets/citing_corpus.json", "w") as file:
#     json.dump(citing_corpus, file)


# with open("datasets/nonciting_corpus.json", "w") as file:
#     json.dump(nonciting_corpus, file)

In [16]:
with open("datasets/citing_train_fulltext_stemmed.json", "r") as file:
    citing_corpus = json.load(file)

with open("datasets/nonciting_fulltext_stemmed.json", "r") as file:
    nonciting_corpus = json.load(file)

In [17]:
clean_corpus_2(citing_corpus)
clean_corpus_2(nonciting_corpus)

Cleaning corpus:   0%|          | 0/6831 [00:00<?, ?it/s]

Cleaning corpus:   0%|          | 0/16837 [00:00<?, ?it/s]

In [11]:
# with open("datasets/citing_corpus.json", "r") as file:
#     citing_corpus = json.load(file)

# with open("datasets/nonciting_corpus.json", "r") as file:
#     nonciting_corpus = json.load(file)


In [18]:
tfidf_citing, tfidf_nonciting, cosine_similarities= train_pipeline(citing_corpus, nonciting_corpus, cleaning_data=False)

Creating TF-IDF matrix...
Vectorizing descriptions...
Size of vocabulary: 10000
Calculating cosine similarities...


In [19]:
new_citing_1 = new_citing.copy()
# add colonnes of tfidf_citing to new_citing
for i in range(tfidf_citing.shape[1]):
    new_citing_1[f"tfidf_{i}"] = tfidf_citing[:, i].toarray()


  new_citing_1[f"tfidf_{i}"] = tfidf_citing[:, i].toarray()
  new_citing_1[f"tfidf_{i}"] = tfidf_citing[:, i].toarray()
  new_citing_1[f"tfidf_{i}"] = tfidf_citing[:, i].toarray()
  new_citing_1[f"tfidf_{i}"] = tfidf_citing[:, i].toarray()
  new_citing_1[f"tfidf_{i}"] = tfidf_citing[:, i].toarray()
  new_citing_1[f"tfidf_{i}"] = tfidf_citing[:, i].toarray()
  new_citing_1[f"tfidf_{i}"] = tfidf_citing[:, i].toarray()
  new_citing_1[f"tfidf_{i}"] = tfidf_citing[:, i].toarray()
  new_citing_1[f"tfidf_{i}"] = tfidf_citing[:, i].toarray()
  new_citing_1[f"tfidf_{i}"] = tfidf_citing[:, i].toarray()
  new_citing_1[f"tfidf_{i}"] = tfidf_citing[:, i].toarray()
  new_citing_1[f"tfidf_{i}"] = tfidf_citing[:, i].toarray()
  new_citing_1[f"tfidf_{i}"] = tfidf_citing[:, i].toarray()
  new_citing_1[f"tfidf_{i}"] = tfidf_citing[:, i].toarray()
  new_citing_1[f"tfidf_{i}"] = tfidf_citing[:, i].toarray()
  new_citing_1[f"tfidf_{i}"] = tfidf_citing[:, i].toarray()
  new_citing_1[f"tfidf_{i}"] = tfidf_cit

In [20]:
new_nonciting_1 = new_nonciting.copy()
# add colonnes of tfidf_nonciting to new_nonciting
for i in range(tfidf_nonciting.shape[1]):
    new_nonciting_1[f"tfidf_{i}"] = tfidf_nonciting[:, i].toarray()

  new_nonciting_1[f"tfidf_{i}"] = tfidf_nonciting[:, i].toarray()
  new_nonciting_1[f"tfidf_{i}"] = tfidf_nonciting[:, i].toarray()
  new_nonciting_1[f"tfidf_{i}"] = tfidf_nonciting[:, i].toarray()
  new_nonciting_1[f"tfidf_{i}"] = tfidf_nonciting[:, i].toarray()
  new_nonciting_1[f"tfidf_{i}"] = tfidf_nonciting[:, i].toarray()
  new_nonciting_1[f"tfidf_{i}"] = tfidf_nonciting[:, i].toarray()
  new_nonciting_1[f"tfidf_{i}"] = tfidf_nonciting[:, i].toarray()
  new_nonciting_1[f"tfidf_{i}"] = tfidf_nonciting[:, i].toarray()
  new_nonciting_1[f"tfidf_{i}"] = tfidf_nonciting[:, i].toarray()
  new_nonciting_1[f"tfidf_{i}"] = tfidf_nonciting[:, i].toarray()
  new_nonciting_1[f"tfidf_{i}"] = tfidf_nonciting[:, i].toarray()
  new_nonciting_1[f"tfidf_{i}"] = tfidf_nonciting[:, i].toarray()
  new_nonciting_1[f"tfidf_{i}"] = tfidf_nonciting[:, i].toarray()
  new_nonciting_1[f"tfidf_{i}"] = tfidf_nonciting[:, i].toarray()
  new_nonciting_1[f"tfidf_{i}"] = tfidf_nonciting[:, i].toarray()
  new_nonc

In [34]:
new_nonciting_1.head(3)

Unnamed: 0,Content,citing_id,tfidf_0,tfidf_1,tfidf_2,tfidf_3,tfidf_4,tfidf_5,tfidf_6,tfidf_7,...,tfidf_9990,tfidf_9991,tfidf_9992,tfidf_9993,tfidf_9994,tfidf_9995,tfidf_9996,tfidf_9997,tfidf_9998,tfidf_9999
0,"{'title': 'VEHICLE WITH LOCKABLE TILT SYSTEM',...",2019772B1,0.0,0.0,0.0,0.063991,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"{'title': 'UNIVERSAL JOINT', 'c-en-0001': 'A u...",2136094B1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"{'title': 'Apparatus, method and computer prog...",2340701A2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [67]:
cosine = pd.DataFrame(cosine_similarities)
cosine.columns = new_nonciting_1["citing_id"]
cosine.rename(index=new_citing_1["citing_id"], inplace=True)

In [73]:
cosine

citing_id,2019772B1,2136094B1,2340701A2,2338661A1,2339144A2,2338662A1,2341324A2,2351911A2,2365310A1,2345872A2,...,1998527B1,1992453B1,1992453B9,1992705B1,1953009B1,1995910B1,1997557B1,1992846B1,1981195B1,1952974B1
3650293A1,0.228938,0.112770,0.151462,0.083379,0.143045,0.086985,0.086460,0.118157,0.119990,0.088165,...,0.096988,0.179477,0.181562,0.075009,0.087107,0.098114,0.141589,0.195152,0.068176,0.141172
3694265A1,0.122908,0.056833,0.184009,0.088104,0.096442,0.088657,0.113520,0.071558,0.169631,0.145089,...,0.136324,0.066789,0.067564,0.079083,0.064880,0.248056,0.085341,0.081504,0.143308,0.105979
3623522A1,0.179695,0.116731,0.177020,0.137543,0.153385,0.137106,0.161026,0.140914,0.181832,0.149589,...,0.142009,0.165177,0.167156,0.114903,0.144951,0.133642,0.163818,0.201904,0.099166,0.180001
3611147A1,0.080177,0.047213,0.094302,0.089608,0.117405,0.092758,0.140782,0.092779,0.097565,0.052858,...,0.064319,0.055806,0.056453,0.100630,0.073124,0.060050,0.095621,0.066081,0.080965,0.158284
3640454A1,0.219064,0.098952,0.147657,0.172062,0.240100,0.168596,0.209136,0.211321,0.239601,0.157718,...,0.090770,0.160102,0.161980,0.087683,0.134558,0.120147,0.127706,0.214905,0.100136,0.189784
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3741587A1,0.212990,0.085541,0.233180,0.153385,0.115285,0.156872,0.181157,0.127150,0.214315,0.133282,...,0.138851,0.107153,0.108398,0.122408,0.185288,0.216261,0.133533,0.129026,0.151991,0.136837
3693248A1,0.214240,0.233782,0.093999,0.070372,0.212348,0.079958,0.175617,0.240875,0.123549,0.070736,...,0.072890,0.252932,0.255942,0.069350,0.179721,0.070883,0.193109,0.210317,0.056038,0.205443
3628962A1,0.177513,0.138011,0.209364,0.141442,0.285933,0.137800,0.321487,0.247391,0.235714,0.168073,...,0.128917,0.144267,0.145942,0.106929,0.157713,0.160803,0.145583,0.193571,0.116131,0.188844
3620899A1,0.114216,0.084606,0.124927,0.068117,0.122315,0.069139,0.116654,0.114297,0.120500,0.120678,...,0.111794,0.095177,0.096332,0.105206,0.132009,0.097897,0.140401,0.100502,0.097947,0.128987


In [85]:
# new_citing_test_1 = new_citing_test.copy()
# # add colonnes of tfidf_citing to new_citing
# for i in range(tfidf_citing.shape[1]):
#     new_citing_test_1[f"tfidf_{i}"] = tfidf_citing[:, i].toarray()

In [78]:
# couples_citing_nonciting from mapping_dataset_df

couples_citing_nonciting = mapping_dataset_df[["citing_id", "cited_id", "category"]]

# merge new_citing_1 and couples_citing_nonciting on citing_id
df = pd.merge(couples_citing_nonciting, new_citing_1, left_on="citing_id", right_on="citing_id", how="inner")

# merge df and new_nonciting_1 on cited_id
df = pd.merge(df, new_nonciting_1, left_on="cited_id", right_on="citing_id", how="inner")

# add cosine similarity to df en recuprant les noms de colonnes et les valeurs de citing_id de cosine
df["cosine"] = df.apply(lambda row: cosine[cosine.index == row["citing_id_x"]][row["cited_id"]].values[0], axis=1)

df.head(1)


Unnamed: 0,citing_id_x,cited_id,category,Content_x,tfidf_0_x,tfidf_1_x,tfidf_2_x,tfidf_3_x,tfidf_4_x,tfidf_5_x,...,tfidf_9991_y,tfidf_9992_y,tfidf_9993_y,tfidf_9994_y,tfidf_9995_y,tfidf_9996_y,tfidf_9997_y,tfidf_9998_y,tfidf_9999_y,cosine
0,3712070A1,3354576A1,A,{'title': 'SHOESTRING ENVIRONMENTAL CONTROL SY...,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.779607


In [80]:
X = df.drop(columns=["citing_id_x", "citing_id_y", "cited_id", "category", "Content_x", "Content_y"])
y = df["category"]

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [81]:
#convert X to 1 and A to 0
y_train = y_train.replace("A", 0)
y_train = y_train.replace("X", 1)

y_test = y_test.replace("A", 0)
y_test = y_test.replace("X", 1)

In [82]:
from xgboost import XGBClassifier
from piml.models import ReluDNNClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score



clf = XGBClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred)

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

#balance accuracy

from sklearn.metrics import balanced_accuracy_score 
balanced_accuracy_score(y_test, y_pred)


              precision    recall  f1-score   support

           0       0.68      0.83      0.75      1041
           1       0.61      0.41      0.49       678

    accuracy                           0.66      1719
   macro avg       0.65      0.62      0.62      1719
weighted avg       0.65      0.66      0.65      1719



0.6194860285804153

In [83]:
# value counts of y_pred
pd.Series(y_pred).value_counts(normalize=True)

0    0.737638
1    0.262362
dtype: float64

-------

In [89]:
del tfidf_citing
del tfidf_nonciting
del cosine_similarities
del new_citing_1
del new_nonciting_1
del couples_citing_nonciting

NameError: name 'df' is not defined

In [94]:
# open le fichier json dans prediction/task1/prediction1_desc_44.json

with open("predictions/task1/prediction1_fulltext_nodigit471.json", "r") as file:
    predictions = json.load(file)


In [95]:

# faire tous les couples citing_id_cited_id dans prediction ou citing_id est prediction.keys(i) et cited_id est prediction.values(j)

couples = []
for i in predictions.keys():
    for j in predictions[i]:
        couples.append([i, j])

In [96]:
len(couples)

100000

In [97]:
tfidf_citing_test, tfidf_nonciting, cosine_similarities= train_pipeline(create_corpus(json_citing_test, 'claims'), create_corpus(json_nonciting, 'fulltext'))

Cleaning data...


Cleaning corpus:   0%|          | 0/1000 [00:00<?, ?it/s]

Cleaning corpus:   0%|          | 0/16837 [00:00<?, ?it/s]

Creating TF-IDF matrix...
Vectorizing descriptions...
Size of vocabulary: 10000
Calculating cosine similarities...


In [98]:
new_nonciting_1 = new_nonciting.copy()
# add colonnes of tfidf_nonciting to new_nonciting
for i in range(tfidf_nonciting.shape[1]):
    new_nonciting_1[f"tfidf_{i}"] = tfidf_nonciting[:, i].toarray()

In [99]:
new_citing_test_1 = new_citing_test.copy()
# add colonnes of tfidf_citing to new_citing
for i in range(tfidf_citing_test.shape[1]):
    new_citing_test_1[f"tfidf_{i}"] = tfidf_citing_test[:, i].toarray()

In [None]:
del tfidf_citing_test
del tfidf_nonciting


In [100]:
cosine = pd.DataFrame(cosine_similarities)
cosine.columns = new_nonciting_1["citing_id"]
cosine.rename(index=new_citing_test_1["citing_id"], inplace=True)

In [101]:
df1 = pd.DataFrame(couples, columns=["citing_id", "cited_id"])
df1["couples"] = df1["citing_id"].astype(str) +"_"+ df1["cited_id"].astype(str)

df1

Unnamed: 0,citing_id,cited_id,couples
0,3708804A1,2905477A1,3708804A1_2905477A1
1,3708804A1,2169237A2,3708804A1_2169237A2
2,3708804A1,3321489A1,3708804A1_3321489A1
3,3708804A1,3324019A1,3708804A1_3324019A1
4,3708804A1,1952029B1,3708804A1_1952029B1
...,...,...,...
99995,3756569A1,2599452A1,3756569A1_2599452A1
99996,3756569A1,2823774A2,3756569A1_2823774A2
99997,3756569A1,2614784A2,3756569A1_2614784A2
99998,3756569A1,3412225A1,3756569A1_3412225A1


In [102]:
df1 = pd.merge(df1, new_citing_test_1, left_on="citing_id", right_on="citing_id", how="inner")
df1 = pd.merge(df1, new_nonciting_1, left_on="cited_id", right_on="citing_id", how="inner")
df1["cosine"] = df1.apply(lambda row: cosine[cosine.index == row["citing_id_x"]][row["cited_id"]].values[0], axis=1)

df1.head(1)

: 

In [120]:
# faire les predictions avec le modele clf sur df1 et sortir les sous la forme "3591192A1_1731734B1": "A",
predictions1 = clf.predict(df1.drop(columns=["citing_id_x", "citing_id_y", "cited_id", "couples", "Content_x", "Content_y"]))

predictions1

: 

In [None]:
predictions1 = pd.Series(predictions1).replace(0, "A")
predictions1 = pd.Series(predictions1).replace(1, "X")



# repartition des predictions


pd.Series(predictions1).value_counts(normalize=True)



X    0.510768
A    0.489232
dtype: float64

In [44]:
predictions1 = pd.DataFrame(predictions1, columns=["category"])
predictions1["couples"] = df1["couples"]
predictions1 = predictions1.set_index("couples")
predictions1 = predictions1.to_dict()["category"]

In [45]:
# couples_citing_nonciting from mapping_dataset_df

couples_citing_nonciting = mapping_dataset_df[["citing_id", "cited_id", "category"]]

# merge new_citing_1 and couples_citing_nonciting on citing_id
df = pd.merge(couples_citing_nonciting, new_citing_1, left_on="citing_id", right_on="citing_id", how="inner")

# merge df and new_nonciting_1 on cited_id
df = pd.merge(df, new_nonciting_1, left_on="cited_id", right_on="citing_id", how="inner")

df.head(1)

NameError: name 'new_citing_1' is not defined

In [46]:
df["category"].value_counts(normalize=True)

A    0.62404
X    0.37596
Name: category, dtype: float64

In [47]:
with open('predictions/task3/prediction3_k100_nodigit.json', 'w') as f:
    json.dump(predictions1, f)