In [None]:
import random
import numpy as np
import pandas as pd
import tracemalloc
# Begin tracing Python memory allocations for the rest of the session.
tracemalloc.start()

# Fix the seed of both NumPy's global RNG and the stdlib RNG (same value for each).
np.random.seed(3311791)
random.seed(3311791)

# Load the patent table, discard column 'H' and store 'Year' as integers.
df = (
    pd.read_csv('/kaggle/input/df-lemma-clean/df_lemma_dropped.csv')
    .drop(columns='H')
    .astype({'Year': int})
)

In [None]:
# Install third-party packages into the notebook environment via shell commands.
# No versions are pinned, so the versions that get installed depend on when this cell is run.
!pip install umap-learn hdbscan
!pip install BorutaShap
!pip install xlsxwriter
!pip install adjustText
from adjustText import adjust_text
from BorutaShap import BorutaShap
import umap
import hdbscan
import numpy as np
import matplotlib.pyplot as plt



In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import lil_matrix, csr_matrix
from itertools import combinations
from collections import defaultdict

# Family numbers are handled as strings from here on.
df['Family.Number'] = df['Family.Number'].astype(str)

# (first_year, last_year) windows; the first and last window are bounded by
# the earliest and latest year present in df.
time_periods = [
    (min(df['Year']), 1990),
    (1991, 2008),
    (2009, 2016),
    (2017, max(df['Year'])),
]

def get_tfidf_dummy(patent_df, max_df=0.95, tfidf_threshold_percentile=50, min_patent_cooc=2, max_feat=None, ngrams=(1,3)):
    '''
    Build a binary patent x term-pair co-occurrence table from TF-IDF features.

    Pipeline:
      1. Fit a TfidfVectorizer (English stop words, min_df=2) on patent_df['Text_Lemma_unlist'].
      2. Binarise the TF-IDF matrix, optionally keeping only weights at or above a
         percentile of its non-zero entries.
      3. Keep the terms that are still present in at least min_patent_cooc patents.
      4. For every patent, mark each pair of its remaining terms, skipping pairs in
         which one term is a (character-level) substring of the other.
      5. Keep the pairs that co-occur in at least min_patent_cooc patents.

    Parameters:
      patent_df: DataFrame with a 'Text_Lemma_unlist' text column and a 'Year'
          column ('Year' is only used in the progress messages).
      max_df: passed to TfidfVectorizer as max_df.
      tfidf_threshold_percentile: percentile of non-zero TF-IDF entries to be considered
          for co-occurrence analysis; None keeps every non-zero entry.
      min_patent_cooc: how often terms / word combinations need to occur in different
          patents to be considered for further analysis (min_df=2 already in TFIDF).
      max_feat: passed to TfidfVectorizer as max_features.
      ngrams: passed to TfidfVectorizer as ngram_range.

    Returns:
      pandas.DataFrame of 0.0/1.0 values with one row per row of patent_df (same
      order, but with a fresh default index) and one column per retained pair,
      named 'term_a+term_b' with the two terms in sorted order.
    '''
    tfidf_vectorizer = TfidfVectorizer(ngram_range=ngrams,
        max_df=max_df, min_df=2, stop_words="english", max_features=max_feat)

    tfidf = tfidf_vectorizer.fit_transform(patent_df['Text_Lemma_unlist'])
    tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()

    if tfidf_threshold_percentile is not None:
        # percentile of nonzero entries in TF-IDF matrix
        tfidf_threshold = np.percentile(tfidf.data, tfidf_threshold_percentile)
        tfidf.data[tfidf.data < tfidf_threshold] = 0
        tfidf.eliminate_zeros()
        tfidf.data[tfidf.data >= tfidf_threshold] = 1
    else:
        tfidf.data[tfidf.data > 0] = 1
        tfidf.eliminate_zeros()

    print(f'full len(tfidf_feature_names): {len(tfidf_feature_names)}')
    # Keep the terms present in at least min_patent_cooc patents. Zeros were
    # eliminated above, so the stored-entry count per column is the patent count;
    # one vectorised call replaces slicing the matrix column by column.
    patents_per_term = np.asarray(tfidf.getnnz(axis=0)).ravel()
    cols_to_keep = np.flatnonzero(patents_per_term >= min_patent_cooc)
    tfidf = tfidf[:, cols_to_keep].tocsr()

    # Also keep the corresponding feature names
    tfidf_feature_names = [tfidf_feature_names[i] for i in cols_to_keep]
    print(f'trimmed len(tfidf_feature_names): {len(tfidf_feature_names)}')

    n_patents = tfidf.shape[0]
    n_terms = len(tfidf_feature_names)
    indptr, indices = tfidf.indptr, tfidf.indices

    # pair name -> column position, assigned in order of first appearance.
    # Only pairs are registered here: seeding this mapping with the single-term
    # names would shift every pair column by n_terms and push the last ones
    # outside an n_terms*(n_terms-1)/2 wide container (IndexError).
    pair_columns = {}
    entry_rows, entry_cols = [], []
    for patent_idx in range(n_patents):
        # Terms present in this patent, read straight from the CSR row slice.
        patent_terms = [tfidf_feature_names[i]
                        for i in indices[indptr[patent_idx]:indptr[patent_idx + 1]]]
        for first, second in combinations(patent_terms, 2):
            # Skip pairs where one term is contained in the other (e.g. an
            # n-gram and one of its own words).
            if first in second or second in first:
                continue
            # Sorting makes the column name independent of the order in the row.
            column_name = '+'.join(sorted((first, second)))
            entry_rows.append(patent_idx)
            entry_cols.append(pair_columns.setdefault(column_name, len(pair_columns)))

    # Assemble the sparse indicator matrix in one step instead of cell by cell.
    tfidf_dummy = csr_matrix(
        (np.ones(len(entry_rows)),
         (np.asarray(entry_rows, dtype=np.intp), np.asarray(entry_cols, dtype=np.intp))),
        shape=(n_patents, len(pair_columns)))
    # Duplicate (row, col) entries are summed on construction; force a 0/1 indicator.
    tfidf_dummy.data[:] = 1

    col_sums = np.asarray(tfidf_dummy.sum(axis=0)).ravel()

    # The logged figure is the number of candidate pairs among the kept terms
    # (n choose 2), the same value this message has always reported.
    n_candidate_pairs = n_terms * (n_terms - 1) // 2
    print(f'N word pairs in {min(patent_df["Year"])}-{max(patent_df["Year"])}: {n_candidate_pairs}')
    cols_to_retain = np.flatnonzero(col_sums >= min_patent_cooc)
    print(f'N columns to retain in {min(patent_df["Year"])}-{max(patent_df["Year"])} when filtering out co-occurrences in less than {min_patent_cooc} patents: {len(cols_to_retain)}')

    # Dict insertion order equals column position, so a list gives O(1) name lookup.
    pair_names = list(pair_columns)
    return pd.DataFrame(tfidf_dummy[:, cols_to_retain].toarray(),
                        columns=[pair_names[i] for i in cols_to_retain])


# Settings shared by every time period. n_components and n_try are not used in
# this cell; they are still assigned so the notebook namespace stays the same.
max_df = 0.5
max_feat = None
tfidf_threshold_percentile = 99
n_components = 2
n_try = 1
min_patent_cooc = 2

# Build and save one co-occurrence table per time period.
for i, (start, stop) in enumerate(time_periods):
    # Patents whose year lies in [start, stop], both ends included.
    df_time_period = df[df['Year'].between(start, stop)]
    print(f'{start}-{stop}')

    tfidf_dummy = get_tfidf_dummy(
        df_time_period,
        max_df=max_df,
        max_feat=max_feat,
        min_patent_cooc=min_patent_cooc,
        tfidf_threshold_percentile=tfidf_threshold_percentile,
    )
    tfidf_dummy.to_csv(f'tfidf_dummy_{start}_{stop}_maxdf{max_df}_tfidfperc{tfidf_threshold_percentile}.csv')

1991-2008
full len(tfidf_feature_names): 379563
trimmed len(tfidf_feature_names): 3960
N word pairs in 1991-2008: 7838820
N columns to retain in 1991-2008 when filtering out co-occurrences in less than 2 patents: 4038
1920-1990
full len(tfidf_feature_names): 53341
trimmed len(tfidf_feature_names): 456
N word pairs in 1920-1990: 103740
N columns to retain in 1920-1990 when filtering out co-occurrences in less than 2 patents: 207
2009-2016
full len(tfidf_feature_names): 738115
trimmed len(tfidf_feature_names): 7642
N word pairs in 2009-2016: 29196261
N columns to retain in 2009-2016 when filtering out co-occurrences in less than 2 patents: 8462
2017-2023
full len(tfidf_feature_names): 825368
trimmed len(tfidf_feature_names): 8361
N word pairs in 2017-2023: 34948980
N columns to retain in 2017-2023 when filtering out co-occurrences in less than 2 patents: 10104
