
# Method description
Step 1. We first build the numerical representation of the
data. Depending on the structure of the data, such a representation
can be constructed using either: a) metadata in categorical format (in the case of USPTO patent data, the metadata consists of the
CPC categories of the patents, which describe the specific scientific area
to which a patent is submitted), or b) the textual data of the abstracts. The BERT model for embedding was downloaded from https://github.com/google/patents-public-data/blob/master/models/BERT%20for%20Patents.md

Step 2. Using the high-dimensional numerical data obtained in the first step, we then build a low-dimensional representation by
applying dimensionality reduction methods. Several methods can be applied at this stage, including tSNE and diffusion maps.

Step 3. Furthermore, clustering methods are applied to the high- and low-dimensional representations of the data. The resulting data can also be analyzed further using methods from mobility data analysis and the general stochastic-process formalism adapted from https://arxiv.org/abs/2302.13054

## Load data
We analyze data from the open USPTO dataset https://www.uspto.gov/ip-policy/economic-research/research-datasets

# Step 1. Embedding methods and numerical representation of data
We use the patentBERT model for producing the numerical representation of the textual dataset. More details of such embedding methods are described in https://arxiv.org/abs/1906.02124

In [None]:
import pandas as pd

# Read the embedding CSV through Spark (first row is the header, column types
# are inferred) and materialize the result as a pandas DataFrame.
df = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("../embeddingoutput.csv")
)
pandasdf = df.toPandas()

# Discard the leading column so that only the embedding components remain
# (presumably it is the row index written by the CSV export -- TODO confirm).
leading_column = pandasdf.columns[0]
pandasdf.drop(columns=leading_column, inplace=True)

pandasdf.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,984,985,986,987,988,989,990,991,992,993,994,995,996,997,998,999,1000,1001,1002,1003,1004,1005,1006,1007,1008,1009,1010,1011,1012,1013,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,-0.069363,-0.10472,0.170673,-0.438419,-0.144168,0.681036,0.138129,0.301038,-0.292081,-0.606854,-0.288766,-1.054059,0.518129,0.510559,0.839489,0.380828,-0.825546,-0.715169,-0.330739,-0.704941,-0.001159,0.430229,1.206324,-0.212817,-0.36591,-0.122769,-0.775084,-0.870044,-0.121482,-0.446669,-0.976716,0.42765,0.283998,-0.360515,0.521507,-0.116353,-0.32761,0.534353,-0.347815,1.123636,...,-0.89619,0.101728,0.479426,0.927935,-0.358585,-0.617793,0.090672,0.585063,0.848197,0.186102,-0.517821,0.6419,-0.419397,0.469247,0.49593,-0.379151,0.097631,-0.220659,-0.611092,-0.274693,0.030647,-0.170343,-0.605823,0.76179,-0.336923,-0.204806,0.340841,-0.268847,0.034237,0.553283,0.283917,0.142959,0.333324,-0.097232,0.338291,0.317263,-0.136278,0.288835,-0.5414,-0.360672
1,-0.657632,-0.660423,-0.32936,-0.422629,-0.221532,0.424544,0.580698,-0.144818,-0.506951,-0.482279,0.073071,-0.726611,-0.017608,-0.101514,0.552301,0.858862,-1.224141,0.458215,-0.262847,-0.255457,-0.051434,0.467641,1.20367,-0.255218,0.12427,-0.396945,-0.351141,-0.833173,0.525649,-0.311577,0.090772,0.28354,0.119979,-0.668585,0.114054,-0.191174,-0.164651,0.3587,0.736736,0.608984,...,0.298056,0.048169,-0.074115,0.855455,-0.319563,-0.074175,-0.00867,0.299647,-0.262448,0.100508,-0.202112,0.57025,-0.681673,0.512238,-0.073582,-0.077811,0.505413,0.325041,-0.056031,0.186717,-0.050719,-0.524433,-0.537832,0.029255,-0.252963,-0.41654,0.355757,-0.231939,-0.634832,0.624658,-0.323671,0.895678,0.047849,-0.020366,0.068159,-0.267707,0.215926,-0.759468,-0.143991,-0.440908
2,-0.657632,-0.660423,-0.32936,-0.422629,-0.221532,0.424544,0.580698,-0.144818,-0.506951,-0.482279,0.073071,-0.726611,-0.017608,-0.101514,0.552301,0.858862,-1.224141,0.458215,-0.262847,-0.255457,-0.051434,0.467641,1.20367,-0.255218,0.12427,-0.396945,-0.351141,-0.833173,0.525649,-0.311577,0.090772,0.28354,0.119979,-0.668585,0.114054,-0.191174,-0.164651,0.3587,0.736736,0.608984,...,0.298056,0.048169,-0.074115,0.855455,-0.319563,-0.074175,-0.00867,0.299647,-0.262448,0.100508,-0.202112,0.57025,-0.681673,0.512238,-0.073582,-0.077811,0.505413,0.325041,-0.056031,0.186717,-0.050719,-0.524433,-0.537832,0.029255,-0.252963,-0.41654,0.355757,-0.231939,-0.634832,0.624658,-0.323671,0.895678,0.047849,-0.020366,0.068159,-0.267707,0.215926,-0.759468,-0.143991,-0.440908
3,-0.657632,-0.660423,-0.32936,-0.422629,-0.221532,0.424544,0.580698,-0.144818,-0.506951,-0.482279,0.073071,-0.726611,-0.017608,-0.101514,0.552301,0.858862,-1.224141,0.458215,-0.262847,-0.255457,-0.051434,0.467641,1.20367,-0.255218,0.12427,-0.396945,-0.351141,-0.833173,0.525649,-0.311577,0.090772,0.28354,0.119979,-0.668585,0.114054,-0.191174,-0.164651,0.3587,0.736736,0.608984,...,0.298056,0.048169,-0.074115,0.855455,-0.319563,-0.074175,-0.00867,0.299647,-0.262448,0.100508,-0.202112,0.57025,-0.681673,0.512238,-0.073582,-0.077811,0.505413,0.325041,-0.056031,0.186717,-0.050719,-0.524433,-0.537832,0.029255,-0.252963,-0.41654,0.355757,-0.231939,-0.634832,0.624658,-0.323671,0.895678,0.047849,-0.020366,0.068159,-0.267707,0.215926,-0.759468,-0.143991,-0.440908
4,-0.35812,-0.280004,0.694134,-0.69291,-0.084421,0.513281,0.398551,0.149493,-0.185156,-0.650438,0.601124,-0.593182,0.537901,-0.233282,0.73201,0.381555,-0.939021,0.513529,0.001086,-0.420366,-0.34152,0.496589,0.77829,-0.412515,-0.668345,-0.63579,-0.447815,-0.579105,0.259927,-0.106404,-0.337793,0.412689,0.053856,-0.828441,1.301035,-0.122319,-0.019628,0.436147,0.184037,0.534072,...,0.446321,-0.604563,0.396821,0.754534,-0.336379,-0.851278,-0.010447,0.07659,0.020918,-0.025492,-0.178903,0.396745,-0.424331,0.257488,-0.264725,-0.335262,-0.202762,-0.009124,-0.510991,0.175919,-0.16464,-0.518945,-0.475701,0.189043,-0.166743,0.162045,0.465138,0.265746,0.318009,0.288605,-0.200721,0.63704,0.132071,-0.908206,-0.541148,-0.807207,0.319228,-0.896838,-0.064141,-0.54129


# Load all data from final embeddings and categories from notebook

In [None]:

# Read the final embeddings (with patent metadata and CPC categories) from DBFS
# via Spark; the first row is the header and column types are inferred.
final_embedding = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("dbfs:/mnt/patentanalysis/USPTO/MLModel/finalembeddingoutput.csv")
)

# Collect the Spark DataFrame into a pandas DataFrame for local analysis.
usptodf = final_embedding.toPandas()

In [None]:
# Preview the first rows of the combined table (embedding columns followed by
# the patent metadata columns).
usptodf.head()

Unnamed: 0,_c0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,...,1001,1002,1003,1004,1005,1006,1007,1008,1009,1010,1011,1012,1013,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023,Patent Number,Application Number,Filing Date,Grant Date,Entity Status,Application Status Category,Application Status Date,type,abstract,title,kind,num_claims,time to abandonment,time to abd since filing,subgroup_id,group_id,CPC_Definition
0,0,-0.069363,-0.10472,0.170673,-0.438419,-0.144168,0.681036,0.138129,0.301038,-0.292081,-0.606854,-0.288766,-1.054059,0.518129,0.510559,0.839489,0.380828,-0.825546,-0.715169,-0.330739,-0.704941,-0.001159,0.430229,1.206324,-0.212817,-0.36591,-0.122769,-0.775084,-0.870044,-0.121482,-0.446669,-0.976716,0.42765,0.283998,-0.360515,0.521507,-0.116353,-0.32761,0.534353,-0.347815,...,-0.220659,-0.611092,-0.274693,0.030647,-0.170343,-0.605823,0.76179,-0.336923,-0.204806,0.340841,-0.268847,0.034237,0.553283,0.283917,0.142959,0.333324,-0.097232,0.338291,0.317263,-0.136278,0.288835,-0.5414,-0.360672,6347763,9751925,1/2/2000,2002-02-19,UNDISCOUNTED,Patent Expired Due to NonPayment of Maintenanc...,2010-03-22,utility,An active damping method and a self-contained ...,System and method for reducing dispersion of s...,B1,10,2953,3732,F42B10/661,F42B,Explosive Charges
1,1,-0.657632,-0.660423,-0.32936,-0.422629,-0.221532,0.424544,0.580698,-0.144818,-0.506951,-0.482279,0.073071,-0.726611,-0.017608,-0.101514,0.552301,0.858862,-1.224141,0.458215,-0.262847,-0.255457,-0.051434,0.467641,1.20367,-0.255218,0.12427,-0.396945,-0.351141,-0.833173,0.525649,-0.311577,0.090772,0.28354,0.119979,-0.668585,0.114054,-0.191174,-0.164651,0.3587,0.736736,...,0.325041,-0.056031,0.186717,-0.050719,-0.524433,-0.537832,0.029255,-0.252963,-0.41654,0.355757,-0.231939,-0.634832,0.624658,-0.323671,0.895678,0.047849,-0.020366,0.068159,-0.267707,0.215926,-0.759468,-0.143991,-0.440908,6345753,9647774,1/2/2000,2002-02-12,UNDISCOUNTED,Patent Expired Due to NonPayment of Maintenanc...,2014-03-10,utility,A vertical strip storage system has a support ...,Vertical belt storage system,B1,3,4409,5181,B21C49/00,B21C,"Manufacture Of Metal Sheets, Wire, Rods, Tubes..."
2,2,-0.657632,-0.660423,-0.32936,-0.422629,-0.221532,0.424544,0.580698,-0.144818,-0.506951,-0.482279,0.073071,-0.726611,-0.017608,-0.101514,0.552301,0.858862,-1.224141,0.458215,-0.262847,-0.255457,-0.051434,0.467641,1.20367,-0.255218,0.12427,-0.396945,-0.351141,-0.833173,0.525649,-0.311577,0.090772,0.28354,0.119979,-0.668585,0.114054,-0.191174,-0.164651,0.3587,0.736736,...,0.325041,-0.056031,0.186717,-0.050719,-0.524433,-0.537832,0.029255,-0.252963,-0.41654,0.355757,-0.231939,-0.634832,0.624658,-0.323671,0.895678,0.047849,-0.020366,0.068159,-0.267707,0.215926,-0.759468,-0.143991,-0.440908,6345753,9647774,1/2/2000,2002-02-12,UNDISCOUNTED,Patent Expired Due to NonPayment of Maintenanc...,2014-03-10,utility,A vertical strip storage system has a support ...,Vertical belt storage system,B1,3,4409,5181,B65H20/34,B65H,Handling Thin Or Filamentary Material
3,3,-0.657632,-0.660423,-0.32936,-0.422629,-0.221532,0.424544,0.580698,-0.144818,-0.506951,-0.482279,0.073071,-0.726611,-0.017608,-0.101514,0.552301,0.858862,-1.224141,0.458215,-0.262847,-0.255457,-0.051434,0.467641,1.20367,-0.255218,0.12427,-0.396945,-0.351141,-0.833173,0.525649,-0.311577,0.090772,0.28354,0.119979,-0.668585,0.114054,-0.191174,-0.164651,0.3587,0.736736,...,0.325041,-0.056031,0.186717,-0.050719,-0.524433,-0.537832,0.029255,-0.252963,-0.41654,0.355757,-0.231939,-0.634832,0.624658,-0.323671,0.895678,0.047849,-0.020366,0.068159,-0.267707,0.215926,-0.759468,-0.143991,-0.440908,6345753,9647774,1/2/2000,2002-02-12,UNDISCOUNTED,Patent Expired Due to NonPayment of Maintenanc...,2014-03-10,utility,A vertical strip storage system has a support ...,Vertical belt storage system,B1,3,4409,5181,B65G21/14,B65G,Transport Or Storage Devices
4,4,-0.35812,-0.280004,0.694134,-0.69291,-0.084421,0.513281,0.398551,0.149493,-0.185156,-0.650438,0.601124,-0.593182,0.537901,-0.233282,0.73201,0.381555,-0.939021,0.513529,0.001086,-0.420366,-0.34152,0.496589,0.77829,-0.412515,-0.668345,-0.63579,-0.447815,-0.579105,0.259927,-0.106404,-0.337793,0.412689,0.053856,-0.828441,1.301035,-0.122319,-0.019628,0.436147,0.184037,...,-0.009124,-0.510991,0.175919,-0.16464,-0.518945,-0.475701,0.189043,-0.166743,0.162045,0.465138,0.265746,0.318009,0.288605,-0.200721,0.63704,0.132071,-0.908206,-0.541148,-0.807207,0.319228,-0.896838,-0.064141,-0.54129,6702405,9647268,1/2/2000,2004-03-09,UNDISCOUNTED,Patent Expired Due to NonPayment of Maintenanc...,2016-04-04,utility,An electric parking brake system for a vehicle...,Electric parking brake,B1,9,4409,5937,B60T13/746,B60T,Vehicle Brake Control Systems Or Parts Thereof...


In [None]:
import numpy as np

# Column 0 is the exported row index (`_c0`); the 1024 embedding components
# occupy positions 1..1024, hence the half-open slice 1:1025.
embedding_columns = slice(1, 1025)

# Keep the embeddings both as a DataFrame and as a plain NumPy array.
embeddings_df = usptodf.iloc[:, embedding_columns]
embeddings_array = np.array(usptodf.iloc[:, embedding_columns])

embeddings_df.head()





Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,984,985,986,987,988,989,990,991,992,993,994,995,996,997,998,999,1000,1001,1002,1003,1004,1005,1006,1007,1008,1009,1010,1011,1012,1013,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,-0.069363,-0.10472,0.170673,-0.438419,-0.144168,0.681036,0.138129,0.301038,-0.292081,-0.606854,-0.288766,-1.054059,0.518129,0.510559,0.839489,0.380828,-0.825546,-0.715169,-0.330739,-0.704941,-0.001159,0.430229,1.206324,-0.212817,-0.36591,-0.122769,-0.775084,-0.870044,-0.121482,-0.446669,-0.976716,0.42765,0.283998,-0.360515,0.521507,-0.116353,-0.32761,0.534353,-0.347815,1.123636,...,-0.89619,0.101728,0.479426,0.927935,-0.358585,-0.617793,0.090672,0.585063,0.848197,0.186102,-0.517821,0.6419,-0.419397,0.469247,0.49593,-0.379151,0.097631,-0.220659,-0.611092,-0.274693,0.030647,-0.170343,-0.605823,0.76179,-0.336923,-0.204806,0.340841,-0.268847,0.034237,0.553283,0.283917,0.142959,0.333324,-0.097232,0.338291,0.317263,-0.136278,0.288835,-0.5414,-0.360672
1,-0.657632,-0.660423,-0.32936,-0.422629,-0.221532,0.424544,0.580698,-0.144818,-0.506951,-0.482279,0.073071,-0.726611,-0.017608,-0.101514,0.552301,0.858862,-1.224141,0.458215,-0.262847,-0.255457,-0.051434,0.467641,1.20367,-0.255218,0.12427,-0.396945,-0.351141,-0.833173,0.525649,-0.311577,0.090772,0.28354,0.119979,-0.668585,0.114054,-0.191174,-0.164651,0.3587,0.736736,0.608984,...,0.298056,0.048169,-0.074115,0.855455,-0.319563,-0.074175,-0.00867,0.299647,-0.262448,0.100508,-0.202112,0.57025,-0.681673,0.512238,-0.073582,-0.077811,0.505413,0.325041,-0.056031,0.186717,-0.050719,-0.524433,-0.537832,0.029255,-0.252963,-0.41654,0.355757,-0.231939,-0.634832,0.624658,-0.323671,0.895678,0.047849,-0.020366,0.068159,-0.267707,0.215926,-0.759468,-0.143991,-0.440908
2,-0.657632,-0.660423,-0.32936,-0.422629,-0.221532,0.424544,0.580698,-0.144818,-0.506951,-0.482279,0.073071,-0.726611,-0.017608,-0.101514,0.552301,0.858862,-1.224141,0.458215,-0.262847,-0.255457,-0.051434,0.467641,1.20367,-0.255218,0.12427,-0.396945,-0.351141,-0.833173,0.525649,-0.311577,0.090772,0.28354,0.119979,-0.668585,0.114054,-0.191174,-0.164651,0.3587,0.736736,0.608984,...,0.298056,0.048169,-0.074115,0.855455,-0.319563,-0.074175,-0.00867,0.299647,-0.262448,0.100508,-0.202112,0.57025,-0.681673,0.512238,-0.073582,-0.077811,0.505413,0.325041,-0.056031,0.186717,-0.050719,-0.524433,-0.537832,0.029255,-0.252963,-0.41654,0.355757,-0.231939,-0.634832,0.624658,-0.323671,0.895678,0.047849,-0.020366,0.068159,-0.267707,0.215926,-0.759468,-0.143991,-0.440908
3,-0.657632,-0.660423,-0.32936,-0.422629,-0.221532,0.424544,0.580698,-0.144818,-0.506951,-0.482279,0.073071,-0.726611,-0.017608,-0.101514,0.552301,0.858862,-1.224141,0.458215,-0.262847,-0.255457,-0.051434,0.467641,1.20367,-0.255218,0.12427,-0.396945,-0.351141,-0.833173,0.525649,-0.311577,0.090772,0.28354,0.119979,-0.668585,0.114054,-0.191174,-0.164651,0.3587,0.736736,0.608984,...,0.298056,0.048169,-0.074115,0.855455,-0.319563,-0.074175,-0.00867,0.299647,-0.262448,0.100508,-0.202112,0.57025,-0.681673,0.512238,-0.073582,-0.077811,0.505413,0.325041,-0.056031,0.186717,-0.050719,-0.524433,-0.537832,0.029255,-0.252963,-0.41654,0.355757,-0.231939,-0.634832,0.624658,-0.323671,0.895678,0.047849,-0.020366,0.068159,-0.267707,0.215926,-0.759468,-0.143991,-0.440908
4,-0.35812,-0.280004,0.694134,-0.69291,-0.084421,0.513281,0.398551,0.149493,-0.185156,-0.650438,0.601124,-0.593182,0.537901,-0.233282,0.73201,0.381555,-0.939021,0.513529,0.001086,-0.420366,-0.34152,0.496589,0.77829,-0.412515,-0.668345,-0.63579,-0.447815,-0.579105,0.259927,-0.106404,-0.337793,0.412689,0.053856,-0.828441,1.301035,-0.122319,-0.019628,0.436147,0.184037,0.534072,...,0.446321,-0.604563,0.396821,0.754534,-0.336379,-0.851278,-0.010447,0.07659,0.020918,-0.025492,-0.178903,0.396745,-0.424331,0.257488,-0.264725,-0.335262,-0.202762,-0.009124,-0.510991,0.175919,-0.16464,-0.518945,-0.475701,0.189043,-0.166743,0.162045,0.465138,0.265746,0.318009,0.288605,-0.200721,0.63704,0.132071,-0.908206,-0.541148,-0.807207,0.319228,-0.896838,-0.064141,-0.54129


# Step 2. Applying tSNE to BERT-embeddings data

1. we transform embeddings from BERT to numpy array
2. we apply tSNE with various parameters and coloring


## Parametrisation of the embedding tSNE

The default set of parameters we use for the tSNE method:

    TSNE(n_components=2, *, perplexity=30.0, early_exaggeration=12.0, learning_rate='auto', n_iter=1000, n_iter_without_progress=300, min_grad_norm=1e-07, metric='euclidean', metric_params=None, init='pca', verbose=0, random_state=None, method='barnes_hut', angle=0.5, n_jobs=None)


 Perplexity values that are too low may cause issues, as noted for perplexity 5. Source: https://distill.pub/2016/misread-tsne/

 The choice of the embedding parameters is discussed in more detail in the publication Gonzalez-Marquez, R., Schmidt, L., Schmidt, B. M., Berens, P., Kobak, D. The landscape of biomedical research, bioRxiv.


For the tSNE method we use the PCA initialization, which was found to be crucial for the later stages of interpreting and analyzing the low-dimensional data representations.

In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt


# A PCA pre-reduction (e.g. to shape (N, 50)) is meant for very high input
# dimension, M >> 100; here we try the direct t-SNE embedding instead.

# Build the t-SNE estimator with an explicit parameter set.
# Important is the PCA initialization (init='pca').
# The iteration count is left at the library default (1000): the keyword was
# renamed from `n_iter` to `max_iter` in scikit-learn 1.5, so omitting it keeps
# this cell working on both older and newer releases.
# random_state=None gives a different layout on every run; pass an integer
# (e.g. 42) when a reproducible embedding is needed.
tsne = TSNE(
    n_components=2,
    perplexity=30.0,  # adjust as needed
    early_exaggeration=12.0,
    learning_rate='auto',
    n_iter_without_progress=300,
    min_grad_norm=1e-07,
    metric='euclidean',
    metric_params=None,
    init='pca',
    verbose=0,
    random_state=None,
    method='barnes_hut',
    angle=0.5,
    n_jobs=None,
)

# t-SNE works on a plain numeric matrix, so convert the pandas frame to a NumPy
# array first (the Spark DataFrame `df` cannot be passed to scikit-learn).
embedding_matrix = pandasdf.to_numpy()
print(type(embedding_matrix))  # should be numpy array

# Project the embeddings onto two dimensions.
embedded_data = tsne.fit_transform(embedding_matrix)

# Create a DataFrame for the embedded data; its shape is (N, 2).
df_embedded = pd.DataFrame(embedded_data, columns=['Dimension 1', 'Dimension 2'])

# Optionally, you can add labels or other metadata to the df_embedded DataFrame.

# Visualize the embedded data (scatter plot)
plt.figure(figsize=(20, 25))
plt.scatter(df_embedded['Dimension 1'], df_embedded['Dimension 2'], marker='o', alpha=0.7)
plt.xlabel('Dimension 1')
plt.ylabel('Dimension 2')
plt.title('t-SNE Embedding')
plt.grid(True)
plt.show()



In [None]:
import pandas as pd

# Save the t-SNE coordinates as a TSV file, i.e. with the tab character ('\t')
# as delimiter and without the row index. The coordinates live in
# `df_embedded`, the DataFrame built from the t-SNE output.
df_embedded.to_csv('output_dim_red_file.tsv', sep='\t', index=False)




In [None]:
# Preview the first rows of the two-dimensional t-SNE coordinates.
df_embedded.head()

Unnamed: 0,Dimension 1,Dimension 2
0,-35.906311,-9.569508
1,-9.985184,72.229645
2,-11.21358,71.463913
3,-9.985184,72.229645
4,-16.754843,-4.304878


In [None]:
# Reproduce the t-SNE projection on the larger dataframe (all embeddings).

from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import pandas as pd

# A PCA pre-reduction (e.g. to shape (N, 50)) is meant for very high input
# dimension, M >> 100; here we try the direct t-SNE embedding instead.

# Alternative configuration using the cosine metric; it is only constructed
# here and is not fitted in this cell.
tsne_cosine = TSNE(random_state=1, n_iter=15000, metric="cosine")

# Configuration actually used below (perplexity can be adjusted as needed).
tsne = TSNE(n_components=2, perplexity=30, random_state=42)

# Dimensionality reduction: fit on the embeddings DataFrame and obtain one
# 2-D point per row.
dimred_data_all = tsne.fit_transform(embeddings_df)

# Wrap the projected coordinates in a DataFrame of shape (N, 2).
axis_labels = ['Dimension 1', 'Dimension 2']
df_embedded_all = pd.DataFrame(dimred_data_all, columns=axis_labels)

# Optionally, labels or other metadata can be added to df_embedded_all.

# Scatter plot of the projected data.
plt.figure(figsize=(8, 6))
plt.scatter(
    df_embedded_all['Dimension 1'],
    df_embedded_all['Dimension 2'],
    marker='o',
    alpha=0.7,
)
plt.xlabel('Dimension 1')
plt.ylabel('Dimension 2')
plt.title('t-SNE Embedding on all data')
plt.grid(True)
plt.show()


# Apply color coding for the projected dataset

Then we need to color code the low-dimensional representation of the obtained embeddings. This will allow us to explore the dimensionality reduction methods further.

We will then need to load the data with the final embedding output again.
We store the sample of the dataset in the dataframe `dfBERTembedding`:

    dfBERTembedding = pd.read_csv('finalembeddingoutput.csv', index_col=0, header=0, sep=',')


We assign the same color to CPCs that share the same first letter, e.g. B01 and B02.

#### Groups assignments for patents data

The group_id values fall into general classes identified by their first letter: A, B, C, D, E, F.
They correspond to the following categories:

A
	HUMAN NECESSITIES

B
    PERFORMING OPERATIONS; TRANSPORTING

C

    CHEMISTRY; METALLURGY

D

    TEXTILES; PAPER

E

    FIXED CONSTRUCTIONS

F

    MECHANICAL ENGINEERING; LIGHTING; HEATING; WEAPONS; BLASTING




Other categories also include GENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPMENTS.


In [None]:
# Count how many distinct CPC group_id values occur in the dataset.
group_ids = usptodf['group_id']
unique_CPCs = group_ids.nunique()

print(unique_CPCs)

# Dense ranking gives every distinct group_id its own consecutive integer
# (equal values share a rank, no gaps); store it as an integer column.
dense_rank = group_ids.rank(method='dense')
usptodf['UniqueNumber'] = dense_rank.astype(int)

usptodf.head()

321


Unnamed: 0,_c0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,...,1002,1003,1004,1005,1006,1007,1008,1009,1010,1011,1012,1013,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023,Patent Number,Application Number,Filing Date,Grant Date,Entity Status,Application Status Category,Application Status Date,type,abstract,title,kind,num_claims,time to abandonment,time to abd since filing,subgroup_id,group_id,CPC_Definition,UniqueNumber
0,0,-0.069363,-0.10472,0.170673,-0.438419,-0.144168,0.681036,0.138129,0.301038,-0.292081,-0.606854,-0.288766,-1.054059,0.518129,0.510559,0.839489,0.380828,-0.825546,-0.715169,-0.330739,-0.704941,-0.001159,0.430229,1.206324,-0.212817,-0.36591,-0.122769,-0.775084,-0.870044,-0.121482,-0.446669,-0.976716,0.42765,0.283998,-0.360515,0.521507,-0.116353,-0.32761,0.534353,-0.347815,...,-0.611092,-0.274693,0.030647,-0.170343,-0.605823,0.76179,-0.336923,-0.204806,0.340841,-0.268847,0.034237,0.553283,0.283917,0.142959,0.333324,-0.097232,0.338291,0.317263,-0.136278,0.288835,-0.5414,-0.360672,6347763,9751925,1/2/2000,2002-02-19,UNDISCOUNTED,Patent Expired Due to NonPayment of Maintenanc...,2010-03-22,utility,An active damping method and a self-contained ...,System and method for reducing dispersion of s...,B1,10,2953,3732,F42B10/661,F42B,Explosive Charges,230
1,1,-0.657632,-0.660423,-0.32936,-0.422629,-0.221532,0.424544,0.580698,-0.144818,-0.506951,-0.482279,0.073071,-0.726611,-0.017608,-0.101514,0.552301,0.858862,-1.224141,0.458215,-0.262847,-0.255457,-0.051434,0.467641,1.20367,-0.255218,0.12427,-0.396945,-0.351141,-0.833173,0.525649,-0.311577,0.090772,0.28354,0.119979,-0.668585,0.114054,-0.191174,-0.164651,0.3587,0.736736,...,-0.056031,0.186717,-0.050719,-0.524433,-0.537832,0.029255,-0.252963,-0.41654,0.355757,-0.231939,-0.634832,0.624658,-0.323671,0.895678,0.047849,-0.020366,0.068159,-0.267707,0.215926,-0.759468,-0.143991,-0.440908,6345753,9647774,1/2/2000,2002-02-12,UNDISCOUNTED,Patent Expired Due to NonPayment of Maintenanc...,2014-03-10,utility,A vertical strip storage system has a support ...,Vertical belt storage system,B1,3,4409,5181,B21C49/00,B21C,"Manufacture Of Metal Sheets, Wire, Rods, Tubes...",62
2,2,-0.657632,-0.660423,-0.32936,-0.422629,-0.221532,0.424544,0.580698,-0.144818,-0.506951,-0.482279,0.073071,-0.726611,-0.017608,-0.101514,0.552301,0.858862,-1.224141,0.458215,-0.262847,-0.255457,-0.051434,0.467641,1.20367,-0.255218,0.12427,-0.396945,-0.351141,-0.833173,0.525649,-0.311577,0.090772,0.28354,0.119979,-0.668585,0.114054,-0.191174,-0.164651,0.3587,0.736736,...,-0.056031,0.186717,-0.050719,-0.524433,-0.537832,0.029255,-0.252963,-0.41654,0.355757,-0.231939,-0.634832,0.624658,-0.323671,0.895678,0.047849,-0.020366,0.068159,-0.267707,0.215926,-0.759468,-0.143991,-0.440908,6345753,9647774,1/2/2000,2002-02-12,UNDISCOUNTED,Patent Expired Due to NonPayment of Maintenanc...,2014-03-10,utility,A vertical strip storage system has a support ...,Vertical belt storage system,B1,3,4409,5181,B65H20/34,B65H,Handling Thin Or Filamentary Material,115
3,3,-0.657632,-0.660423,-0.32936,-0.422629,-0.221532,0.424544,0.580698,-0.144818,-0.506951,-0.482279,0.073071,-0.726611,-0.017608,-0.101514,0.552301,0.858862,-1.224141,0.458215,-0.262847,-0.255457,-0.051434,0.467641,1.20367,-0.255218,0.12427,-0.396945,-0.351141,-0.833173,0.525649,-0.311577,0.090772,0.28354,0.119979,-0.668585,0.114054,-0.191174,-0.164651,0.3587,0.736736,...,-0.056031,0.186717,-0.050719,-0.524433,-0.537832,0.029255,-0.252963,-0.41654,0.355757,-0.231939,-0.634832,0.624658,-0.323671,0.895678,0.047849,-0.020366,0.068159,-0.267707,0.215926,-0.759468,-0.143991,-0.440908,6345753,9647774,1/2/2000,2002-02-12,UNDISCOUNTED,Patent Expired Due to NonPayment of Maintenanc...,2014-03-10,utility,A vertical strip storage system has a support ...,Vertical belt storage system,B1,3,4409,5181,B65G21/14,B65G,Transport Or Storage Devices,114
4,4,-0.35812,-0.280004,0.694134,-0.69291,-0.084421,0.513281,0.398551,0.149493,-0.185156,-0.650438,0.601124,-0.593182,0.537901,-0.233282,0.73201,0.381555,-0.939021,0.513529,0.001086,-0.420366,-0.34152,0.496589,0.77829,-0.412515,-0.668345,-0.63579,-0.447815,-0.579105,0.259927,-0.106404,-0.337793,0.412689,0.053856,-0.828441,1.301035,-0.122319,-0.019628,0.436147,0.184037,...,-0.510991,0.175919,-0.16464,-0.518945,-0.475701,0.189043,-0.166743,0.162045,0.465138,0.265746,0.318009,0.288605,-0.200721,0.63704,0.132071,-0.908206,-0.541148,-0.807207,0.319228,-0.896838,-0.064141,-0.54129,6702405,9647268,1/2/2000,2004-03-09,UNDISCOUNTED,Patent Expired Due to NonPayment of Maintenanc...,2016-04-04,utility,An electric parking brake system for a vehicle...,Electric parking brake,B1,9,4409,5937,B60T13/746,B60T,Vehicle Brake Control Systems Or Parts Thereof...,103


In [None]:
# Split the patents by the first letter of group_id (the CPC section).


def _rows_in_section(section_letter):
    """Return the rows of usptodf whose group_id starts with section_letter."""
    in_section = usptodf['group_id'].str.startswith(section_letter)
    return usptodf[in_section]


rows_groupA = _rows_in_section('A')
rows_groupB = _rows_in_section('B')
rows_groupC = _rows_in_section('C')
rows_groupD = _rows_in_section('D')
rows_groupE = _rows_in_section('E')
rows_groupF = _rows_in_section('F')

# Report the (rows, columns) shape of the first three selections.
for section_rows in (rows_groupA, rows_groupB, rows_groupC):
    print(np.shape(section_rows))


(424, 1043)
(456, 1043)
(380, 1043)


In [None]:

# Numeric color code for each CPC section letter (first character of
# group_id); rows matching none of these letters keep the default code 0.
section_color_codes = {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7}

# Create the "colors" column and fill it section by section.
usptodf['colors'] = 0  # Initialize the color column to 0
for section_letter, color_code in section_color_codes.items():
    usptodf.loc[usptodf['group_id'].str.startswith(section_letter), 'colors'] = color_code

# Illustrative final view:
#
#   group_id  colors
#   F42B           6
#   B21C           2
#   B65H           2
#   B60T           2

# Keep the preview as the last expression so the notebook displays it.
usptodf.head(10)




'\n\n#final view:\n\ngroup  new_column\n0   B01           1\n1   B02           1\n2   C02           2\n3  D004           3\n4   B03           1\n5   E01           4\n'

In [None]:
# Label each patent with its CPC section letter (first character of group_id)
# in the 'category' column. Sections A-F get their letter; every other row
# keeps the placeholder value 0.
usptodf['category'] = 0
# Switch the column to object dtype before writing strings into it: storing a
# string in an int64 column relies on silent upcasting, which pandas >= 2.1
# deprecates (FutureWarning). The stored values are unchanged by this cast.
usptodf['category'] = usptodf['category'].astype(object)

for section_letter in 'ABCDEF':
    in_section = usptodf['group_id'].str.startswith(section_letter)
    usptodf.loc[in_section, 'category'] = section_letter

usptodf.head()

Unnamed: 0,_c0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,...,1004,1005,1006,1007,1008,1009,1010,1011,1012,1013,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023,Patent Number,Application Number,Filing Date,Grant Date,Entity Status,Application Status Category,Application Status Date,type,abstract,title,kind,num_claims,time to abandonment,time to abd since filing,subgroup_id,group_id,CPC_Definition,UniqueNumber,colors,category
0,0,-0.069363,-0.10472,0.170673,-0.438419,-0.144168,0.681036,0.138129,0.301038,-0.292081,-0.606854,-0.288766,-1.054059,0.518129,0.510559,0.839489,0.380828,-0.825546,-0.715169,-0.330739,-0.704941,-0.001159,0.430229,1.206324,-0.212817,-0.36591,-0.122769,-0.775084,-0.870044,-0.121482,-0.446669,-0.976716,0.42765,0.283998,-0.360515,0.521507,-0.116353,-0.32761,0.534353,-0.347815,...,0.030647,-0.170343,-0.605823,0.76179,-0.336923,-0.204806,0.340841,-0.268847,0.034237,0.553283,0.283917,0.142959,0.333324,-0.097232,0.338291,0.317263,-0.136278,0.288835,-0.5414,-0.360672,6347763,9751925,1/2/2000,2002-02-19,UNDISCOUNTED,Patent Expired Due to NonPayment of Maintenanc...,2010-03-22,utility,An active damping method and a self-contained ...,System and method for reducing dispersion of s...,B1,10,2953,3732,F42B10/661,F42B,Explosive Charges,230,6,F
1,1,-0.657632,-0.660423,-0.32936,-0.422629,-0.221532,0.424544,0.580698,-0.144818,-0.506951,-0.482279,0.073071,-0.726611,-0.017608,-0.101514,0.552301,0.858862,-1.224141,0.458215,-0.262847,-0.255457,-0.051434,0.467641,1.20367,-0.255218,0.12427,-0.396945,-0.351141,-0.833173,0.525649,-0.311577,0.090772,0.28354,0.119979,-0.668585,0.114054,-0.191174,-0.164651,0.3587,0.736736,...,-0.050719,-0.524433,-0.537832,0.029255,-0.252963,-0.41654,0.355757,-0.231939,-0.634832,0.624658,-0.323671,0.895678,0.047849,-0.020366,0.068159,-0.267707,0.215926,-0.759468,-0.143991,-0.440908,6345753,9647774,1/2/2000,2002-02-12,UNDISCOUNTED,Patent Expired Due to NonPayment of Maintenanc...,2014-03-10,utility,A vertical strip storage system has a support ...,Vertical belt storage system,B1,3,4409,5181,B21C49/00,B21C,"Manufacture Of Metal Sheets, Wire, Rods, Tubes...",62,2,B
2,2,-0.657632,-0.660423,-0.32936,-0.422629,-0.221532,0.424544,0.580698,-0.144818,-0.506951,-0.482279,0.073071,-0.726611,-0.017608,-0.101514,0.552301,0.858862,-1.224141,0.458215,-0.262847,-0.255457,-0.051434,0.467641,1.20367,-0.255218,0.12427,-0.396945,-0.351141,-0.833173,0.525649,-0.311577,0.090772,0.28354,0.119979,-0.668585,0.114054,-0.191174,-0.164651,0.3587,0.736736,...,-0.050719,-0.524433,-0.537832,0.029255,-0.252963,-0.41654,0.355757,-0.231939,-0.634832,0.624658,-0.323671,0.895678,0.047849,-0.020366,0.068159,-0.267707,0.215926,-0.759468,-0.143991,-0.440908,6345753,9647774,1/2/2000,2002-02-12,UNDISCOUNTED,Patent Expired Due to NonPayment of Maintenanc...,2014-03-10,utility,A vertical strip storage system has a support ...,Vertical belt storage system,B1,3,4409,5181,B65H20/34,B65H,Handling Thin Or Filamentary Material,115,2,B
3,3,-0.657632,-0.660423,-0.32936,-0.422629,-0.221532,0.424544,0.580698,-0.144818,-0.506951,-0.482279,0.073071,-0.726611,-0.017608,-0.101514,0.552301,0.858862,-1.224141,0.458215,-0.262847,-0.255457,-0.051434,0.467641,1.20367,-0.255218,0.12427,-0.396945,-0.351141,-0.833173,0.525649,-0.311577,0.090772,0.28354,0.119979,-0.668585,0.114054,-0.191174,-0.164651,0.3587,0.736736,...,-0.050719,-0.524433,-0.537832,0.029255,-0.252963,-0.41654,0.355757,-0.231939,-0.634832,0.624658,-0.323671,0.895678,0.047849,-0.020366,0.068159,-0.267707,0.215926,-0.759468,-0.143991,-0.440908,6345753,9647774,1/2/2000,2002-02-12,UNDISCOUNTED,Patent Expired Due to NonPayment of Maintenanc...,2014-03-10,utility,A vertical strip storage system has a support ...,Vertical belt storage system,B1,3,4409,5181,B65G21/14,B65G,Transport Or Storage Devices,114,2,B
4,4,-0.35812,-0.280004,0.694134,-0.69291,-0.084421,0.513281,0.398551,0.149493,-0.185156,-0.650438,0.601124,-0.593182,0.537901,-0.233282,0.73201,0.381555,-0.939021,0.513529,0.001086,-0.420366,-0.34152,0.496589,0.77829,-0.412515,-0.668345,-0.63579,-0.447815,-0.579105,0.259927,-0.106404,-0.337793,0.412689,0.053856,-0.828441,1.301035,-0.122319,-0.019628,0.436147,0.184037,...,-0.16464,-0.518945,-0.475701,0.189043,-0.166743,0.162045,0.465138,0.265746,0.318009,0.288605,-0.200721,0.63704,0.132071,-0.908206,-0.541148,-0.807207,0.319228,-0.896838,-0.064141,-0.54129,6702405,9647268,1/2/2000,2004-03-09,UNDISCOUNTED,Patent Expired Due to NonPayment of Maintenanc...,2016-04-04,utility,An electric parking brake system for a vehicle...,Electric parking brake,B1,9,4409,5937,B60T13/746,B60T,Vehicle Brake Control Systems Or Parts Thereof...,103,2,B


In [None]:
########### merge together: attach the numeric colour codes from usptodf to df_embedded ###############

# Assigning a Series as a column matches rows by index label, so this assumes
# df_embedded and usptodf share the same index -- TODO confirm.
df_embedded['color'] = usptodf['colors'] #'colors'
# Preview the embedding table with the new 'color' column.
df_embedded.head()


Unnamed: 0,Dimension 1,Dimension 2,color
0,6.52843,28.125072,6
1,50.745388,-9.62024,2
2,50.742943,-9.595822,2
3,50.745575,-9.620245,2
4,17.616222,4.323287,2


In [None]:


# Translate the numeric CPC codes in df_embedded['color'] into matplotlib colour names.
#
# Section / colour scheme:
#   A  HUMAN NECESSITIES                                              red
#   B  PERFORMING OPERATIONS; TRANSPORTING                            blue
#   C  CHEMISTRY; METALLURGY                                          green
#   D  TEXTILES; PAPER                                                yellow
#   E  FIXED CONSTRUCTIONS                                            orange
#   F  MECHANICAL ENGINEERING; LIGHTING; HEATING; WEAPONS; BLASTING   purple
#   G  PHYSICS                                                        black
#   H  ELECTRICITY                                                    black
#   Y  GENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPMENTS              black (code 0)
color_mapping = {
    1: 'red',     # A
    2: 'blue',    # B
    3: 'green',   # C
    4: 'yellow',  # D
    5: 'orange',  # E
    6: 'purple',  # F
    # G is stored as code 7 in usptodf['colors']. Without this entry every
    # G patent falls through to 'unknown', although the scheme above makes it black.
    7: 'black',   # G
    0: 'black',   # every section that has no code of its own
}

# Codes missing from the mapping (including NaN) are tagged 'unknown'.
# Re-running the cell simply overwrites the 'colors' column.
df_embedded['colors'] = df_embedded['color'].apply(
    lambda code: color_mapping.get(code, 'unknown')
)

df_embedded.head()


Unnamed: 0,Dimension 1,Dimension 2,color,colors
0,6.52843,28.125072,6,purple
1,50.745388,-9.62024,2,blue
2,50.742943,-9.595822,2,blue
3,50.745575,-9.620245,2,blue
4,17.616222,4.323287,2,blue


In [None]:
# Preview the first rows of the patent table.
usptodf.head()

Unnamed: 0,_c0,0,1,2,3,4,5,6,7,8,...,type,abstract,title,kind,num_claims,time to abandonment,time to abd since filing,subgroup_id,group_id,CPC_Definition
0,0,-0.069363,-0.10472,0.170673,-0.438419,-0.144168,0.681036,0.138129,0.301038,-0.292081,...,utility,An active damping method and a self-contained ...,System and method for reducing dispersion of s...,B1,10,2953,3732,F42B10/661,F42B,Explosive Charges
1,1,-0.657632,-0.660423,-0.32936,-0.422629,-0.221532,0.424544,0.580698,-0.144818,-0.506951,...,utility,A vertical strip storage system has a support ...,Vertical belt storage system,B1,3,4409,5181,B21C49/00,B21C,"Manufacture Of Metal Sheets, Wire, Rods, Tubes..."
2,2,-0.657632,-0.660423,-0.32936,-0.422629,-0.221532,0.424544,0.580698,-0.144818,-0.506951,...,utility,A vertical strip storage system has a support ...,Vertical belt storage system,B1,3,4409,5181,B65H20/34,B65H,Handling Thin Or Filamentary Material
3,3,-0.657632,-0.660423,-0.32936,-0.422629,-0.221532,0.424544,0.580698,-0.144818,-0.506951,...,utility,A vertical strip storage system has a support ...,Vertical belt storage system,B1,3,4409,5181,B65G21/14,B65G,Transport Or Storage Devices
4,4,-0.35812,-0.280004,0.694134,-0.69291,-0.084421,0.513281,0.398551,0.149493,-0.185156,...,utility,An electric parking brake system for a vehicle...,Electric parking brake,B1,9,4409,5937,B60T13/746,B60T,Vehicle Brake Control Systems Or Parts Thereof...


In [None]:
# Copy the CPC section label onto the embedding table. Rows are matched by
# index label, so both frames are assumed to share an index -- TODO confirm.
df_embedded['category'] = usptodf['category']

# Filtering dataframe before coloring it

We need to clean the dataset before color-coding it by removing the unknown colors, etc.

In [None]:
# List the colour names currently present in the embedding table.
unique_colors = df_embedded['colors'].unique()
print(unique_colors)

# Rows whose colour code could not be resolved carry the name 'unknown'.
mask = df_embedded['colors'] != 'unknown'

# Keep only the resolved rows. Take an explicit copy: filtered_df gets new
# columns later on (e.g. the cluster labels), and adding a column to a
# boolean-mask slice of df_embedded triggers pandas' SettingWithCopyWarning.
filtered_df = df_embedded[mask].copy()

# Sanity check: 'unknown' must no longer be listed.
unique_colors = filtered_df['colors'].unique()
print(unique_colors)


['purple' 'blue' 'red' 'green' 'black' 'unknown' 'orange' 'yellow']
['purple' 'blue' 'red' 'green' 'black' 'orange' 'yellow']


In [None]:

# Scatter plot of the 2-D embedding, one marker per row of filtered_df,
# coloured by the colour name stored in its 'colors' column.
# NOTE: plt.figure returns a Figure (not an Axes); the name `ax` is kept unchanged.
ax = plt.figure(figsize=(8, 6))
plt.scatter(
    x=filtered_df['Dimension 1'],
    y=filtered_df['Dimension 2'],
    c=filtered_df['colors'],
    marker='o',
    alpha=0.7,
)

plt.title('t-SNE on the sample of the data')
plt.xlabel('Dimension 1')
plt.ylabel('Dimension 2')
plt.grid(False)  # hide grid lines
plt.show()


# Add the legend for categories of patents

CPC
COOPERATIVE PATENT CLASSIFICATION

A
HUMAN NECESSITIES

B
PERFORMING OPERATIONS; TRANSPORTING

C
CHEMISTRY; METALLURGY

D
TEXTILES; PAPER

E
FIXED CONSTRUCTIONS

F
MECHANICAL ENGINEERING; LIGHTING; HEATING; WEAPONS; BLASTING

G
PHYSICS

H
ELECTRICITY


TODO: to put legend for colors for the final pipeline


In [None]:
# Legend text for each colour name used in the scatter plot.
cpcclass = dict(
    red='A   HUMAN NECESSITIES',
    blue='B   PERFORMING OPERATIONS; TRANSPORTING',
    green='C    CHEMISTRY; METALLURGY',
    yellow='D    TEXTILES; PAPER',
    orange='E   FIXED CONSTRUCTIONS',
    purple='F    MECHANICAL ENGINEERING; LIGHTING; HEATING; WEAPONS; ',
    black='G    PHYSICS',
)
# Numeric section code -> colour name: codes 1-6 in order, then 0 as the catch-all.
color_mapping = {
    **dict(enumerate(('red', 'blue', 'green', 'yellow', 'orange', 'purple'), start=1)),
    0: 'black',
}

# Same scatter plot as before, on a larger canvas and with more transparent markers.
plt.figure(figsize=(10, 8))
plt.scatter(
    x=filtered_df['Dimension 1'],
    y=filtered_df['Dimension 2'],
    c=filtered_df['colors'],
    marker='o',
    alpha=0.3,
)

# One legend entry per colour actually present in the filtered data:
# a white line carrying a filled round marker in that colour.
unique_colors = filtered_df['colors'].unique()
legend_handles = []
for color_name in unique_colors:
    legend_handles.append(
        plt.Line2D(
            [0],
            [0],
            marker='o',
            color='w',
            label=cpcclass[color_name],
            markerfacecolor=color_name,
        )
    )
plt.legend(handles=legend_handles, title='CPC CLASS', loc='best')

plt.title('t-SNE Embedding on the sample of the data')
plt.xlabel('Dimension 1')
plt.ylabel('Dimension 2')
plt.grid(False)
plt.show()



# Step 3. Exploration of the similarity and clustering structures

As has been noted in various domains where low-dimensional projections are used, naive applications of dimensionality reduction methods often lead to misinterpretations or shortcomings, e.g. the global structure or the clustering of the data is not represented accurately. For more details, see the recorded talk on contrastive learning: https://www.youtube.com/watch?v=A2HmdO8cApw

Based on closeness in the embedding space, we can construct a clustering method. However, the clustering and localisation methods need to be validated, which can be done using several approaches.

One of them was presented in: Dmitry Kobak and Philipp Berens. The art of using t-SNE for single-cell transcriptomics. Nature Communications 10(1):5416, 2019. ISSN 2041-1723. doi: 10.1038/s41467-019-13056-x.
URL https://www.nature.com/articles/s41467-019-13056-x

## Clustering validation methods
One possible metric is KNN: the fraction of k-nearest neighbours in the original high-dimensional data that are preserved as k-nearest neighbours in the embedding. One can use k = 10 or k = 20 and compute the average across all n points (depending on the size of the dataset). KNN quantifies the preservation of local structure.

In [None]:


import pandas as pd
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Cluster the 2-D embedding coordinates of filtered_df with K-Means and colour
# the scatter plot by cluster label.

#filtered_df = df_embedded

# Number of clusters to fit.
n_clusters = 80 # should be more than 10 categories but less than 100 subcategories
X = filtered_df[['Dimension 1', 'Dimension 2']].values

# Fixed random_state so repeated runs produce the same clustering.
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
kmeans.fit(X)

# Cluster index of every point.
cluster_labels = kmeans.predict(X)

# assign() returns a new frame, so the labels are never written into a slice of
# another DataFrame (which would raise SettingWithCopyWarning).
filtered_df = filtered_df.assign(cluster=cluster_labels)

# Plot the clustered points, one colour per cluster.
plt.scatter(filtered_df['Dimension 1'], filtered_df['Dimension 2'], c=filtered_df['cluster'], cmap='rainbow')
plt.title('K-Means Clustering')
plt.xlabel('Dim 1')
plt.ylabel('Dim 2')
plt.show()

# Print the frame that carries the cluster labels. `df` is a different object
# (the Spark DataFrame loaded at the top of the notebook), so printing it
# would not show the labels.
print(filtered_df)


In [None]:
# elbow plot for identifying the optimal number of clusters


#n_clusters

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

# Elbow analysis: fit K-Means for a range of K on the 2-D embedding and plot
# the inertia (within-cluster sum of squares) against K.

# Features used for clustering.
X = filtered_df.loc[:, ['Dimension 1', 'Dimension 2']]

# Candidate numbers of clusters; adjust the range as needed.
k_values = range(1, 11)

# One inertia value per candidate K, in the same order as k_values.
inertia = []
for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42).fit(X)
    inertia.append(kmeans.inertia_)

# Draw the elbow curve.
plt.figure(figsize=(8, 6))
plt.plot(k_values, inertia, color='b', linestyle='-', marker='o')
plt.xlabel('Number of Clusters (K)')
plt.ylabel('Inertia')
plt.title('Elbow Plot for K-Means Clustering')
plt.grid(True)
plt.show()


In [None]:
import pandas as pd
from sklearn.metrics import pairwise_distances

# applying to the dataframe with columns ['abstract', 'x', 'y']


def find_similar_points(tsne_embedding_dataframe, radius):
    """Collect, for every point, the abstracts of the other points within ``radius``.

    Parameters
    ----------
    tsne_embedding_dataframe : pandas.DataFrame
        Must provide the 2-D coordinates in columns 'x' and 'y' and the text in
        column 'abstract'.
    radius : float
        Distance threshold. A neighbour qualifies when its distance (the
        ``pairwise_distances`` default metric, Euclidean) is strictly smaller
        than this value.

    Returns
    -------
    list of list
        One inner list per row, in row order. Its first element is the row's
        own abstract, followed by the abstracts of all *other* rows closer than
        ``radius``. The row itself is not repeated among its neighbours. Other
        rows with identical coordinates are still reported.
    """
    # pairwise_distances rejects empty input; an empty frame has no neighbours.
    if len(tsne_embedding_dataframe) == 0:
        return []

    # Pull out the two pieces we need once, instead of materialising a full row
    # or sub-frame of every column for each point.
    coordinates = tsne_embedding_dataframe[['x', 'y']].to_numpy()
    abstracts = tsne_embedding_dataframe['abstract'].tolist()

    # Dense n x n boolean matrix: entry (i, j) is True when j lies within radius of i.
    within_radius = pairwise_distances(coordinates) < radius

    similar_abstracts = []
    for i, neighbour_mask in enumerate(within_radius):
        # Skip position i: the self-distance is 0 and would duplicate the query abstract.
        neighbours = [abstracts[j] for j in neighbour_mask.nonzero()[0] if j != i]
        similar_abstracts.append([abstracts[i]] + neighbours)

    return similar_abstracts

# Neighbourhood radius in embedding units. It should be chosen from the
# distribution of distances between data points (e.g. relative to the average
# distance); the value below is a placeholder until that is determined.
radius = 0.2

# find_similar_points needs the columns 'x', 'y' and 'abstract'. usptodf holds
# the abstracts but no 2-D coordinates, so assemble the input from the embedding
# coordinates in filtered_df plus the matching abstracts.
# Abstracts are matched by index label, which assumes filtered_df and usptodf
# share an index -- TODO confirm.
tsne_points = (
    filtered_df[['Dimension 1', 'Dimension 2']]
    .rename(columns={'Dimension 1': 'x', 'Dimension 2': 'y'})
    .assign(abstract=usptodf['abstract'])
)

# Find similar points.
# NOTE: this builds a dense n x n distance matrix, so memory grows quadratically
# with the number of rows.
similar_abstracts = find_similar_points(tsne_points, radius)


