In [2]:
import pandas as pd

# Read the abstracts from the input CSV file.
data = pd.read_csv('sampledata/data_input.csv')
# Show 5 randomly sampled rows of the data.
data.sample(5)

Unnamed: 0,id,abstract
2215,2215,Let M be an almost complex manifold equipped w...
10791,10791,<PLOT > is a collection of routines to draw su...
22072,22072,The article analyzes a proposed network topolo...
62703,62706,The ICML 2013 Workshop on Challenges in Repres...
35145,35145,The Internet has revolutionized the computer a...


In [3]:
# Drop the id column. Reassigning (instead of inplace=True) together with
# errors='ignore' lets this cell be re-run without raising KeyError once the
# column is already gone.
data = data.drop(columns='id', errors='ignore')

In [4]:
# Save the data to a CSV file, leaving out the DataFrame index.
data.to_csv('sampledata/data.csv', index = False)

### N-Gram Similarity

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import euclidean_distances

# Create the vectorizer with its default settings; it is fitted further below.
vectorizer = CountVectorizer()

In [6]:
# One pass over the abstract column: coerce each entry to str, then lower-case it.
# NOTE(review): str() turns a missing value (NaN) into the text 'nan' - confirm
# that this is acceptable for the input file.
texts = [str(abstract).lower() for abstract in data['abstract'].tolist()]

In [7]:
# Print how many abstracts were loaded.
print(len(texts))

86209


In [15]:
# Keep only the first 10 abstracts as a small working subset.
stexts = texts[:10]

In [16]:
# Fit the vectorizer on the subset and build its feature matrix (one row per text).
features = vectorizer.fit_transform(stexts)

In [32]:
# Print the type of features (a SciPy sparse matrix, per the recorded output).
print(type(features))

<class 'scipy.sparse.csr.csr_matrix'>


In [None]:
# Print the vocabulary learned during fitting. The trailing underscore is
# scikit-learn's naming convention for attributes set by fit(), not a placeholder.
print(vectorizer.vocabulary_)

In [18]:
# Print the vocabulary size, i.e. the number of distinct words. It generally
# grows with the corpus and determines the number of feature columns (and so
# affects performance).
print(len(vectorizer.vocabulary_))

692


In [34]:
# Calculate Euclidean distances between all pairs of feature rows.
distances = euclidean_distances(features)

In [40]:
import numpy as np

# Round-trip the distance matrix through a CSV file on disk.
distances_path = 'sampledata/distances.csv'
np.savetxt(distances_path, distances, delimiter=',')

# Read the matrix back (replacing the in-memory array) and show its shape.
distances = np.loadtxt(distances_path, delimiter=',')
print(distances.shape)

(10, 10)


In [44]:
# Display the sparse feature matrix summary (shape, dtype, stored elements).
features

<10x692 sparse matrix of type '<class 'numpy.int64'>'
	with 944 stored elements in Compressed Sparse Row format>

In [28]:
# Wrap the query in a one-element list, coerced to str and lower-cased,
# which is the shape the vectorizer is given below.
new_query = [str('The purpose of this study is to investigate the effect of the COVID-19 pandemic on the mental health of the population in the United States.').lower()]

# Vectorize the query with the already-fitted vectorizer.
new_query_features = vectorizer.transform(new_query)

# Euclidean distances between the query and every abstract in the subset.
new_query_distances = euclidean_distances(new_query_features, features)

# Blank line first, then the distances.
print('', new_query_distances, sep='\n')

[[17.60681686 23.4520788  17.2626765  15.5241747  11.35781669 24.71841419
  14.69693846 10.04987562 18.84144368 19.6977156 ]]


In [29]:
def get_feature(sample_text):
    """Vectorize one query (or several) with the notebook-level ``vectorizer``.

    Parameters
    ----------
    sample_text : str or list/tuple of str
        A single document, or a collection of documents. Values that are
        not strings are converted with ``str()``.

    Returns
    -------
    Whatever ``vectorizer.transform`` returns for the lower-cased documents.
    """
    # A bare value is one document. A list/tuple is already a collection of
    # documents; wrapping it again would make str() stringify the whole list
    # (brackets, quotes, escape sequences) as if it were a single document.
    if isinstance(sample_text, (list, tuple)):
        documents = list(sample_text)
    else:
        documents = [sample_text]
    documents = [str(document).lower() for document in documents]
    return vectorizer.transform(documents)

In [31]:
def get_distances(sample_text_features, features, texts=None):
    """Print every text followed by its Euclidean distance to the query.

    Parameters
    ----------
    sample_text_features :
        Feature row(s) of the query, e.g. the output of ``get_feature``.
    features :
        Iterable of feature rows, one per text.
    texts : sequence of str, optional
        Texts aligned with ``features``. Defaults to the notebook-level
        ``stexts``, so existing two-argument calls behave as before.

    Returns
    -------
    None. The results are written to stdout.
    """
    if texts is None:
        texts = stexts
    for position, row in enumerate(features):
        print(texts[position])
        print(euclidean_distances(sample_text_features, row))

In [33]:
# new_query holds a one-element list at this point; hand get_feature the
# query string itself.
get_distances(get_feature(new_query[0]), features)

turing machines and g\"odel numbers are important pillars of the theory of computation. thus, any computational architecture needs to show how it could relate to turing machines and how stable implementations of turing computation are possible. in this chapter, we implement universal turing computation in a neural field environment. to this end, we employ the canonical symbologram representation of a turing machine obtained from a g\"odel encoding of its symbolic repertoire and generalized shifts. the resulting nonlinear dynamical automaton (nda) is a piecewise affine-linear map acting on the unit square that is partitioned into rectangular domains. instead of looking at point dynamics in phase space, we then consider functional dynamics of probability distributions functions (p.d.f.s) over phase space. this is generally described by a frobenius-perron integral transformation that can be regarded as a neural field equation over the unit square as feature space of a dynamic field theory

In [None]:
# For every text in the subset, print the text and then its Euclidean
# distance to the first text's feature row.
for row_index, row in enumerate(features):
    print(stexts[row_index])
    print(euclidean_distances(features[0], row)[0][0])