### functions: <br>
cosine_similarity - cosine similarity of two vectors <br>
great_circle_distance - great-circle distance of two coordinates <br>
most_similar - similarities over matrices <br>
place_composition - adds vectors corresponding with list of place names <br>
place_join - find all places corresponding with name in a gazetteer <br>
plot_timeseries - plots timeseries over place-year matrix <br>
ppmi - calculates ppmi for matrix (positive pointwise mutual information) <br>
semantic_footprint - semantic footprint of a term <br>
similarity_map - creates semantic map of a keyword <br>
<br>
### data:<br>
doc_metadata - EEBO doc metadata<br>
eebo_network - EEBO booktrade network<br>
geo - geographical metadata<br>
kwic - EEBO keyword-in-context matrix<br>
litmath<br>
place_doc - places<br>
term_doc - EEBO term-document matrix<br>
<br>
### other functions:<br>
dot_product - dot product of two vectors<br>
correlation - correlation between two vectors<br>
variance<br>
covariance<br>
tf-idf - normalize a matrix usuing tf-idf<br>
standard scoring<br>
entropy<br>
<br>
k-means clustering<br>
degree<br>
betweenness<br>
eigenvector centrality<br>
transitivity/clustering coefficiant<br>


In [1]:
import numpy as np
import math

matrix = [[1,2,3],[4,5,6],[7,8,9]]

#normalize matrix using positive pointwise mutual information
#input matrix m, output normalized matrix
def ppmi(m):
    total = np.sum(m)
    pcol = (np.sum(m, axis = 0)) / total
    xlength = len(pcol)
    pcol = np.reshape(pcol,(1,len(pcol)))
    prow = (np.sum(m, axis = 1)) / total
    ylength = len(prow)
    prow = np.reshape(prow,(len(prow),1))
    pm = prow@pcol
    m = m / total
    m = m / pm
    m = np.log(m)
    for x in range(0,ylength):
        for y in range(0,xlength):
            if m[x][y] < 0:
                m[x][y] = 0
    return m

ppmi(matrix)

    
    
#normalize matrix using tf-idf
#input matrix m, out
#def tfidf(m):

array([[0.00000000e+00, 2.22044605e-16, 2.23143551e-01],
       [2.22044605e-16, 0.00000000e+00, 0.00000000e+00],
       [8.96121587e-02, 2.22044605e-16, 0.00000000e+00]])

In [7]:
import numpy as np
import math

x = [1,2,3]
y = [4,5,6]

def cosine_similarity(x,y) :
    return (np.dot(x,y)) / (math.sqrt(np.dot(x,x)) * math.sqrt(np.dot(y,y)))

cosine_similarity(x,y)

0.9746318461970762

In [35]:
import numpy as np
import math

x = [1,2,3]
y = [4,5,6]

def dotProduct(x: list, y: list) -> float:
    # Should we just do `return np.dot(x,y)` instead?
    if not len(x) == len(y):
        raise Exception("Vectors are not the same length")
    x_arr = np.array(x)
    y_arr = np.array(y)
    return sum(x_arr * y_arr)
    
def variance(x) :
    mean = sum(x)/len(x)
    res = (x-mean)
    return(var == (np.dot(res,res)) / len(x))
    
def covariance(x,y) :
    meanx = sum(x)/len(x)
    meany = sum(y)/len(y)
    resx = (x-mean)
    resy = (y-mean)
    return(covar == (np.dot(resx,resy))/len(x))

def correlation(x,y) :
    return(covariance(x,y)/np.sqrt(variance(x)*variance(y)))
    
def standardScoring(x) :
    mean = sum(x)/len(x)
    sd = math.sqrt(variance(x))
    return((x-mean)/sd)
    
def entropy(x: list) -> float:
    x_arr = np.array(x)
    px = x_arr/sum(x)
    px = px[px != 0] # The values of px that are 0 will not contribute to the sum
    logpx = np.log(px)
    return(-dotProduct(px,logpx))

def euc_distance(x,y) :
    return math.sqrt(np.sum(np.subtract(x,y))**2)



0.023718761330184128

In [102]:
import pandas as pd

# See https://en.wikipedia.org/wiki/Tf-idf#Definition for details on calculation
def tfidf(term: str, document: str, corpus: pd.DataFrame) -> float:
    # Find term-frequency of term
    tf = corpus.loc[term, document]/corpus.values.sum()
    # Find inverse document frequency
    N = len(corpus.columns)
    term_vec = corpus.loc[term]
    docs_with_term = len(term_vec[term_vec != 0]) # Number of documents containing term
    idf = np.log(N/docs_with_term)
    return tf * idf

# Method that returns a dataframe containg the tfidf for each term
def tfidf_matrix(corpus: pd.DataFrame) -> pd.DataFrame:
    # Create term-frequency matrix
    tf = corpus/corpus.values.sum()
    # Create inverse document frequency vector
    N = len(corpus.columns)
    doc_frequencies = np.array(corpus.apply(lambda row: len(row[row != 0]), axis=1))
    idf = np.log(N/doc_frequencies)
    return tf.multiply(idf, axis='rows')


In [105]:
# Testing tfidf
corpus = pd.DataFrame(np.array([[5,9,0],[4,8,1],[0,0,9]]),
                      columns=['Ham','MND','1H4'],
                      index=['love','hate','king'])
print(corpus)
print(tfidf_matrix(corpus))



      Ham  MND  1H4
love    5    9    0
hate    4    8    1
king    0    0    9
           Ham       MND       1H4
love  0.056315  0.101366  0.000000
hate  0.000000  0.000000  0.000000
king  0.000000  0.000000  0.274653


In [49]:
arr = np.array([1, 2, 3, 4, 5])
 
applyall = np.vectorize(lambda i: i + 2)
res = applyall(arr)
print(res)

[3 4 5 6 7]


In [12]:
import numpy as np
import pandas as pd
import math

""" 'most_similar' parameters:
m - mat
    word matrix as a pandas dataframe
v - vec
    vector to evaluate
method - str
    similarity to be performed, by default 'cosine' ex: 
margin - int
    0 to calculate over rows, 1 over columns
fullResults - boolean
    by default false
"""

def cosine_similarity(x,y) :
    return (np.dot(x,y)) / (math.sqrt(np.dot(x,x)) * math.sqrt(np.dot(y,y)))

def euc_distance(x,y) :
    return math.sqrt(np.sum(np.subtract(x,y))**2)


def most_similar(mat, vec, method = "cosine", margin = 0, fullResults = False):
    cos_sim = cosine_similarity(x,y)
    euc_dist = euc_distance(x,y)
    
    if(class(mat) == "docMatrix"):
        mat = np.matmul(mat,mat)
        
    if(np.isin(len(vec),mat.shape) == False):
        keyword = vec
        if(margin == 0):
            if(np.isin(keyword,list(mat.rows)) == False):
                raise Exception("Your keyword doesn't match any of your matrix's row names.")
            vec = mat.loc[keyword]
        if(margin == 1):
            if(np.isin(keyword,list(mat.columns)) == False):
                raise Exception("Your keyword doesn't match any of your matrix's column names.")
            vec = mat[keyword]
        
        if(np.isin(method, list("cosine","euclidean", "covariance", "pearson") == False)):
            raise Exception("Method must be specified as 'cosine', 'euclidean', 'covariance', or 'pearson'.")
        if(method == "cosine"):
            #results = apply(mat, margin, cos_sim, vec)
            results = mat.apply(cos_sim, axis=margin)
        if(method == "euclidean"):
            #results = apply(mat, margin, euc_dist, vec)
            results = mat.apply(euc_dist, axis=margin)
        if(method == "covariance"):
            #results = apply(mat, margin, np.cov, vec)
            
        if(method == "pearson"):
            results = apply(mat, margin, correlation, vec)
            
        if(fullResults == False):
            if(len(vec) == 1):
                results = results[-np.where(results.columns == vec)]
            if(method == "euclidean"):
                results = results.sort()[0:11]
            else:
                results = results.sort(reverse=True)[0:11]
        return results
        

SyntaxError: invalid syntax (3828950259.py, line 29)

In [None]:
import numpy as np
import math

def degrees_to_radians(x) :
    return(x*math.pi/180)
    
def great_circle_distance(lon1, lat1, lon2, lat2, radius = 3437) :
    lon1 = degrees_to_radians(lon1)
    lat1 = degrees_to_radians(lat1)
    lon2 = degrees_to_radians(lon2)
    lat2 = degrees_to_radians(lat2)
    d = math.acos(math.sin(lat1)*math.sin(lat2) + math.cos(lat1)*math.cos(lat2) * math.cos(lon2-lon1)) * radius
    return d

In [None]:
import numpy as np
import math

# Adds vectors corresponding to a list of place names
def place_composition(mat, places) :
    places = places[np.isin(mat.rows)]
    if(len(places) > 1):
        mat = mat.loc[places] #[,places]
        mat = as.matrix(mat)
        vec = np.sum(mat, axis=1)
    else:
        vec = mat[places] #[places,]
    return vec
    

In [None]:
def place_join(places, gaz) :
    places = list(places,gaz)
    

In [None]:
def plot_timeseries() :

In [None]:
def semantic_footprint() :

In [None]:
def similarity_map() :