
<img src="../assets/logo3.png" width="200" height="200" >
<div style="display:block"><br><br>
    <div style="display:block" align=left display=block> 
        <font size=5><b>Day 2 Hands-On 1 (Solution) - Numpy Basics</b></font><br>
        <hr/>
</div>

**Execute the cell below. By running this cell, a dataset will be loaded from `patents_small.csv` file. In this notebook, you are asked to analyze this data in several ways. There are three numpy arrays in this dataset:**
- `patent_number`: a unique identifier for each patetnt
- `patent features`: a vector of 16 features describing several properties of each patent
- `category`: the category to which a patent belongs 

In [2]:
import pandas as pd
import numpy as np

df = pd.read_csv('patents.csv')
df.head()
patent_number = df['publication_number'].to_numpy()
patent_features = df['patent_embedding'].to_numpy()
temp = []
for i in range(patent_features.size):
    s = str(patent_features[i])
    s1 = s.replace(r'\n', '')
    temp.append(
        np.array(s.split()[1:-1], dtype='float')[:16]
    )

patent_features = np.stack(temp)
patent_category = df['category']

<hr />

#### 1- Which patent has the highest norm? (Eucledian distance from origin)


In [4]:
import scipy.spatial.distance as ssd

# method 1
norms=np.sqrt(np.sum(patent_features**2,axis=1))

# method 2
origin = np.zeros((1, 16))
norms = ssd.cdist(origin, patent_features)

print(norms)
print('max', np.max(norms))
print('max id', np.argmax(norms))
print('category:',patent_category[np.argmax(norms)], '- patent:', patent_number[np.argmax(norms)])

[[0.1219383  0.19354777 0.16784897 ... 0.11544574 0.13356323 0.13686935]]
max 0.3530806305286102
max id 10839
category: 5 - patent: CH-527846-A


In [44]:
0.1+0.2+0.7==1

True

#### 2- Find the two patents that are the farthest from eachother.

In [11]:
import scipy.spatial.distance as ssd

norms=ssd.cdist(patent_features,patent_features)

In [19]:
# method 1 for finding the max index in a 2D matrix
m = np.argmax(norms)
row, column = np.unravel_index(m,norms.shape)

# method 2
row = m // norms.shape[0]
column = m % norms.shape[0]

print(row, column)
print(norms[row, column])
print(np.max(norms))

1661 9236
0.5612983586484407
0.5612983586484407


#### 3- Write a function that, given a patent number, finds its nearest neighbour.


In [20]:
dists = ssd.cdist(patent_features, patent_features)

(array([10839]),)


In [26]:
def nearest_neighbor(p_number):
    p_index = np.where(patent_number == p_number)[0][0]
    nearest_id = np.argsort(dists[p_index])[1]
    return nearest_id

nearest_neighbor('CH-527846-A')

6364

#### 4- For each patent category, find the cluster center. This quantity is computed by taking average of all patents associated with each cluster.

In [27]:
centroids = []
for cat in patent_category.unique():
    cat_patents = patent_features[patent_category==cat]
    centroids.append(
        np.mean(cat_patents, axis=0)
    )

print(centroids)

[array([ 0.01021772,  0.0140427 , -0.03571764,  0.05286253, -0.04302765,
       -0.00263517,  0.02233755, -0.04675915,  0.01272022,  0.03165236,
        0.01146286, -0.00024609,  0.01377522,  0.00555212,  0.02024696,
       -0.04467966]), array([ 0.01211396, -0.0304879 ,  0.05560378, -0.03702774,  0.00110319,
        0.01892597, -0.04493763,  0.01639101,  0.03405147,  0.01160055,
       -0.0039251 ,  0.01961012,  0.0012078 ,  0.02051051, -0.04779424,
       -0.01136447]), array([ 0.01086092, -0.02427292,  0.06917166, -0.04593048, -0.02812299,
       -0.0124727 , -0.04987288,  0.00655626,  0.0098301 , -0.01550384,
        0.00122531,  0.00426678,  0.00017979,  0.02210309, -0.02753392,
       -0.00829946]), array([ 0.01844678,  0.00991557, -0.05545595,  0.02615103, -0.07078419,
       -0.0115121 ,  0.04539117, -0.05906673, -0.02173693,  0.00203886,
        0.00052992,  0.02329754, -0.03247548,  0.03103352,  0.0140693 ,
       -0.06104154]), array([ 0.01498087,  0.02345642, -0.00569218,  

#### 5- How many patents have a nearest neighbour that is in the same category?

In [30]:
nearest_neighbor_category = []
for patent in patent_number:
    nearest_patent = nearest_neighbor(patent)
    nearest_category = patent_category[nearest_patent]
    nearest_neighbor_category.append(nearest_category)

nearest_neighbor_category = np.array(nearest_neighbor_category)
result = np.sum(patent_category == nearest_neighbor_category)
print(result)

13000


#### 6- What is the average and std of distances between every pair of patents?


In [31]:
dists = ssd.cdist(patent_features, patent_features)
mean_pair_distance = np.mean(dists)
std_pair_distance = np.std(dists)

print(f'mean pair distance = {mean_pair_distance}, std pair distance = {std_pair_distance}')

mean pair distance = 0.1774779588870755, std pair distance = 0.06172153433074445


#### 7- What is the average and std of distances between every pair of patents within a category? Using these calculated quantities, which cluster do you think is more condensed? Which one is more scattered?

In [35]:
distance_means, distance_stds = [], []
for i in patent_category.unique():
    cat_patents = patent_features[patent_category==i]
    within_pair_distance = ssd.cdist(cat_patents, cat_patents)
    distance_means.append(
        np.mean(within_pair_distance)
    )
    distance_stds.append(
        np.std(within_pair_distance)
    )

print(distance_means)
print(distance_stds)
print('the most scattered cluster: ', np.argmax(distance_means))
print('the most condensed cluster: ', np.argmin(distance_means))

[0.10235593475072612, 0.1040055310233456, 0.13098792252469746, 0.14095499322691948, 0.13610883703695129, 0.13874011203724912, 0.13052366960262599, 0.13459851471369638]
[0.030560683465341895, 0.03180215536017749, 0.03973015614729102, 0.04403433221841342, 0.04026897604426641, 0.04483107158434705, 0.03853325352892595, 0.04241648037596702]
the most scattered cluster:  3
the most condensed cluster:  0
