In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util
# Load the 'all-MiniLM-L6-v2' sentence-embedding model; sentences are encoded
# later by calling model.encode().
model = SentenceTransformer('all-MiniLM-L6-v2')

# Medium articles dataset:
# https://www.kaggle.com/datasets/dorianlazar/medium-articles-dataset
df = pd.read_csv('medium_data.csv')

# Combine title and subtitle into full_title ("title: subtitle"), falling back
# to the bare title when the subtitle is missing. This is done column-wise
# instead of with a row-by-row df.apply(..., axis=1).
subtitle_missing = df['subtitle'].isna()
df['full_title'] = df['title'].where(subtitle_missing, df['title'] + ': ' + df['subtitle'])

# Find how many articles share the same publication and reading time.
reading_times = sorted(df['reading_time'].unique())
publications = list(df['publication'].unique())
print("reading_times:", reading_times)
print("publications:", publications)
print()

# Count every (reading_time, publication) group in a single pass instead of
# re-filtering the whole frame once per pair.
group_sizes = df.groupby(['reading_time', 'publication']).size().to_dict()

# One paragraph per reading time; publications keep their first-appearance
# order and combinations without any article are skipped.
for t in reading_times:
    for p in publications:
        n_articles = group_sizes.get((t, p), 0)
        if n_articles != 0:
            # pad the publication name so the counts line up in a column
            print(t, p, ' '*(24-len(p)), n_articles)
    print()
    
# Keep only the rows of one (reading_time, publication) group: 5-minute
# articles published in 'The Startup'.
is_five_minute_read = df['reading_time'] == 5
is_the_startup = df['publication'] == 'The Startup'
df = df[is_five_minute_read & is_the_startup]
print(df.shape)

# Keep only the columns used downstream: the article id, the text feature
# (x_col) and the target (y_col).
y_col = 'claps'
x_col = 'full_title'
relevant_columns = ['id', x_col, y_col]
df = df[relevant_columns]
print(df.shape)

reading_times: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 31, 32, 33, 36, 40, 55]
publications: ['Towards Data Science', 'UX Collective', 'The Startup', 'The Writing Cooperative', 'Data Driven Investor', 'Better Marketing', 'Better Humans']

0 The Startup               2

1 Towards Data Science      3
1 UX Collective             1
1 The Startup               7
1 The Writing Cooperative   1
1 Data Driven Investor      2

2 Towards Data Science      22
2 UX Collective             19
2 The Startup               72
2 The Writing Cooperative   12
2 Data Driven Investor      40
2 Better Marketing          2
2 Better Humans             1

3 Towards Data Science      87
3 UX Collective             75
3 The Startup               340
3 The Writing Cooperative   87
3 Data Driven Investor      141
3 Better Marketing          17
3 Better Humans             2

4 Towards Data Science      162
4 UX Collective             86
4 The Startup     

In [2]:
# The first `train_size` rows become the training set and the following
# `test_size` rows become the test set; both get a fresh 0..n-1 index.
train_size = 10
test_size = 5

test_df = df.iloc[train_size:train_size + test_size].reset_index(drop=True)

# NOTE: `df` is rebound to the training slice here, so re-running this cell
# without re-running the loading cell slices the already-truncated frame.
df = df.iloc[:train_size].reset_index(drop=True)

print(test_df.shape)
print(df.shape)

(5, 3)
(10, 3)


In [3]:
# Row counts of the training frame and the test frame.
df_rows = df.shape[0]
test_df_rows = test_df.shape[0]

# distances[i][j] is the slot for (test row i, training row j); it holds None
# until it is filled in.
distances = [[None] * df_rows for _ in range(test_df_rows)]

k = 3  # number of nearest neighbours to keep per test row

In [4]:
# Calculate the similarity between every test title and every training title.
# Each title is encoded exactly once (previously every title was re-encoded on
# each pass of the nested loop), then all pairwise cosine similarities are
# computed in a single call.
if test_df_rows and df_rows:
    test_embeddings = model.encode(test_df[x_col].tolist())
    train_embeddings = model.encode(df[x_col].tolist())
    # similarities[i][j] compares test title i with training title j
    similarities = util.cos_sim(test_embeddings, train_embeddings)

    for i in range(test_df_rows):
        for j in range(df_rows):
            # NOTE(review): abs() makes a strongly negative similarity rank as
            # a close match; kept as in the original -- confirm it is intended.
            cos_sim = abs(float(similarities[i][j]))

            # store training data index along with cos_sim and that row's claps
            distances[i][j] = (j, cos_sim, df[y_col][j])

In [5]:
# Print one line per test row, showing "<similarity> <claps>, " for every
# training row (or "None  " for a slot that was never filled).
for row in distances:
    cells = []
    for n in row:
        if n is None:
            cells.append('None  ')
        else:
            cells.append(f"{n[1]:.3f} {n[2]!s}, ")
    print(''.join(cells))

0.013 11700, 0.067 1800, 0.025 1300, 0.009 148, 0.039 212, 0.068 235, 0.009 378, 0.027 9, 0.022 129, 0.032 70, 
0.238 11700, 0.048 1800, 0.026 1300, 0.003 148, 0.299 212, 0.167 235, 0.175 378, 0.149 9, 0.256 129, 0.177 70, 
0.224 11700, 0.058 1800, 0.011 1300, 0.013 148, 0.228 212, 0.127 235, 0.239 378, 0.071 9, 0.236 129, 0.207 70, 
0.079 11700, 0.014 1800, 0.131 1300, 0.039 148, 0.070 212, 0.162 235, 0.000 378, 0.260 9, 0.096 129, 0.078 70, 
0.119 11700, 0.009 1800, 0.139 1300, 0.079 148, 0.123 212, 0.021 235, 0.051 378, 0.136 9, 0.113 129, 0.023 70, 


In [6]:
# For each test row keep the k training entries with the highest cosine
# similarity (element 1 of each stored tuple), best match first.
knn_list = [
    sorted(neighbours, key=lambda entry: entry[1], reverse=True)[:k]
    for neighbours in distances
]

In [7]:
# Print one line per test row, listing each neighbour as
# "<train index>: <similarity> <claps>" with claps right-aligned to width 5.
for row in knn_list:
    line = ''.join(
        'None  ' if n is None else f"{n[0]}: {n[1]:.3f} {str(n[2]).rjust(5)},    "
        for n in row
    )
    print(line)

5: 0.068   235,    1: 0.067  1800,    4: 0.039   212,    
4: 0.299   212,    8: 0.256   129,    0: 0.238 11700,    
6: 0.239   378,    8: 0.236   129,    4: 0.228   212,    
7: 0.260     9,    5: 0.162   235,    2: 0.131  1300,    
2: 0.139  1300,    7: 0.136     9,    4: 0.123   212,    


In [8]:
# For each test data point, average the claps (element 2 of each neighbour
# tuple) of its nearest neighbours. The mean is taken over the neighbours that
# are actually present, so it no longer fails when fewer than k training rows
# exist; a test point with no neighbours at all keeps None.
expected_claps = [None for _ in range(test_df_rows)]
for i in range(test_df_rows):
    neighbour_claps = [neighbour[2] for neighbour in knn_list[i]]
    if neighbour_claps:
        expected_claps[i] = sum(neighbour_claps) / len(neighbour_claps)

In [9]:
# Show the predicted claps, one value per test row in test_df order
print(expected_claps)

[749.0, 4013.6666666666665, 239.66666666666666, 514.6666666666666, 507.0]
