In [1]:
# Import the libraries we will be using
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Define the file paths for training and test data
train_file_path = 'lab2_train.csv'
test_file_path = 'lab2_test.csv'

# Import training and test data
train_data = pd.read_csv(train_file_path, delimiter=',')
test_data = pd.read_csv(test_file_path, delimiter=',')

In [3]:
train_data.head()

Unnamed: 0,user_from_id,user_to_id,is_like,is_match
0,1136,3141,False,False
1,2424,3174,False,False
2,1300,3590,False,False
3,800,2736,False,False
4,883,437,False,False


In [4]:
pivot = pd.pivot_table(train_data, index='user_from_id', columns='user_to_id', values='is_like', fill_value=np.nan)

In [5]:
pivot.iloc[0:10, 0:10]

user_to_id,0,1,2,3,4,5,6,7,8,9
user_from_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,,,,,,,,,,
2,0.0,,,,,,,,,
3,,,,,,,,,,
4,0.0,,,,,,,,,
5,,,,,,,,,,
6,0.0,,,,,,,,,
7,,,,,,,,,,
8,,,,,,,,,,
9,,0.0,,,,,,,,
10,,,,,,,,,,


In [6]:
users_from_ids = pivot.index.values
print(len(users_from_ids))
users_to_ids = pivot.columns.values
print(len(users_to_ids))

3149
3040


In [7]:
# intersection 1-1
# union 1-0, 1-1, 0-1
def get_Jaccard_similarity(row_1, row_2):
    intersection = np.sum((row_1 == row_2) & (row_1 == 1))
    union = np.sum((~np.isnan(row_1) & ~np.isnan(row_2)) & ((row_1 == 1) | (row_2 == 1)))
    if union == 0:
        return 0
    return intersection / union

In [8]:
def get_k_nearest_neighbors_rows(user_id):
    user_row = pivot.loc[user_id]
    similarities = pivot.apply(lambda row: get_Jaccard_similarity(user_row, row), axis=1)
    similarities = similarities.sort_values(ascending=False)
    return similarities

In [9]:
def get_k_nearest_neighbors_columns(user_id):
    user_column = pivot[user_id]
    similarities = pivot.apply(lambda column: get_Jaccard_similarity(user_column, column), axis=0)
    similarities = similarities.sort_values(ascending=False)
    return similarities

In [34]:
sorted_similarities_rows = {}
for i, user_id in enumerate(users_from_ids):
    print(f'{i+1}/{len(users_from_ids)}', end='\r')
    sorted_similarities_rows[user_id] = get_k_nearest_neighbors_rows(user_id)

3149/3149

In [35]:
sorted_similarities_columns = {}
for i, user_id in enumerate(users_to_ids):
    print(f'{i+1}/{len(users_to_ids)}', end='\r')
    sorted_similarities_columns[user_id] = get_k_nearest_neighbors_columns(user_id)

3040/3040

In [10]:
import pickle
with open('rows.pickle', 'rb') as file:
    sorted_similarities_rows = pickle.load(file)
with open('columns.pickle', 'rb') as file:
    sorted_similarities_columns = pickle.load(file)

In [36]:
# import pickle
# with open('rows1.pickle', 'wb') as file:
#     pickle.dump(sorted_similarities_rows, file)
# with open('columns1.pickle', 'wb') as file:
#     pickle.dump(sorted_similarities_columns, file)

In [11]:
sorted_similarities_rows[1]

user_from_id
1       1.0
3027    1.0
1254    1.0
3183    1.0
1290    0.5
       ... 
1095    0.0
1096    0.0
1097    0.0
1098    0.0
3716    0.0
Length: 3149, dtype: float64

In [12]:
sorted_similarities_columns[2]

user_to_id
2746    1.0
1053    1.0
2       1.0
2980    1.0
1424    1.0
       ... 
1322    0.0
1323    0.0
1324    0.0
1328    0.0
3624    0.0
Length: 3040, dtype: float64

In [7]:
num_of_likes = {}
for user_id in users_from_ids:
    num_of_likes[user_id] = np.sum(pivot.loc[user_id] == 1)
num_of_liked = {}
for user_id in users_to_ids:
    num_of_liked[user_id] = np.sum(pivot[user_id] == 1)

In [8]:
print(max(num_of_likes.values()))
print(max(num_of_liked.values()))

79
57


In [9]:
likes_median = np.median(list(num_of_likes.values()))
print(likes_median)
liked_median = np.median(list(num_of_liked.values()))
print(liked_median)
likes_mean = np.mean(list(num_of_likes.values()))
print(likes_mean)
liked_mean = np.mean(list(num_of_liked.values()))
print(liked_mean)

1.0
1.0
3.8675770085741505
4.00625


In [16]:
def get_first_k_choices_rows(user_from_id, user_to_id, k=None, s=None):
    if (user_from_id not in users_from_ids) or (user_to_id not in users_to_ids):
        return []
    neighbors = sorted_similarities_rows[user_from_id]
    choices = []
    for neighbor_id, similarity in neighbors.items():
        if pivot.loc[neighbor_id, user_to_id] == 1:
            choices.append((neighbor_id, similarity, True))
        if pivot.loc[neighbor_id, user_to_id] == 0:
            choices.append((neighbor_id, similarity, False))
        if k and len(choices) == k:
            break
        if s and similarity < s - 1e-6:
            break
    return choices

In [17]:
def get_first_k_choices_columns(user_from_id, user_to_id, k=None, s=None):
    if (user_from_id not in users_from_ids) or (user_to_id not in users_to_ids):
        return []
    neighbors = sorted_similarities_columns[user_to_id]
    choices = []
    for neighbor_id, similarity in neighbors.items():
        if pivot.loc[user_from_id, neighbor_id] == 1:
            choices.append((neighbor_id, similarity, True))
        if pivot.loc[user_from_id, neighbor_id] == 0:
            choices.append((neighbor_id, similarity, False))
        if k and len(choices) == k:
            break
        if s and similarity < s - 1e-6:
            break
    return choices

In [56]:
unknown = []
def get_is_like(user_from_id, user_to_id) -> bool:
    global unknown
    if (user_from_id not in users_from_ids) and (user_to_id not in users_to_ids):
        unknown.append((user_from_id, user_to_id))
        return False
    all_choices = get_first_k_choices_rows(user_from_id, user_to_id, s=0.1)
    all_choices.extend(get_first_k_choices_columns(user_from_id, user_to_id, s=0.1))
    choices = []
    for choice in all_choices:
        choices.append(choice[2])
    if len(choices) == 0:
        if user_from_id in users_from_ids:
            return num_of_likes[user_from_id] >= 70
        if user_to_id in users_to_ids:
            return num_of_liked[user_to_id] >= 50
        unknown.append((user_from_id, user_to_id))
        return False
    return np.mean(choices) >= 0.5

In [57]:
total, trues = 0, 0
with open('submission.csv', 'w') as file:
    file.write('index,is_like\n')
    for i in range(len(test_data)):
        print(f'{i+1}/{len(test_data)}', end='\r')
        row = test_data.iloc[i]
        user_from_id = row['user_from_id']
        user_to_id = row['user_to_id']
        is_like = get_is_like(user_from_id, user_to_id)
        
        if is_like:
            trues += 1
        total += 1
        
        file.write(f'{i},{is_like}\n')

16203/16203

In [58]:
print(len(unknown))

417


In [59]:
print(trues/total*100)

7.3628340430784425


In [80]:
import pickle
with open('rows.pickle', 'wb') as file:
    pickle.dump(sorted_similarities_rows, file)
with open('columns.pickle', 'wb') as file:
    pickle.dump(sorted_similarities_columns, file)

In [50]:
x = pivot.loc[1]
y = pivot.loc[3027]
z = ((~np.isnan(x)) & (~np.isnan(y)))

2

In [None]:
users_total = max(users_from_ids.max(), users_to_ids.max())

In [189]:
matrix = np.full((users_total+1, users_total+1), np.nan)

In [190]:
for i in range(len(train_data)):
    user_from_id = train_data.iloc[i]['user_from_id']
    user_to_id = train_data.iloc[i]['user_to_id']
    is_like = train_data.iloc[i]['is_like']
    is_match = train_data.iloc[i]['is_match']
    matrix[user_from_id, user_to_id] = is_like
    if is_like == 1:
        matrix[user_to_id, user_from_id] = is_match

In [191]:
def get_k_nearest_neighbors_rows_matrix(user_id):
    user_row = matrix[user_id, :]
    similarities = []
    for i in range(users_total+1):
        row = matrix[i, :]
        similarity = get_Jaccard_similarity(user_row, row)
        similarities.append((similarity, i))
    similarities = sorted(similarities, key=lambda x: x[0], reverse=True)
    return similarities

In [192]:
def get_k_nearest_neighbors_columns_matrix(user_id):
    user_column = matrix[:, user_id]
    similarities = []
    for i in range(users_total+1):
        column = matrix[:, i]
        similarity = get_Jaccard_similarity(user_column, column)
        similarities.append((similarity, i))
    similarities = sorted(similarities, key=lambda x: x[0], reverse=True)
    return similarities

In [193]:
sorted_rows = []
for i in range(users_total+1):
    print(f'{i}/{users_total}', end='\r')
    sorted_rows.append(get_k_nearest_neighbors_rows_matrix(i))

171/3716

KeyboardInterrupt: 

In [None]:
sorted_columns = []
for i in range(users_total+1):
    print(f'{i}/{users_total}', end='\r')
    sorted_columns.append(get_k_nearest_neighbors_columns_matrix(i))

In [163]:
print(len(sorted_rows))
print(len(sorted_columns))

3717
3717


In [155]:
def get_first_k_choices_rows_matrix(user_from_id, user_to_id, k):
    choices = []
    for similarity, neighbor_id in sorted_rows[user_from_id]:
        if matrix[neighbor_id, user_to_id] == 1:
            choices.append((neighbor_id, similarity, True))
        if matrix[neighbor_id, user_to_id] == 0:
            choices.append((neighbor_id, similarity, False))
        if len(choices) == k:
            break
    return choices

In [159]:
def get_first_k_choices_columns_matrix(user_from_id, user_to_id, k):
    choices = []
    for similarity, neighbor_id in sorted_columns[user_to_id]:
        if matrix[user_from_id, neighbor_id] == 1:
            choices.append((neighbor_id, similarity, True))
        if matrix[user_from_id, neighbor_id] == 0:
            choices.append((neighbor_id, similarity, False))
        if len(choices) == k:
            break
    return choices

In [187]:
unknown = []
def get_is_like_matrix(user_from_id, user_to_id) -> bool:
    global unknown
    if user_from_id > users_total or user_to_id > users_total:
        return False
    choices_rows = get_first_k_choices_rows_matrix(user_from_id, user_to_id, 1000)
    choices_columns = get_first_k_choices_columns_matrix(user_from_id, user_to_id, 1000)
    if len(choices_rows) == 0 and len(choices_columns) == 0:
        return False
    numerator, denominator = 0, 0
    for choice in choices_rows:
        numerator += choice[1] * choice[2]
        denominator += 1
    for choice in choices_columns:
        numerator += choice[1] * choice[2]
        denominator += 1
    return numerator / denominator >= 0.5

In [188]:
with open('submission.csv', 'w') as file:
    file.write('index,is_like\n')
    for i in range(len(test_data)):
        print(f'{i+1}/{len(test_data)}', end='\r')
        row = test_data.iloc[i]
        user_from_id = row['user_from_id']
        user_to_id = row['user_to_id']
        is_like = get_is_like_matrix(user_from_id, user_to_id)
        file.write(f'{i},{is_like}\n')

16203/16203

In [183]:
print(len(unknown))

1890


In [184]:
x, y = unknown[0]

In [180]:
sorted_rows[x]

[(1.0, 1),
 (1.0, 2),
 (1.0, 3),
 (1.0, 6),
 (1.0, 8),
 (1.0, 19),
 (1.0, 20),
 (1.0, 21),
 (1.0, 25),
 (1.0, 28),
 (1.0, 29),
 (1.0, 30),
 (1.0, 31),
 (1.0, 33),
 (1.0, 36),
 (1.0, 37),
 (1.0, 38),
 (1.0, 43),
 (1.0, 50),
 (1.0, 56),
 (1.0, 57),
 (1.0, 58),
 (1.0, 61),
 (1.0, 62),
 (1.0, 63),
 (1.0, 64),
 (1.0, 65),
 (1.0, 66),
 (1.0, 68),
 (1.0, 69),
 (1.0, 70),
 (1.0, 75),
 (1.0, 83),
 (1.0, 90),
 (1.0, 91),
 (1.0, 92),
 (1.0, 93),
 (1.0, 101),
 (1.0, 107),
 (1.0, 108),
 (1.0, 111),
 (1.0, 117),
 (1.0, 127),
 (1.0, 132),
 (1.0, 142),
 (1.0, 150),
 (1.0, 153),
 (1.0, 155),
 (1.0, 156),
 (1.0, 166),
 (1.0, 170),
 (1.0, 174),
 (1.0, 175),
 (1.0, 190),
 (1.0, 191),
 (1.0, 192),
 (1.0, 197),
 (1.0, 203),
 (1.0, 208),
 (1.0, 210),
 (1.0, 213),
 (1.0, 215),
 (1.0, 217),
 (1.0, 220),
 (1.0, 225),
 (1.0, 228),
 (1.0, 233),
 (1.0, 234),
 (1.0, 242),
 (1.0, 245),
 (1.0, 252),
 (1.0, 253),
 (1.0, 254),
 (1.0, 255),
 (1.0, 257),
 (1.0, 259),
 (1.0, 260),
 (1.0, 261),
 (1.0, 263),
 (1.0, 266),
 (