In [None]:
import pandas as pd
import numpy as np
import json

from sklearn import preprocessing
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

import pickle


In [None]:
# Load the four input tables used by the similarity pipeline.
user_info = pd.read_parquet(
    "../data/submit_data/user_info_filter_asset.parquet"
)
user_frequency = pd.read_parquet(
    "../data/submit_data/user_frequency.parquet"
)
contract_interact = pd.read_parquet(
    "../data/submit_data/contract_interact.parquet"
)
# The stored index of user_interact is discarded in favour of a fresh 0..n-1 index.
user_interact = pd.read_parquet(
    "../data/submit_data/user_interact.parquet"
).reset_index(drop=True)

In [None]:
# Clone the project repository and check out the user-transactions branch.
#
# The GitHub token is read from the GIT_ACCESS_TOKEN environment variable so
# that no credential is stored in the notebook.
# NOTE(review): a personal access token used to be hard-coded in this cell; it
# must be treated as leaked and revoked on GitHub.
import os

from git import Repo

git_access_token = os.environ.get("GIT_ACCESS_TOKEN")
git_url = "https://github.com/IT4043E-IT5384-2023/IT4043E_group21_problem4.git"
git_url = git_url.replace("https://", "")

clone_dir = './bigdata'
if os.path.isdir(os.path.join(clone_dir, ".git")):
    # Re-running the cell: reuse the existing checkout instead of failing on a
    # second clone into a non-empty directory.
    repo = Repo(clone_dir)
else:
    if not git_access_token:
        raise RuntimeError(
            "Set the GIT_ACCESS_TOKEN environment variable to a GitHub token "
            "with read access to the repository before running this cell."
        )
    repo = Repo.clone_from(f"https://{git_access_token}:x-oauth-basic@" + git_url, clone_dir)
repo.git.checkout("feature/user_transactions")


In [None]:
# Read the participants JSON document into memory.
with open("../data/submit_data/participants.json", encoding="utf-8") as f:
    participants = json.load(f)

In [None]:
def get_time_distance_matrix(user_frequency, group_users):
  """Pairwise cosine similarity of the time-frequency features of a user group.

  Rows of ``user_frequency`` whose ``user`` is in ``group_users`` are kept and
  sorted by ``user``; each list-like ``time_frequency`` entry is expanded into
  ``X_Time<i>`` columns, and every column except ``user``,
  ``block_timestamp_list`` and ``time_frequency`` is passed to
  ``cosine_similarity``.

  Args:
    user_frequency: DataFrame with at least the columns ``user``,
      ``block_timestamp_list`` and ``time_frequency``.
    group_users: collection of user identifiers to keep.

  Returns:
    The square similarity matrix, rows/columns ordered by sorted ``user``.
  """
  in_group = user_frequency['user'].isin(group_users)
  selected = user_frequency[in_group]
  num = selected.user.nunique()
  selected = selected.sort_values("user").reset_index(drop=True)

  # The width of the expanded feature block is taken from the first row.
  n_bins = len(selected.time_frequency[0])
  time_columns = [f"X_Time{i}" for i in range(n_bins)]
  selected[time_columns] = selected.time_frequency.apply(pd.Series)

  print(f"number of users: {num}")
  print(f"columns: {selected['user'].head(10)}")

  features = selected.drop(['user','block_timestamp_list','time_frequency'], axis=1)
  return cosine_similarity(features)


def get_time_feature(user_frequency):
  """Build the per-user time-frequency feature table.

  A copy of ``user_frequency`` is sorted by ``user``, each list-like
  ``time_frequency`` entry is expanded into ``X_Time<i>`` columns, and the
  ``user``, ``block_timestamp_list`` and ``time_frequency`` columns are
  removed. The input frame is not modified.

  Args:
    user_frequency: DataFrame with at least the columns ``user``,
      ``block_timestamp_list`` and ``time_frequency``.

  Returns:
    DataFrame of the remaining columns, one row per input row, ordered by
    sorted ``user`` with a fresh 0..n-1 index.
  """
  features = user_frequency.copy()
  num = features.user.nunique()
  features = features.sort_values("user").reset_index(drop=True)

  # The width of the expanded feature block is taken from the first row.
  n_bins = len(features.time_frequency[0])
  time_columns = [f"X_Time{i}" for i in range(n_bins)]
  features[time_columns] = features.time_frequency.apply(pd.Series)

  print(f"number of users: {num}")
  print(f"columns: {features['user'].head(10)}")

  return features.drop(['user','block_timestamp_list','time_frequency'], axis=1)

def get_token_distance_matrix(user_info, group_users):
  """Pairwise cosine similarity of the per-asset USD values of a user group.

  ``user_info`` rows belonging to ``group_users`` are pivoted into one row per
  ``user`` and one column per ``asset`` holding ``priceInUSD`` (aggregated with
  ``pivot_table``'s default aggregation); missing user/asset combinations
  become 0.

  Args:
    user_info: DataFrame with the columns ``user``, ``asset`` and
      ``priceInUSD``.
    group_users: collection of user identifiers to keep.

  Returns:
    The square similarity matrix, rows/columns ordered by sorted ``user``.
    Only users that actually occur in ``user_info`` get a row.
  """
  holdings = user_info[user_info['user'].isin(group_users)]
  asset_table = (
      holdings
      .pivot_table(index="user", columns="asset", values="priceInUSD")
      .reset_index()
      .sort_values('user')
      .reset_index(drop=True)
      .drop('user', axis=1)
      .fillna(0)
  )
  return cosine_similarity(asset_table)


def get_contract_users(user_lst, df):
  """Append zero-count rows for users that have no contract interactions.

  Args:
    user_lst: iterable of user identifiers that must be present in the result.
    df: DataFrame with a ``user`` column plus ``nb_contract_interact`` and
      ``nb_contract_address``.

  Returns:
    A new DataFrame: the rows of ``df`` followed by one row (both counters 0)
    for every user of ``user_lst`` missing from ``df['user']``. The order of
    the appended rows is unspecified.
  """
  already_present = set(df['user'].unique().tolist())
  missing_users = list(set(user_lst) - already_present)
  zero_rows = pd.DataFrame({
      'user': missing_users,
      'nb_contract_interact': 0,
      'nb_contract_address': 0,
  })
  return pd.concat([df, zero_rows], ignore_index=True)

def get_other_users(user_lst, df):
  """Append zero-count rows for users that have no user-to-user interactions.

  Args:
    user_lst: iterable of user identifiers that must be present in the result.
    df: DataFrame with a ``user`` column plus ``nb_user_interact`` and
      ``nb_user_address``.

  Returns:
    A new DataFrame: the rows of ``df`` followed by one row (both counters 0)
    for every user of ``user_lst`` missing from ``df['user']``. The order of
    the appended rows is unspecified.
  """
  already_present = set(df['user'].unique().tolist())
  missing_users = list(set(user_lst) - already_present)
  zero_rows = pd.DataFrame({
      'user': missing_users,
      'nb_user_interact': 0,
      'nb_user_address': 0,
  })
  return pd.concat([df, zero_rows], ignore_index=True)

def get_connection_distance_matrix(user_lst, contract_interact, user_interact, group_users):
  """Pairwise cosine similarity of the interaction counters of a user group.

  For every user four counters are assembled: the number of rows and the summed
  ``count`` in ``contract_interact``, and the number of rows and the summed
  ``count`` in ``user_interact``. Users of ``user_lst`` without any row in a
  table get zeros for that table's counters.

  Args:
    user_lst: iterable of user identifiers that should receive zero counters
      when they are absent from an interaction table.
    contract_interact: DataFrame with the columns ``user``,
      ``contract_address`` and ``count``.
    user_interact: DataFrame with the columns ``user``, ``other_user`` and
      ``count``.
    group_users: collection of user identifiers to keep in the result.

  Returns:
    The square similarity matrix, rows/columns ordered by sorted ``user``.
  """
  contract_distance = (
      contract_interact.groupby('user')
      .agg({"contract_address": "count", "count": "sum"})
      .reset_index()
      .rename(columns={"count": "nb_contract_interact", "contract_address": "nb_contract_address"})
  )
  contract_distance_full = get_contract_users(user_lst, contract_distance)

  user_distance = (
      user_interact.groupby('user')
      .agg({"other_user": "count", "count": "sum"})
      .reset_index()
      .rename(columns={"count": "nb_user_interact", "other_user": "nb_user_address"})
  )
  user_distance_full = get_other_users(user_lst, user_distance)

  connection_distance = contract_distance_full.merge(user_distance_full, on='user', how='outer')
  connection_distance = connection_distance[connection_distance['user'].isin(group_users)]
  connection_distance = connection_distance.sort_values("user").reset_index(drop=True)

  # A user that occurs in only one interaction table and is not in user_lst
  # comes out of the outer merge with NaN counters for the other table;
  # cosine_similarity rejects NaN, so those counters are treated as 0.
  connection_features = connection_distance.drop(columns="user").fillna(0)
  return cosine_similarity(connection_features)

def get_top5_closest(row, distance_matrix, group_users):
    """Return the (up to) five wallets most similar to the wallet of ``row``.

    Args:
        row: mapping/Series whose ``'Index'`` entry is the position of the
            wallet in ``distance_matrix`` and ``group_users``.
        distance_matrix: square array of pairwise similarities (higher means
            more similar), aligned with ``group_users``.
        group_users: sequence of wallet identifiers, one per matrix row.

    Returns:
        pd.Series with ``'Top 5 similar wallet'`` (list of identifiers, most
        similar first) and ``'Similarity'`` (the matching similarity values).
        Fewer than five entries are returned when the group is smaller.
    """
    index = row['Index']
    distances = np.asarray(distance_matrix[index])

    # Rank every wallet by decreasing similarity, then drop the wallet itself
    # by position. Slicing off a fixed number of leading entries is not
    # reliable: the wallet is not guaranteed to rank first (ties, zero
    # vectors), and skipping two entries discards the best real neighbour.
    ranked = np.argsort(distances)[::-1]
    nearest = ranked[ranked != index][:5]

    closest_elements = [group_users[i] for i in nearest]
    similarities = distances[nearest].tolist()
    return pd.Series({'Top 5 similar wallet': closest_elements, 'Similarity': similarities})



def get_distance_matrix(user_lst, contract_interact, user_interact, user_info, user_cluster, verbose=False):
    """Find, for every user, the most similar users inside the same cluster.

    For each cluster present in ``user_cluster`` the token similarity matrix
    and the connection similarity matrix are averaged, and the top neighbours
    of every user are extracted with ``get_top5_closest``.

    Args:
        user_lst: user identifiers forwarded to
            ``get_connection_distance_matrix``.
        contract_interact: contract interaction table, forwarded as is.
        user_interact: user interaction table, forwarded as is.
        user_info: per-user asset table forwarded to
            ``get_token_distance_matrix``.
        user_cluster: DataFrame with the columns ``user`` and ``cluster``.
        verbose: when True, print each cluster's averaged similarity matrix.

    Returns:
        DataFrame with the columns ``user``, ``Top 5 similar wallet`` and
        ``Similarity``, clusters concatenated in ascending cluster order.

    Raises:
        ValueError: if the two similarity matrices of a cluster do not have one
            row per user of that cluster.
    """
    result_df_list = []
    # Iterate over the clusters that actually exist instead of assuming a
    # fixed number of consecutive cluster ids.
    cluster_ids = sorted(user_cluster['cluster'].dropna().unique())
    for cluster_id in cluster_ids:
        in_cluster = user_cluster['cluster'] == cluster_id
        group_users = sorted(user_cluster.loc[in_cluster, 'user'].tolist())

        token_matrix = get_token_distance_matrix(user_info, group_users)
        connection_matrix = get_connection_distance_matrix(user_lst, contract_interact, user_interact, group_users)

        # Both matrices must be aligned with group_users; otherwise the average
        # either fails or (for a 1x1 matrix) silently broadcasts.
        expected_shape = (len(group_users), len(group_users))
        if token_matrix.shape != expected_shape or connection_matrix.shape != expected_shape:
            raise ValueError(
                f"cluster {cluster_id}: expected similarity matrices of shape {expected_shape}, "
                f"got token {token_matrix.shape} and connection {connection_matrix.shape}"
            )

        distance_matrix = (token_matrix + connection_matrix) / 2
        if verbose:
            print(distance_matrix)

        df = pd.DataFrame({'Index': range(len(distance_matrix)), 'user': group_users})
        neighbours = df.apply(lambda x: get_top5_closest(x, distance_matrix, group_users), axis=1)
        result_df_list.append(pd.concat([df, neighbours], axis=1).drop(columns='Index'))

    if not result_df_list:
        # pd.concat refuses an empty list; return an empty, correctly shaped frame.
        return pd.DataFrame(columns=['user', 'Top 5 similar wallet', 'Similarity'])

    return pd.concat(result_df_list).reset_index(drop=True)

In [None]:
# Cluster users with KMeans on their L2-normalised time-frequency features.
time_features = get_time_feature(user_frequency)
trainn = time_features.copy()

trainn_normalized = preprocessing.normalize(trainn, norm='l2')
df_normalized = pd.DataFrame(trainn_normalized, columns=trainn.columns)

kmeans = KMeans(n_clusters=25, random_state=42, algorithm="elkan")
# fit() hands back the fitted estimator, whose labels_ hold one cluster id per row.
cluster_labels = kmeans.fit(df_normalized)
result = pd.DataFrame({'cluster': cluster_labels.labels_})

# Rebuild the user ordering used by get_time_feature (sorted by user, fresh
# index) so that the positional join attaches each label to its user.
user_frequency_filter = user_frequency.copy().sort_values("user").reset_index(drop=True)
num = user_frequency_filter.user.nunique()
user_cluster = user_frequency_filter.join(result)[['user', 'cluster']]
user_lst = user_info.user.unique().tolist()


In [None]:
# Compute the per-cluster top-5 similar wallets for every user.
dis_matrix = get_distance_matrix(
    user_lst=user_lst,
    contract_interact=contract_interact,
    user_interact=user_interact,
    user_info=user_info,
    user_cluster=user_cluster,
)

In [None]:
# Persist the similarity table (row index included) to the Kaggle working directory.
dis_matrix.to_csv(path_or_buf='/kaggle/working/user_similarity.csv', index=True)