In [12]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA

In [8]:
# Tab-separated node-type file: each line is read below as "<node_id>\t<node_type>",
# where the types handled by this notebook are "U" (user) and "P" (POI).
node_type_file_path = "../cleaned_data/graph_data/node_type_ID.txt"

In [9]:
# Parse the node-type file into a lookup table plus ordered ID lists.
# The lists (not sets, despite their names) keep first-seen file order because
# the position of an ID in its list is later used as that node's row index.
node_type_index = {}  # node ID -> node type
user_node_set = []  # IDs whose type is "U"
poi_node_set = []  # IDs whose type is "P"

with open(node_type_file_path) as f:
    # Stream the file instead of materialising every line with readlines().
    for line_no, line in enumerate(f, start=1):
        line_content = line.strip().split("\t")
        if line_content == [""]:
            # Blank line (e.g. a trailing newline at end of file): nothing to record.
            continue
        if len(line_content) < 2:
            raise ValueError(
                f"{node_type_file_path}:{line_no}: expected '<node>\\t<type>', got {line!r}"
            )
        node, node_type = line_content[0], line_content[1]

        if node in node_type_index:
            # Keep the first record of a repeated ID; appending it again would
            # put the same node at two list positions and break the ID -> row
            # index mapping built from these lists.
            continue
        node_type_index[node] = node_type

        if node_type == "P":
            poi_node_set.append(node)
        elif node_type == "U":
            user_node_set.append(node)

In [10]:
# Give every user / POI node a 0-based index equal to its position in the
# corresponding ID list.
user_node_set_map = dict(zip(user_node_set, range(len(user_node_set))))
poi_node_set_map = dict(zip(poi_node_set, range(len(poi_node_set))))

In [11]:
def minus_mean(x):
    """Return ``x`` shifted by its own mean.

    ``x`` must provide a ``.mean()`` method and support subtraction of the
    value that method returns (e.g. a NumPy array or pandas Series).
    """
    center = x.mean()
    shifted = x - center
    return shifted

## Processing Features of User Nodes

In [13]:
# Load the training-split user table, indexed by user_id.
all_user = pd.read_csv("../cleaned_data/splited_data/train/new_user.csv").set_index(
    "user_id"
)

# Replace the raw "elite" field by the number of comma-separated entries it
# holds; a missing value (read back as NaN) counts as 0.
all_user["elite"] = all_user["elite"].map(
    lambda x: len(str(x).split(",")) if str(x) != "nan" else 0
)

# Keep only the three feature columns. The explicit copy makes the frame
# independent of the full table, so the column assignments below do not write
# into a slice (which is what triggers pandas' SettingWithCopyWarning).
all_user = all_user[["useful", "elite", "average_stars"]].copy()

# Scale each feature column to unit L2 norm over all users.
for column in ("useful", "elite", "average_stars"):
    norm = np.linalg.norm(all_user[column])
    if norm == 0:
        # An all-zero column would become all-NaN after 0/0; leave it as zeros.
        continue
    all_user[column] = all_user[column] / norm

In [14]:
# Build the user feature matrix with one row per key of user_node_set_map, in
# that mapping's key order. A single label-based selection replaces the
# per-user Python loop; an unknown user ID still raises KeyError.
assert all_user.index.is_unique, "duplicate user_id rows would misalign user features"
user_fe = all_user.loc[list(user_node_set_map)].to_numpy(dtype=float)

In [15]:
# Load the training-split business (POI) table, indexed by business_id.
all_poi = pd.read_csv(
    "../cleaned_data/splited_data/train/new_business.csv"
).set_index("business_id")

# Keep the three columns used for POI features, in this order (other cells may
# rely on "categories" being the last column). The explicit copy avoids
# assigning into a slice of the full table (pandas' SettingWithCopyWarning).
all_poi = all_poi[["stars", "is_open", "categories"]].copy()

# Scale "stars" to unit L2 norm over all POIs.
stars_norm = np.linalg.norm(all_poi["stars"])
if stars_norm != 0:
    # Skipped for an all-zero column, which would otherwise turn into NaN (0/0).
    all_poi["stars"] = all_poi["stars"] / stars_norm

# Turn the ", "-separated category string into a list, dropping the two
# categories "Food" and "Restaurants"; a missing value becomes an empty list.
all_poi["categories"] = all_poi["categories"].map(
    lambda x: [i for i in str(x).split(", ") if i not in ("Food", "Restaurants")]
    if str(x) != "nan"
    else []
)

In [16]:
# Tunables for the category features.
MIN_POIS_PER_CATEGORY = 20  # keep a category only if MORE than this many POIs list it
N_CATEGORY_COMPONENTS = 25  # number of PCA components kept per POI
PCA_RANDOM_STATE = 0  # fixed seed so a randomized SVD solver gives repeatable output

# For every category, count how many POIs list it (presence per POI, so a
# category repeated inside one POI's list is still counted once). One pass over
# the data replaces the quadratic sum(lists, []) and the per-category rescans.
cat_poi_counts = {}
for cats in all_poi["categories"]:
    for cat in set(cats):
        cat_poi_counts[cat] = cat_poi_counts.get(cat, 0) + 1

# Sorted so the category (column) order is the same in every run; iterating a
# set of strings can change order between Python processes.
poi_all_cats = sorted(cat_poi_counts)
poi_filtered_cats = [
    cat for cat in poi_all_cats if cat_poi_counts[cat] > MIN_POIS_PER_CATEGORY
]

# Multi-hot matrix: one row per POI (all_poi order), one column per kept category.
cat_indicator = np.array(
    [
        [cat in cats for cat in poi_filtered_cats]
        for cats in map(set, all_poi["categories"])
    ],
    dtype=float,
)

# Compress the multi-hot category vectors with PCA.
pca = PCA(n_components=N_CATEGORY_COMPONENTS, random_state=PCA_RANDOM_STATE)
X = pca.fit_transform(cat_indicator)

# business_id -> list of PCA coordinates for that POI.
X_dict = dict(zip(all_poi.index, X.tolist()))

# Displayed for inspection: number of kept categories and the share of variance
# retained by the PCA components.
len(poi_filtered_cats), pca.explained_variance_ratio_.sum()

(60, 0.7910258962677772)

In [17]:
# Build the POI feature matrix with one row per key of poi_node_set_map, in
# that mapping's key order: [stars, is_open] followed by the POI's PCA category
# coordinates from X_dict. The scalar columns are selected by name instead of
# "everything but the last column", so the result does not depend on column
# order. An unknown business ID still raises KeyError.
assert all_poi.index.is_unique, "duplicate business_id rows would misalign POI features"
poi_ids = list(poi_node_set_map)
poi_scalar_fe = all_poi.loc[poi_ids, ["stars", "is_open"]].to_numpy(dtype=float)
poi_fe = np.array(
    [scalars.tolist() + X_dict[k] for scalars, k in zip(poi_scalar_fe, poi_ids)]
)

In [18]:
# Write both feature matrices next to the graph data as .npy files.
for out_path, feature_matrix in (
    ("../cleaned_data/graph_data/user_fe.npy", user_fe),
    ("../cleaned_data/graph_data/poi_fe.npy", poi_fe),
):
    np.save(out_path, feature_matrix)