In [2]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from natsort import natsorted
from tqdm import tqdm
import os
import numpy as np

In [None]:
# Shell escape: print the packages reported by `conda list` (environment snapshot).
!conda list

In [3]:
%%time

# Load the training edge table. Later cells read its `ego_id`, `u` and `v`
# columns (and `x1`..`x3` as edge attributes when drawing a graph).
# The recorded output shows this read taking about 2 min 20 s of wall time.
train = pd.read_csv("./train_dataset_VK/train.csv")

CPU times: total: 1min 58s
Wall time: 2min 20s


61786

In [9]:
%%time

# Load the node attribute table. The feature-building cell filters it by
# `ego_id` and joins it to the edges on the node id column `u`.
attr = pd.read_csv("./train_dataset_VK/attr.csv")

CPU times: total: 9.05 s
Wall time: 14.8 s


In [None]:
def draw_ego_graph(dataframe, ego_id, seed=123):
    """Plot the neighbourhood of the highest-degree node of one ego network.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Edge list with columns ``ego_id``, ``u``, ``v``, ``x1``, ``x2`` and ``x3``.
    ego_id
        Value of the ``ego_id`` column selecting the network to draw.
    seed : int, default 123
        Seed passed to ``nx.spring_layout`` so the layout is reproducible.

    Raises
    ------
    ValueError
        If ``dataframe`` contains no edges for ``ego_id``.
    """
    pd_ego_graph = dataframe[dataframe.ego_id == ego_id]
    if pd_ego_graph.empty:
        # Without this guard the hub lookup below would fail with an opaque IndexError.
        raise ValueError(f"no edges found for ego_id={ego_id!r}")

    ego_graph = nx.from_pandas_edgelist(
        pd_ego_graph, source='u', target='v', edge_attr=['x1', 'x2', 'x3']
    )

    # Pick the node with the highest degree. A plain lambda is used as the sort
    # key so the function does not depend on `operator.itemgetter`, which this
    # notebook never imports.
    node_and_degree = ego_graph.degree()
    largest_hub, _degree = sorted(node_and_degree, key=lambda pair: pair[1])[-1]

    # Sub-graph centred on the hub.
    hub_ego = nx.ego_graph(ego_graph, largest_hub)

    # Draw every node small and blue; the seeded layout keeps the picture reproducible.
    pos = nx.spring_layout(hub_ego, seed=seed)
    nx.draw(hub_ego, pos, node_color="b", node_size=50, with_labels=True)

    # Highlight the hub itself as a large red node.
    options = {"node_size": 300, "node_color": "r"}
    nx.draw_networkx_nodes(hub_ego, pos, nodelist=[largest_hub], **options)
    plt.show()

In [26]:
def add_new_features_and_dump_to_csv(df, attr_df, fill_value=-1, save_folder='./output'):
    """Build per-edge attribute features and write one CSV per ego network.

    For every distinct ``ego_id`` in ``df`` the edges of that network are joined
    with the node attributes of both endpoints (``u`` and ``v``) and the result
    is written to ``<save_folder>/<ego_id>.csv``.

    Parameters
    ----------
    df : pandas.DataFrame
        Edge table with at least the columns ``ego_id``, ``u`` and ``v``.
    attr_df : pandas.DataFrame
        Node attribute table keyed by ``ego_id`` and ``u``; it must provide the
        columns ``sex``, ``age``, ``city_id``, ``school`` and ``university``.
    fill_value : int, default -1
        Placeholder written into missing ``city_id`` / ``school`` /
        ``university`` values before they are cast to integers. The
        ``is_*_presented`` and ``same_*`` flags treat any value ``< 0`` as
        "missing", so keep this negative unless you want filled values to
        count as real ones.
    save_folder : str, default './output'
        Output directory; created if it does not exist.

    Notes
    -----
    Each CSV holds the columns ``ego_id, u, v, sex_u, sex_v, age_u, age_v,
    is_city_presented, same_city, is_school_presented, same_school,
    is_university_presented, same_university`` (the row index is written too).
    """
    os.makedirs(save_folder, exist_ok=True)

    # Attribute column -> label used in the generated feature names.
    feature_labels = {"city_id": "city", "school": "school", "university": "university"}
    columns_to_fill = [col for attr in feature_labels for col in (attr, f"{attr}_v")]
    output_columns = [
        "ego_id",
        "u",
        "v",
        "sex_u",
        "sex_v",
        "age_u",
        "age_v",
        "is_city_presented",
        "same_city",
        "is_school_presented",
        "same_school",
        "is_university_presented",
        "same_university",
    ]

    # Group both tables once instead of boolean-masking the full frames for
    # every ego id: the per-iteration scans made the loop O(rows * egos).
    attr_rows_by_ego = attr_df.groupby("ego_id").indices

    # sort=True keeps the ascending ego-id processing order.
    for idx, curr_train_graph in tqdm(df.groupby("ego_id", sort=True)):
        # An ego without any attribute rows yields an empty frame, so the
        # left joins below simply produce missing values for it.
        curr_attr_graph = attr_df.iloc[attr_rows_by_ego.get(idx, [])]

        # Attach attributes of `u` (unsuffixed) and then of `v` (suffix `_v`).
        d = curr_train_graph.merge(
            curr_attr_graph, on="u", how="left", suffixes=("", "_u")
        ).merge(
            curr_attr_graph.rename(columns={"u": "v"}),
            on="v",
            how="left",
            suffixes=("", "_v"),
        )

        d[columns_to_fill] = d[columns_to_fill].fillna(value=fill_value)
        d = d.astype({col: "int" for col in columns_to_fill})

        for attr, label in feature_labels.items():
            u_known = d[attr] >= 0
            v_known = d[f"{attr}_v"] >= 0
            # Equal ids only count when the id is a real (non-negative) value.
            d[f"same_{label}"] = (d[attr] == d[f"{attr}_v"]) & u_known
            d[f"is_{label}_presented"] = u_known & v_known

        d = d.rename(columns={"sex": "sex_u", "age": "age_u"})[output_columns]

        save_path = os.path.join(save_folder, f"{idx}.csv")
        d.to_csv(save_path)

In [27]:
# Write one feature CSV per ego network. The recorded run below processed
# 61786 ego ids in about 2 h 48 min.
# NOTE(review): the output path is absolute and machine-specific — consider a configurable directory.
add_new_features_and_dump_to_csv(train, attr, save_folder='E:/hack-ai/train_csv')

100%|██████████████████████████████████████████████████████████████████████████| 61786/61786 [2:48:32<00:00,  6.11it/s]
