In [1]:
import os
import sys
import argparse
import numpy as np
import pandas as pd
from collections import defaultdict, OrderedDict

In [2]:
# Print the kernel's working directory; the relative "Data/..." paths used below resolve against it.
!pwd

/Users/liuzhichao/Documents/GitHub/Baidu_BigData_Competition


In [3]:
# Convert grid_attr.csv into a dict whose keys are "longitude,latitude" strings and whose values are city-prefixed region IDs
def get_grid_dict(city_path, city_name):
    """Build a coordinate -> region lookup from ``grid_attr.csv``.

    Every non-blank line of ``<city_path>/grid_attr.csv`` is split on commas.
    The first two fields, re-joined with a comma, become the key; the third
    field, prefixed with ``city_name`` and an underscore, becomes the value.
    Any further fields on the line are ignored.

    Args:
        city_path: Directory that contains ``grid_attr.csv``.
        city_name: Prefix put in front of each region ID (e.g. ``"A"``).

    Returns:
        dict mapping ``'x,y'`` strings to ``'<city_name>_<ID>'`` strings.

    Raises:
        IndexError: If a non-blank line has fewer than three fields.
    """
    d = {}
    with open(os.path.join(city_path, 'grid_attr.csv'), 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                # A blank line (e.g. trailing newline padding) carries no
                # record; without this guard items[2] would raise IndexError.
                continue
            items = line.split(',')
            axis = ",".join(items[0:2])
            d[axis] = "_".join([city_name, items[2]])

    # d = {'x,y': '<city_name>_<ID>'}
    return d

In [4]:
# Build the coordinate -> region lookup for city A and display it.
a = get_grid_dict("Data/city_A", "A")
a

{'146.757689,30.248075': 'A_0',
 '146.757689,30.249627': 'A_0',
 '146.757698,30.238715': 'A_0',
 '146.757698,30.240275': 'A_0',
 '146.757698,30.241835': 'A_0',
 '146.757698,30.243395': 'A_0',
 '146.757698,30.244955': 'A_0',
 '146.757698,30.246515': 'A_0',
 '146.757707,30.235595': 'A_0',
 '146.757707,30.237155': 'A_0',
 '146.759495,30.246484': 'A_0',
 '146.759495,30.248044': 'A_0',
 '146.759495,30.249604': 'A_0',
 '146.759504,30.237124': 'A_0',
 '146.759504,30.238684': 'A_0',
 '146.759504,30.240244': 'A_0',
 '146.759504,30.241804': 'A_0',
 '146.759504,30.243364': 'A_0',
 '146.759504,30.244924': 'A_0',
 '146.759513,30.235564': 'A_0',
 '146.7613,30.244893': 'A_0',
 '146.7613,30.246453': 'A_0',
 '146.7613,30.248013': 'A_0',
 '146.7613,30.249573': 'A_0',
 '146.761309,30.235533': 'A_0',
 '146.761309,30.237093': 'A_0',
 '146.761309,30.238653': 'A_0',
 '146.761309,30.240213': 'A_0',
 '146.761309,30.241773': 'A_0',
 '146.761309,30.243333': 'A_0',
 '146.763106,30.244862': 'A_0',
 '146.763106,30.

In [5]:
# Replace the start/end x,y coordinates in transfer.csv with region IDs, using the dict returned by
# get_grid_dict; drop rows whose coordinates have no matching region, and write the result to a CSV file
def coord2ID(data_path, city_name, output_path):
    """Rewrite a city's ``transfer.csv`` with region IDs instead of coordinates.

    Reads ``<data_path>/city_<city_name>/transfer.csv`` line by line. Fields
    1-2 form the start coordinate key and fields 3-4 the end coordinate key;
    both are looked up in the mapping returned by ``get_grid_dict``. Lines
    where either key is missing from the mapping are skipped.

    Each kept line is written to ``<output_path>/<city_name>_transfer.csv`` as
    ``<field 0>,<start region>,<end region>,<field 5>``.

    Args:
        data_path: Root directory holding the ``city_<name>`` folders.
        city_name: City identifier, also used as the region ID prefix.
        output_path: Directory that receives the converted file.
    """
    city_dir = os.path.join(data_path, "city_%s" % city_name)
    lookup = get_grid_dict(city_dir, city_name)

    src_path = os.path.join(city_dir, "transfer.csv")
    dst_path = os.path.join(output_path, "%s_transfer.csv" % (city_name))
    with open(src_path, 'r') as src, open(dst_path, 'w') as dst:
        for raw in src:
            fields = raw.strip().split(',')
            origin_key = ",".join(fields[1:3])
            dest_key = ",".join(fields[3:5])
            strength = fields[5]

            # Coordinates that do not belong to any region are discarded.
            if origin_key not in lookup or dest_key not in lookup:
                continue

            record = (fields[0], lookup[origin_key], lookup[dest_key], strength)
            dst.write("%s,%s,%s,%s\n" % record)

In [6]:
# Convert city A's transfer.csv to region IDs; the result is written to Data/A_transfer.csv.
coord2ID("Data", "A", "Data")

In [7]:
# sum up transfer index of 24 hours (calculate transfer index per day)
def calc_index_in_one_day(data_path, city_name):
    """Total the transfer index of every (start, end) region pair.

    Loads ``<data_path>/<city_name>_transfer.csv`` (no header; columns are
    read as hour, s_region, e_region, index) and sums ``index`` over all rows
    that share the same start and end region.

    Args:
        data_path: Directory containing the ``<city_name>_transfer.csv`` file.
        city_name: City identifier used in the file name.

    Returns:
        DataFrame with columns ``s_region``, ``e_region``, ``index``; one row
        per region pair.
    """
    csv_path = os.path.join(data_path, "%s_transfer.csv" % (city_name))
    hourly = pd.read_csv(
        csv_path,
        header=None,
        names=['hour', 's_region', 'e_region', 'index'],
    )

    # Collapse the hour dimension: one summed value per region pair.
    pair_totals = hourly.groupby(['s_region', 'e_region'])['index'].sum()
    flat = pair_totals.reset_index()
    return flat[['s_region', 'e_region', 'index']]

In [8]:
# Sum city A's transfer index per (start, end) region pair and preview the first rows.
t = calc_index_in_one_day("Data", "A")
t.head()

Unnamed: 0,s_region,e_region,index
0,A_0,A_0,187.5
1,A_0,A_1,0.3
2,A_0,A_10,0.8
3,A_0,A_100,0.2
4,A_0,A_108,0.4


In [9]:
# calculate migration index per day
def process_city_migration(data_path, city_name):
    """Compute the total inbound migration index of a city for each date.

    Loads ``<data_path>/city_<city_name>/migration.csv`` (no header; columns
    are read as date, s_city, e_city and a value column named after the
    city). Only rows whose ``e_city`` equals ``city_name`` are kept, i.e.
    moves into the city; their values are summed per date.

    Args:
        data_path: Root directory holding the ``city_<name>`` folders.
        city_name: City identifier; also the name of the value column.

    Returns:
        DataFrame with columns ``date`` and ``<city_name>``, one row per date.
    """
    csv_path = os.path.join(data_path, "city_%s" % city_name, "migration.csv")
    flows = pd.read_csv(
        csv_path,
        sep=',',
        header=None,
        names=['date', 's_city', 'e_city', city_name],
    )

    # Keep moves INTO the city only; outbound rows are ignored.
    arriving = flows[flows['e_city'] == city_name]
    arriving = arriving[["date", city_name]]

    # One total per date.
    per_date = arriving.groupby('date')[city_name].sum()
    return per_date.reset_index()

In [10]:
# Daily inbound migration index for city A; preview the first rows.
m = process_city_migration("Data", "A")
m.head()

Unnamed: 0,date,A
0,21200501,0.81162
1,21200502,0.742641
2,21200503,0.964937
3,21200504,0.771767
4,21200505,0.727024


In [11]:
# new_migration df: date, start_region, end_region, new_index
def migration_process(data_path, city_list, output_path):
    """Produce a per-date, per-region-pair migration file for each city.

    For every city this:
      1. converts ``transfer.csv`` to region IDs (``coord2ID``),
      2. totals the transfer index per region pair (``calc_index_in_one_day``),
      3. loads the city's inbound migration index per date
         (``process_city_migration``),
      4. multiplies the region-pair totals by each date's city index, and
      5. writes the stacked result to ``<output_path>/<city>_migration.csv``
         without a header or index, formatted to four decimals.

    Output columns, in order: date, s_region, e_region, index.

    Args:
        data_path: Root directory holding the ``city_<name>`` folders.
        city_list: Iterable of city identifiers to process.
        output_path: Directory for intermediate and final CSV files.
    """
    column_order = ['date', 's_region', 'e_region', 'index']

    for city_name in city_list:
        coord2ID(data_path, city_name, output_path)
        region_flow = calc_index_in_one_day(output_path, city_name)
        city_inflow = process_city_migration(data_path, city_name)

        daily_frames = []
        for row in range(len(city_inflow)):
            scaled = region_flow.copy()
            # new index = region-pair transfer total * that date's city index
            scaled['index'] = scaled['index'] * city_inflow[city_name][row]
            scaled['date'] = city_inflow.date[row]
            daily_frames.append(scaled[column_order])

        stacked = pd.concat(daily_frames, axis=0)

        target = os.path.join(output_path, '%s_migration.csv' % city_name)
        stacked.to_csv(target, header=None, index=None, float_format='%.4f')

In [12]:
# Run the transfer/migration preprocessing for cities A and B, reading from and writing to Data/.
migration_process("Data", ["A", "B"], "Data")

In [13]:
# Reload the generated city-A migration file to inspect its contents.
test = pd.read_csv("Data/A_migration.csv", sep=',', 
                                header=None,
                                names=['date', 's_region', 'e_region', 'index'])
test.head()

Unnamed: 0,date,s_region,e_region,index
0,21200501,A_0,A_0,152.1787
1,21200501,A_0,A_1,0.2435
2,21200501,A_0,A_10,0.6493
3,21200501,A_0,A_100,0.1623
4,21200501,A_0,A_108,0.3246


In [14]:
# Create the adjacency matrix: columns are start regions, rows are end regions
def adj_matrix_process(data_path, city_list, region_nums, output_path):
    """Build and save a block-diagonal region adjacency matrix.

    For every city, ``<output_path>/<city>_migration.csv`` is read (no header;
    columns date, s_region, e_region, index). The entry at row ``j``, column
    ``k`` of that city's block is the mean of ``index`` over all rows going
    from start region ``k`` to end region ``j``. Pairs with no rows and the
    diagonal (start == end) are set to 0.0.

    Regions inside a city are ordered by the string form of their integer id
    (0, 1, 10, 100, ..., 11, ...), not numerically. City blocks are placed
    along the diagonal in ``city_list`` order; cross-city entries stay zero.

    The matrix is saved to ``<output_path>/adj_matrix.npy``.

    Args:
        data_path: Not used by this function; kept for a uniform signature.
        city_list: City identifiers, in block order.
        region_nums: Number of regions per city, parallel to ``city_list``.
        output_path: Directory holding the ``*_migration.csv`` inputs and
            receiving ``adj_matrix.npy``.
    """
    total_region_num = np.sum(region_nums)
    adj_matrix = np.zeros((total_region_num, total_region_num))

    offset = 0
    for i, city in enumerate(city_list):
        size = region_nums[i]
        filename = os.path.join(output_path, "%s_migration.csv" % city)
        migration = pd.read_csv(filename,
                                sep=',',
                                header=None,
                                names=['date', 's_region', 'e_region', 'index'])

        # One pass over the file: mean index per (end region, start region).
        # A dict lookup with a default replaces the former bare ``except:``
        # around a per-pair DataFrame filter, which hid every kind of error.
        mean_index = (migration
                      .groupby(['e_region', 's_region'])['index']
                      .mean()
                      .to_dict())

        order = sorted(range(size), key=str)
        names = ["%s_%d" % (city, idx) for idx in order]

        matrix = np.zeros((size, size))
        for j, target_region in enumerate(names):
            for k, s_region_id in enumerate(names):
                if s_region_id == target_region:
                    # Self-flows are excluded; the diagonal stays 0.0.
                    continue
                matrix[j, k] = mean_index.get((target_region, s_region_id), 0.0)

        # Place this city's block on the diagonal of the combined matrix.
        adj_matrix[offset:(offset + size), offset:(offset + size)] = matrix
        offset += size

    file_to_save = os.path.join(output_path, 'adj_matrix.npy')
    print("saving result to %s" % file_to_save)
    np.save(file_to_save, adj_matrix)

In [15]:
# Show the string-sorted region order (0, 1, 10, 100, ...) that the processing functions use for city A's 118 regions.
o = sorted(range(118), key=lambda x:str(x))
o

[0,
 1,
 10,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 11,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 2,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 3,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 4,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 5,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 6,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 7,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 8,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 9,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99]

In [16]:
# Rows for the A_0 -> A_0 pair across all dates.
test[(test['e_region'] == "A_0") & (test['s_region'] == "A_0")]

Unnamed: 0,date,s_region,e_region,index
0,21200501,A_0,A_0,152.1787
13056,21200502,A_0,A_0,139.2451
26112,21200503,A_0,A_0,180.9257
39168,21200504,A_0,A_0,144.7063
52224,21200505,A_0,A_0,136.317
65280,21200506,A_0,A_0,206.4771
78336,21200507,A_0,A_0,140.7943
91392,21200508,A_0,A_0,188.3554
104448,21200509,A_0,A_0,166.4551
117504,21200510,A_0,A_0,166.9714


In [17]:
# Every row whose end region is A_0.
c = test[test['e_region'] == "A_0"]
c

Unnamed: 0,date,s_region,e_region,index
0,21200501,A_0,A_0,152.1787
70,21200501,A_1,A_0,0.0812
156,21200501,A_10,A_0,0.8116
272,21200501,A_100,A_0,0.3246
607,21200501,A_103,A_0,0.1623
...,...,...,...,...
586164,21200614,A_89,A_0,0.0541
586279,21200614,A_9,A_0,0.1624
586620,21200614,A_92,A_0,0.0541
586963,21200614,A_95,A_0,0.0541


In [18]:
# Mean index per start region (the same reduction adj_matrix_process applies). Note: this rebinds `c`.
c = c.groupby('s_region')['index'].mean().reset_index()
c

Unnamed: 0,s_region,index
0,A_0,144.105862
1,A_1,0.076853
2,A_10,0.768567
3,A_100,0.307436
4,A_103,0.153711
...,...,...
62,A_89,0.076853
63,A_9,0.230571
64,A_92,0.076853
65,A_95,0.076853


In [19]:
# Pull the mean value for a single start region (A_100 -> A_0).
c[c['s_region'] == "A_100"]['index'].values[0]

0.30743555555555546

In [20]:
# Build and save the adjacency matrix for cities A (118 regions) and B (30 regions).
adj_matrix_process("Data", ["A", "B"], [118, 30], "Data")

saving result to Data/adj_matrix.npy


In [21]:
# Load the saved matrix back from disk to check it.
ad = np.load("Data/adj_matrix.npy")
ad

array([[ 0.        ,  0.07685333,  0.76856667, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.23057111,  0.        ,  0.46112889, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.61484444,  0.23057111,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
        45.65078444, 29.98957111],
       [ 0.        ,  0.        ,  0.        , ..., 42.61851111,
         0.        , 39.71952444],
       [ 0.        ,  0.        ,  0.        , ..., 29.28982   ,
        39.68619556,  0.        ]])

In [22]:
# Expected shape is (148, 148): 118 regions for city A plus 30 for city B.
ad.shape

(148, 148)

In [23]:
# infection matrix & region_names
def infection_process(data_path, city_list, region_nums, output_path):
    """Merge per-city infection counts into one wide table and list its columns.

    For every city, ``<data_path>/city_<city>/infection.csv`` is read (no
    header; columns city, region, date, infect). Regions are visited in the
    string-sorted order of their integer ids (0, 1, 10, 100, ...). Each region
    contributes one column named ``<city>_<id>`` holding its ``infect``
    values; only the very first region of the first city also contributes the
    ``date`` column.

    Outputs written to ``output_path``:
      * ``infection.csv``: the merged table, with header, without index.
      * ``region_names.txt``: the region column names on one line, separated
        by single spaces.

    Args:
        data_path: Root directory holding the ``city_<name>`` folders.
        city_list: City identifiers, in output order.
        region_nums: Number of regions per city, parallel to ``city_list``.
        output_path: Directory receiving the two output files.
    """
    frames = []
    labels = []
    for city_pos, city in enumerate(city_list):
        infection = pd.read_csv(
            os.path.join(data_path, "city_%s" % city, "infection.csv"),
            sep=',',
            header=None,
            names=["city", "region", "date", "infect"],
        )

        region_ids = sorted(range(region_nums[city_pos]), key=str)
        for region_pos, region_id in enumerate(region_ids):
            label = '%s_%d' % (city, region_id)
            rows = infection[infection['region'] == region_id].reset_index(drop=True)

            # The date column is taken once, from the first region processed.
            is_first = city_pos == 0 and region_pos == 0
            wanted = ['date', 'infect'] if is_first else ['infect']

            frames.append(rows[wanted].rename(columns={'infect': label}))
            labels.append(label)

    # NOTE(review): columns are aligned by row position (indices were reset),
    # not by date, so this presumes every region lists the same dates in the
    # same order - confirm against the input data.
    merged = pd.concat(frames, axis=1)

    file_to_save = os.path.join(output_path, "infection.csv")
    print("saving result to %s" % file_to_save)
    # format: [date, <city>_<region>, ...]
    merged.to_csv(file_to_save, index=False)

    with open(os.path.join(output_path, "region_names.txt"), 'w') as f:
        f.write(' '.join(labels) + '\n')

In [29]:
# Load city A's raw infection file to inspect its columns.
mm = pd.read_csv("Data/city_A/infection.csv", sep=',', 
                                header=None,
                                names=["city", "region", "date", "infect"])
mm.head()

Unnamed: 0,city,region,date,infect
0,A,0,21200501,0
1,A,0,21200502,0
2,A,0,21200503,0
3,A,0,21200504,0
4,A,0,21200505,0


In [33]:
# Rows for region 0 only, re-indexed from zero (the same selection infection_process makes per region).
dfdf = mm[mm['region'] == 0].reset_index(drop=True)
dfdf

Unnamed: 0,city,region,date,infect
0,A,0,21200501,0
1,A,0,21200502,0
2,A,0,21200503,0
3,A,0,21200504,0
4,A,0,21200505,0
5,A,0,21200506,0
6,A,0,21200507,0
7,A,0,21200508,0
8,A,0,21200509,0
9,A,0,21200510,0


In [24]:
# Build Data/infection.csv and Data/region_names.txt for cities A and B.
infection_process("Data", ["A", "B"], [118, 30], "Data")

saving result to Data/infection.csv


In [26]:
# Preview the merged infection table that was just written.
pd.read_csv("Data/infection.csv").head()

Unnamed: 0,date,A_0,A_1,A_10,A_100,A_101,A_102,A_103,A_104,A_105,...,B_27,B_28,B_29,B_3,B_4,B_5,B_6,B_7,B_8,B_9
0,21200501,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,21200502,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,21200503,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,21200504,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,21200505,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [38]:
# Read the space-separated region names back as a single-row table.
pd.read_table("Data/region_names.txt", sep=' ', header=None)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,138,139,140,141,142,143,144,145,146,147
0,A_0,A_1,A_10,A_100,A_101,A_102,A_103,A_104,A_105,A_106,...,B_27,B_28,B_29,B_3,B_4,B_5,B_6,B_7,B_8,B_9


In [39]:
# similar to infection matrix, merge the new migration matrix
def region_migration_process(data_path, city_list, region_nums, output_path):
    """Merge each region's daily inbound migration into one wide table.

    For every city, ``<output_path>/<city>_migration.csv`` is read (no header;
    columns date, s_region, e_region, index). For each end region the
    ``index`` values are summed per date. Each region becomes one column named
    ``<city>_<id>``; columns follow ``city_list`` order and, inside a city,
    the string-sorted order of the integer ids (0, 1, 10, 100, ...).

    Rows are aligned on ``date``. The previous implementation glued the
    per-region results together by row position, so a region with no rows for
    some date had its values shifted onto the wrong dates. A region/date
    combination with no data is now left empty on the correct row.

    The result is written to ``<output_path>/region_migration.csv`` with a
    header, without an index, floats formatted to two decimals.

    Args:
        data_path: Not used by this function; kept for a uniform signature.
        city_list: City identifiers, in output order.
        region_nums: Number of regions per city, parallel to ``city_list``.
        output_path: Directory holding the ``*_migration.csv`` inputs and
            receiving ``region_migration.csv``.
    """
    per_city = []
    for i, city in enumerate(city_list):
        filename = os.path.join(output_path, "%s_migration.csv" % city)
        migration = pd.read_csv(filename,
                                sep=',',
                                header=None,
                                names=['date', 's_region', 'e_region', 'index'])

        order = sorted(range(region_nums[i]), key=str)
        region_columns = ["%s_%d" % (city, idx) for idx in order]

        # Rows: date, columns: end region, values: summed inbound index.
        inflow = (migration
                  .groupby(['date', 'e_region'])['index']
                  .sum()
                  .unstack('e_region')
                  .reindex(columns=region_columns))
        inflow.columns.name = None
        per_city.append(inflow)

    # Frames share the date index, so concatenation aligns on date.
    df = pd.concat(per_city, axis=1).sort_index()
    df.index.name = 'date'
    df = df.reset_index()

    file_to_save = os.path.join(output_path, "region_migration.csv")
    print("saving result to %s" % file_to_save)
    # format: [date, <city>_<region>, ...]
    df.to_csv(file_to_save, index=False, float_format = '%.2f')

In [40]:
# Build Data/region_migration.csv for cities A and B.
region_migration_process("Data", ["A", "B"], [118, 30], "Data")

saving result to Data/region_migration.csv


In [None]:
# Script entry point: run the full preprocessing pipeline for all five cities.
if __name__=="__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_path', type=str, default='./dataset/train_data')
    parser.add_argument('--output_path', type=str, default='./dataset/data_processed')
    # parse_known_args() ignores arguments this parser does not define. Inside
    # a Jupyter kernel sys.argv carries the kernel's own "-f <connection file>"
    # option, which makes parse_args() abort with SystemExit.
    args, _unknown_args = parser.parse_known_args()

    if not os.path.exists(args.output_path):
        os.makedirs(args.output_path)

    # Region counts are parallel to city_list (city A has 118 regions, etc.).
    city_list = ["A", "B", "C", "D", "E"]
    region_nums = [118, 30, 135, 75, 34]

    print("migration process")

    # Order matters: migration_process writes the <city>_migration.csv files
    # that adj_matrix_process and region_migration_process read back.
    migration_process(args.data_path, city_list, args.output_path)
    adj_matrix_process(args.data_path, city_list, region_nums, args.output_path)
    infection_process(args.data_path, city_list, region_nums, args.output_path)
    region_migration_process(args.data_path, city_list, region_nums, args.output_path)