In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.linear_model import LinearRegression as LR

In [2]:
# pd.set_option("display.max_rows", 20000)

In [3]:
root_dir = '/Users/kessapassa/OneDrive/research_log/2018_Graduate/OD/'

In [4]:
def find_not_nan_index(_list):
    index = -1

    for key, value in enumerate(_list):
        if not np.isnan(value):
            index = key
    
    return index

In [5]:
def interpolate_times(df):
    convert_dic = {
        '3600': 0,
        '21600': 5,
        'is_arrived': 6
    }

    new_times = []
    for row in np.asanyarray(df):
        times = np.delete(row, -1)
            
        if np.isnan(times[0]):
            index = find_not_nan_index(times)
            for i in range(0, index):
                times[i] = times[index]
                
        if row[6] == True:
            times = times[::-1]
            index = find_not_nan_index(times)
            for i in range(0, index):
                times[i] = times[index]
            times = times[::-1]
        
        new_times.append(times)
        
    return pd.DataFrame(new_times)

In [6]:
dir_list = ['2_8', '4_6', '6_4', '8_2']
seed_list = [str(123 + i) for i in range(3)]
times_list = [str(3600 * (i + 1)) for i in range(6)]

csv_array = {}
for _dir in dir_list:
    csv_array[_dir] = {}
    for _seed in seed_list:
        csv_array[_dir][_seed] = pd.read_csv(root_dir+_dir+'seed'+_seed+'.csv', index_col=0)
        csv_array[_dir][_seed] =  csv_array[_dir][_seed].loc[csv_array[_dir][_seed]['type'] == ' Vehicle']
        csv_array[_dir][_seed] .reset_index(drop=True, inplace=True)
        csv_array[_dir][_seed][times_list] = interpolate_times(csv_array[_dir][_seed][times_list + ['is_arrived']])
#         csv_array[_dir][_seed].dropna(how='any', inplace=True)
        csv_array[_dir][_seed].to_csv('/Users/kessapassa/OneDrive/research_log/2018_Graduate/OD2/'+_dir+'seed'+_seed+'.csv')

In [7]:
csv_array['6_4']['123']

Unnamed: 0,id,type,3600,7200,10800,14400,18000,21600,is_arrived
0,57426,Vehicle,430.0,430.0,430.0,430.0,430.0,430.0,True
1,39467,Vehicle,330.0,330.0,330.0,330.0,330.0,330.0,True
2,65310,Vehicle,330.0,330.0,330.0,330.0,330.0,330.0,True
3,36310,Vehicle,430.0,430.0,430.0,430.0,430.0,430.0,True
4,51630,Vehicle,430.0,430.0,430.0,430.0,430.0,430.0,True
5,48006,Vehicle,430.0,430.0,430.0,430.0,430.0,430.0,True
6,40967,Vehicle,440.0,440.0,440.0,440.0,440.0,440.0,True
7,44138,Vehicle,330.0,330.0,330.0,330.0,330.0,330.0,True
8,48257,Vehicle,440.0,440.0,440.0,440.0,440.0,440.0,True
9,43848,Vehicle,440.0,440.0,440.0,440.0,440.0,440.0,True


In [8]:
for _dir in dir_list:
    for _seed in seed_list:
        print(csv_array[_dir][_seed].shape)

(4768, 9)
(9553, 9)
(14458, 9)
(19313, 9)


# NaNではない最初のindexを見つける

In [9]:
def find_not_nan_index(_list):
    index = -1

    for key, value in enumerate(_list):
        if not np.isnan(value):
            index = key
    
    return index

# 出現していない時間は最初のエリアで補間する

In [10]:
def interpolate_absence_time(df):
    times_list = [str(3600 * (i + 1)) for i in range(6)]

    # 抽出して削除
    only_nan = df.loc[pd.isna(df['3600']) == True, :]
    not_nan = df.loc[pd.isna(df['3600']) != True, :]

    new_times_list = []
    for times in np.asanyarray(only_nan[times_list]):
        index = find_not_nan_index(times)
        for i in range(0, index):
            times[i] = times[index]
        new_times_list.append(times)

    only_nan = only_nan.loc[:, ['id', 'type', 'is_arrived']]

    df_times = pd.DataFrame(new_times_list)
    df_times.set_index(only_nan.index, inplace=True)

    df_interpolated = pd.concat([only_nan, df_times], axis=1)
    df_interpolated.columns = df.columns

    df_new = pd.concat([not_nan, df_interpolated])
    
    return df_new

In [11]:
# times_list = [str(3600 * (i + 1)) for i in range(6)]
# interpolate_times(csv_array['6_4']['123'][times_list + ['is_arrived']])

# エリア番号を線形的な数から、iとjで回した数のようにする

In [12]:
def convert_area_to_contour(df):
    new_area_list = []
    for area_id in np.asanyarray(df):
        area_id = int(area_id)
        contour_id = str(area_id // 6)
        contour_id += str(area_id % 6) + '0'
        new_area_list.append(contour_id)
        
    return new_area_list

In [13]:
# times_list = [str(3600 * (i + 1)) for i in range(6)]

# for _dir in dir_list:
#     for _seed in seed_list:
#         csv_array[_dir][_seed][times_list] = interpolate_times(csv_array[_dir][_seed][times_list + ['is_arrived']])
#         csv_array[_dir][_seed].dropna(how='any', inplace=True)
# #         csv_array[_dir][_seed][times_list] = csv_array[_dir][_seed][times_list].apply(convert_area_to_contour)

In [14]:
csv_array['6_4']['123']

Unnamed: 0,id,type,3600,7200,10800,14400,18000,21600,is_arrived
0,57426,Vehicle,430.0,430.0,430.0,430.0,430.0,430.0,True
1,39467,Vehicle,330.0,330.0,330.0,330.0,330.0,330.0,True
2,65310,Vehicle,330.0,330.0,330.0,330.0,330.0,330.0,True
3,36310,Vehicle,430.0,430.0,430.0,430.0,430.0,430.0,True
4,51630,Vehicle,430.0,430.0,430.0,430.0,430.0,430.0,True
5,48006,Vehicle,430.0,430.0,430.0,430.0,430.0,430.0,True
6,40967,Vehicle,440.0,440.0,440.0,440.0,440.0,440.0,True
7,44138,Vehicle,330.0,330.0,330.0,330.0,330.0,330.0,True
8,48257,Vehicle,440.0,440.0,440.0,440.0,440.0,440.0,True
9,43848,Vehicle,440.0,440.0,440.0,440.0,440.0,440.0,True


In [15]:
for _dir in dir_list:
    for _seed in seed_list:
        print(csv_array[_dir][_seed].shape)

(4768, 9)
(9553, 9)
(14458, 9)
(19313, 9)


# 重回帰分析

In [16]:
reader = csv_array['6_4']['123']

In [17]:
Y = reader['21600'].values.reshape(-1, 1)

In [18]:
X = reader.drop(['id', 'type', '21600'], axis=1)

In [19]:
model = LR()
model.fit(X, Y)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
model.coef_

In [None]:
model.intercept_

In [None]:
model.score(X, Y)

In [None]:
pred = model.predict(X)
pred

In [None]:
reader['pred'] = pred
reader