In [1]:
import pandas as pd
import numpy as np
import math

In [None]:
data_dir = '../Data/cali_example/'
save_dir = '../Data/cali_example_overlap_subsets/'

In [None]:
models = ['AV', 'GS', 'GM']

In [None]:
av_2005 = pd.read_csv(data_dir + 'AV_2005_align.csv')
av_2010 = pd.read_csv(data_dir + 'AV_2010_align.csv')
av_2011 = pd.read_csv(data_dir + 'AV_2011_align.csv')
gm = pd.read_csv(data_dir + 'gb_gm_ca_mapped_2010_2016/gb_gm_ca_mapped_2010_2016/GM_align_2010_2016.csv')
gb = pd.read_csv(data_dir + 'gb_gm_ca_mapped_2010_2016/gb_gm_ca_mapped_2010_2016/GB_align_2010_2016.csv')

In [None]:
av_2005.columns = ['lon', 'lat', 'pm25']
av_2010.columns = ['lon', 'lat', 'pm25'] 
av_2011.columns = ['lon', 'lat', 'pm25']

In [None]:
time_2005 = np.repeat(2005, av_2005.shape[0])
time_2010 = np.repeat(2010, av_2010.shape[0])
time_2011 = np.repeat(2011, av_2011.shape[0])

In [None]:
av_2005['time'] = time_2005
av_2010['time'] = time_2010
av_2011['time'] = time_2011

In [None]:
av_2005_2010 = av_2005.append(av_2010)
av_2005_2011 = av_2005_2010.append(av_2011)

In [None]:
gm_2005_2011 = gm.loc[gm.time.isin([2005, 2010, 2011])]
gb_2005_2011 = gb.loc[gb.time.isin([2005, 2010, 2011])]
# gm_2005 = gm.loc[gm.time.isin([2005])]
# gb_2005 = gb.loc[gb.time.isin([2005])]

In [None]:
intersecting_lat_lon = pd.merge(av_2005_2011, gm_2005_2011, how='inner', on=['lon', 'lat', 'time'])

In [None]:
intersecting_lat_lon.head()

In [None]:
intersecting_lat_lon.columns = ['lon', 'lat', 'pm25_av', 'time', 'Unnamed: 0', 'pm25_gm']

In [None]:
intersecting_lat_lon_all = pd.merge(gb_2005_2011, intersecting_lat_lon, how = 'inner', on=['lon', 'lat', 'time'])

In [None]:
intersecting_lat_lon_all.head()

In [None]:
intersecting_lat_lon_all.columns = ['Unnamed: 0_x', 'time', 'y', 'x', 'pred_gb', 'pred_av', 'Unnamed: 0_y',
       'pred_gm']

In [None]:
intersecting_lat_lon_all = intersecting_lat_lon_all[['time','x', 'y' 'pred_gb', 'pred_av', 'pred_gm']]

In [None]:
all_preds_file = save_dir + 'predictions_2005_2011.csv'

In [None]:
intersecting_lat_lon_all.to_csv(all_preds_file, index = False)

In [None]:
intersecting_lat_lon_all = intersecting_lat_lon_all[~intersecting_lat_lon_all['pm25_av'].isnull()]

In [None]:
intersecting_lat_lon_all.head()

In [None]:
av_2005_2011 = intersecting_lat_lon_all[['lon', 'lat', 'pm25_av', 'time']]
gm_2005_2011 = intersecting_lat_lon_all[['lon', 'lat', 'pm25_gm', 'time']]
gb_2005_2011 = intersecting_lat_lon_all[['lon', 'lat', 'pm25', 'time']]

In [None]:
av_2005_2011_file = data_dir + 'AV_2005_2011_align.csv'
gm_2005_2011_file = data_dir + 'GM_2005_2011_align.csv'
gb_2005_2011_file = data_dir + 'GB_2005_2011_align.csv'
av_2005_2011.to_csv(av_2005_2011_file, index = False)
gm_2005_2011.to_csv(gm_2005_2011_file, index = False)
gb_2005_2011.to_csv(gb_2005_2011_file, index = False)

In [None]:
av_2005_2011.head()

In [None]:
av_2005_2011.columns = ['lon', 'lat', 'pm25', 'time']
gm_2005_2011.columns = ['lon', 'lat', 'pm25', 'time']

In [None]:
min_lon = min(av_2005_2011.lon)
max_lon = max(av_2005_2011.lon)
min_lat = min(av_2005_2011.lat)
max_lat = max(av_2005_2011.lat)

upper_left = (max_lat, min_lon)
upper_right = (max_lat, max_lon)
lower_left = (min_lat, min_lon)
lower_right = (min_lat, max_lon)

In [None]:
print (upper_left, upper_right, lower_left, lower_right)

In [None]:
test = av_2005.loc[(av_2005.lon >= min_lon) & (av_2005.lon <= max_lon) & (av_2005.lat >= min_lat) & (av_2005.lat <= max_lat)]
test.shape

In [None]:
X_valid = np.asarray(av_2005_2011[["lon", "lat"]].values.tolist()).astype(np.float32)

In [None]:
X_scale = np.max(X_valid, axis=0) - np.min(X_valid, axis=0)

In [None]:
X_scale

In [None]:
X_valid[:5]

In [None]:
def round_nearest(x, a):
    return round(round(x / a) * a, -int(math.floor(math.log10(a))))

In [None]:
split_range = [X_scale[0]/6, X_scale[1]/6]
split_range = [ round(elem, 4) for elem in split_range ]
split_lon_range = split_range[0]
split_lat_range = split_range[1]
print (split_lon_range, split_lat_range)

In [None]:
start_split_lon = []
for i in range(10):
    start_split_lon.append(min_lon + i*split_lon_range)
    
start_split_lat = []
for i in range(10):
    start_split_lat.append(min_lat + i*split_lat_range)

In [None]:
a = [round_nearest(x, 0.005) for x in start_split_lat]
b = [x + 0.005 if (str(x)[-1] != '5' or len(str(x).split('.')[1]) !=  3) else x for x in a]
start_split_lat = [round(x, -int(math.floor(math.log10(0.005)))) for x in b]

In [None]:
start_split_lat, start_split_lon

In [None]:
start_split_lon = [round(x, 3) for x in start_split_lon]
end_split_lon = start_split_lon[1:]
end_split_lon.append(max_lon)
start_split_lon = [x - 0.5 for x in start_split_lon[1:]]
start_split_lon.insert(0, min_lon)
print (start_split_lon)
print (end_split_lon)

end_split_lat = start_split_lat[1:]
end_split_lat.append(max_lat)
start_split_lat = [x - 0.5 for x in start_split_lat[1:]]
start_split_lat.insert(0, min_lat)

lon_range = list(zip(start_split_lon, end_split_lon))
lat_range = list(zip(start_split_lat, end_split_lat))

In [None]:
lat_range

In [None]:
lon_range

In [None]:
monthly_pred = pd.DataFrame(columns=['lon', 'lat', 'pm25', 'time'])
subset_num = 1
num_subset = {'AV' : 1, 'GB' : 1, 'GM' : 1}
num_coords = 0
models = [av_2005_2011, gm_2005_2011, gb_2005_2011]
model_list = ['AV', 'GM', 'GB']

for m, model in enumerate(models):
    for i in range(len(lat_range)):
        for j in range(len(lon_range)):
            df_subset = model.loc[(model.lon >= lon_range[j][0]) & (model.lon <= lon_range[j][1]) & (model.lat >= lat_range[i][0]) & (model.lat <= lat_range[i][1])]
            df_subset = df_subset[['lon', 'lat', 'pm25', 'time']]
            monthly_pred = monthly_pred.append(df_subset)
            if (df_subset.shape[0] != 0):
                new_file = '{}{}_2005_2011_align.{}.csv'.format(save_dir, model_list[m], num_subset[model_list[m]])
#                 print (new_file)
                num_subset[model_list[m]] += 1
#                 df_subset.to_csv(new_file, index = False)

In [None]:
print (num_coords/3)

In [None]:
monthly_pred.drop_duplicates(inplace = True)

In [None]:
av_jan.shape

In [None]:
split_range

In [None]:
split_poitns = []

In [None]:
#this creates different sized splits

num_lon_splits = 6
total_coords = 0
lon_splits = np.array_split(np.unique(np.sort(av_jan.lon.values)), num_lon_splits)
num_subset = {'AV' : 1, 'GS' : 1, 'GM' : 1}

for model in models:
    model_df = pd.read_csv('{}{}_clean_20101_align.csv'.format(data_dir,model))
    for i in range(num_lon_splits):
        sub_lon = lon_splits[i]
        df_sub = model_df.loc[model_df.lon.isin(sub_lon)]
        num_lat_splits = 5
        lat_splits = np.array_split(np.unique(np.sort(df_sub.lat.values)), num_lat_splits)
        for j in range(num_lat_splits):
            sub_lat = lat_splits[j]
            df_sub_sub = df_sub.loc[df_sub.lat.isin(sub_lat)]
            total_coords += df_sub_sub.shape[0]
            new_file = '{}{}_clean_20101_align.{}.csv'.format(data_dir, model, num_subset[model])
            print (new_file)
            num_subset[model] += 1
            print (df_sub_sub.shape)
#             df_sub_sub.to_csv(new_file, index = False)

In [None]:
# using coordinates from R
num_coords = 0
num_r_subsegs = 36
num_subsegs = {'AV' : 0, 'GS' : 0, 'GM' : 0}

for model in models:
    model_df = pd.read_csv('{}{}_clean_20101_align.csv'.format(data_dir,model))
    for i in range(1, num_r_subsegs + 1):
        file_name = '{}coordinates_{}.csv'.format(data_dir, i)
        coordinates = pd.read_csv(file_name)
        lon_list = list(np.around(np.array(np.unique(coordinates.x.values)),3))
        lat_list = list(np.around(np.array(np.unique(coordinates.y.values)),3))
        df_sub = model_df.loc[(model_df.lon.isin(lon_list)) & (model_df.lat.isin(lat_list))]
        if (df_sub.shape[0] != 0):
            df_sub = df_sub[['lon', 'lat', 'pm25']]
            num_subsegs[model] += 1
            new_file = '{}{}_clean_20101_align.{}.csv'.format(data_dir, model, num_subsegs[model])
            df_sub.to_csv(new_file, index = False)

# Nationwide

In [46]:
data_dir = '../Data/nationwide/'
save_dir = '../Data/nationwide_subsets/'

In [None]:
av_2010 = pd.read_csv(data_dir + 'AV_2010_align.csv')

In [None]:
scott_2010 = pd.read_csv(data_dir + 'scott_2010_align.csv')

In [None]:
gbd_2010 = pd.read_csv(data_dir + 'GBD_2010_align.csv')

In [None]:
av_2010.shape, gbd_2010.shape, scott_2010.shape, test_scott_2010.shape

In [3]:
av_2010_2016 = pd.DataFrame(columns=['lon', 'lat', 'pred_AV', 'time'])
for i in range(2010, 2017):
    av_file_name = '{}AV_{}_align.csv'.format(data_dir, i)
    av_i = pd.read_csv(av_file_name)
    time_i = np.repeat(i, av_i.shape[0])
    av_i['time'] = time_i
    print (av_i.columns)
    av_i = av_i.rename(columns={"pm25": "pred_AV"})
    print (av_i.columns)
    av_2010_2016 = av_2010_2016.append(av_i)

av_2010_2016 = av_2010_2016.loc[av_2010_2016.pred_AV != -999.99]

Index(['lon', 'lat', 'pm25', 'time'], dtype='object')
Index(['lon', 'lat', 'pred_AV', 'time'], dtype='object')
Index(['lon', 'lat', 'pm25', 'time'], dtype='object')
Index(['lon', 'lat', 'pred_AV', 'time'], dtype='object')
Index(['lon', 'lat', 'pm25', 'time'], dtype='object')
Index(['lon', 'lat', 'pred_AV', 'time'], dtype='object')
Index(['lon', 'lat', 'pm25', 'time'], dtype='object')
Index(['lon', 'lat', 'pred_AV', 'time'], dtype='object')
Index(['lon', 'lat', 'pm25', 'time'], dtype='object')
Index(['lon', 'lat', 'pred_AV', 'time'], dtype='object')
Index(['lon', 'lat', 'pm25', 'time'], dtype='object')
Index(['lon', 'lat', 'pred_AV', 'time'], dtype='object')
Index(['lon', 'lat', 'pm25', 'time'], dtype='object')
Index(['lon', 'lat', 'pred_AV', 'time'], dtype='object')


In [4]:
gbd_2010_2016 = pd.DataFrame(columns=['time', 'lat', 'lon', 'pred_GS'])
for i in range(2010, 2017):
    gbd_file_name = '{}GBD_{}_align.csv'.format(data_dir, i)
    gbd_i = pd.read_csv(gbd_file_name)
    gbd_i = gbd_i.drop(columns=['Unnamed: 0'])
    print(gbd_i.columns)
    gbd_i = gbd_i.rename(columns={'pm25': "pred_GS"})
    print(gbd_i.columns)
    gbd_2010_2016 = gbd_2010_2016.append(gbd_i)

Index(['time', 'lat', 'lon', 'pm25'], dtype='object')
Index(['time', 'lat', 'lon', 'pred_GS'], dtype='object')
Index(['time', 'lat', 'lon', 'pm25'], dtype='object')
Index(['time', 'lat', 'lon', 'pred_GS'], dtype='object')
Index(['time', 'lat', 'lon', 'pm25'], dtype='object')
Index(['time', 'lat', 'lon', 'pred_GS'], dtype='object')
Index(['time', 'lat', 'lon', 'pm25'], dtype='object')
Index(['time', 'lat', 'lon', 'pred_GS'], dtype='object')
Index(['time', 'lat', 'lon', 'pm25'], dtype='object')
Index(['time', 'lat', 'lon', 'pred_GS'], dtype='object')
Index(['time', 'lat', 'lon', 'pm25'], dtype='object')
Index(['time', 'lat', 'lon', 'pred_GS'], dtype='object')
Index(['time', 'lat', 'lon', 'pm25'], dtype='object')
Index(['time', 'lat', 'lon', 'pred_GS'], dtype='object')


In [21]:
observed_2010_2016 = pd.DataFrame(columns = ['time', 'x', 'y', 'pm25_obs'])
for i in range(2010, 2017):
    obs_file_name = '{}pm25_observed_{}.csv'.format(data_dir, i)
    obs_i = pd.read_csv(obs_file_name)
    print (obs_i.columns)
    observed_2010_2016 = observed_2010_2016.append(obs_i)

Index(['time', 'x', 'y', 'pm25_obs'], dtype='object')
Index(['time', 'x', 'y', 'pm25_obs'], dtype='object')
Index(['time', 'x', 'y', 'pm25_obs'], dtype='object')
Index(['time', 'x', 'y', 'pm25_obs'], dtype='object')
Index(['time', 'x', 'y', 'pm25_obs'], dtype='object')
Index(['time', 'x', 'y', 'pm25_obs'], dtype='object')
Index(['time', 'x', 'y', 'pm25_obs'], dtype='object')


In [23]:
observed_2010_2016.shape

(5861, 4)

In [None]:
scott_2010_2016 = pd.DataFrame(columns=['time', 'lat', 'lon', 'pred_SC'])
for i in range(2010, 2017):
    scott_file_name = '{}scott_{}_align.csv'.format(data_dir, i)
    scott_i = pd.read_csv(scott_file_name)
    scott_i = scott_i.drop(columns=['Unnamed: 0'])
    print(scott_i.columns)
    scott_i = scott_i.rename(columns={'pm25': "pred_SC"})
    print(scott_i.columns)
    scott_2010_2016 = scott_2010_2016.append(scott_i)

In [5]:
cmaq_2010_2016 = pd.read_csv('{}mapped_cmaq.csv'.format(data_dir))

In [6]:
cmaq_2010_2016 = cmaq_2010_2016.drop(columns = ['GEOID'])

In [7]:
cmaq_2010_2016 = cmaq_2010_2016.rename(columns={"pm25": "pred_CM"})

In [8]:
cmaq_2010_2016 = cmaq_2010_2016.dropna()

In [9]:
cmaq_2010_2016.dtypes

lon        float64
lat        float64
pred_CM    float64
time       float64
dtype: object

In [10]:
cmaq_2010_2016 = cmaq_2010_2016.astype({'time': 'int64'})

In [11]:
cmaq_2010_2016.dtypes

lon        float64
lat        float64
pred_CM    float64
time         int64
dtype: object

In [12]:
av_2010_2016.shape[0]/7, gbd_2010_2016.shape[0]/7, cmaq_2010_2016.shape[0]/7

(8273037.0, 8348297.0, 8186013.0)

In [24]:
# tri-state bounding box 

min_lon = -124.41
max_lon = -114.13
min_lat = 32.53
max_lat = 42.01

In [27]:
def filter_state(df, min_lon, max_lon, min_lat, max_lat):
    out = df[(df.lat>=min_lat) & (df.lat<=max_lat) & \
    (df.lon>=min_lon) & (df.lon<=max_lon)]
    
    return out

In [28]:
observed_2010_2016_CA = filter_state(observed_2010_2016, min_lon, max_lon, min_lat, max_lat)

In [30]:
observed_2010_2016_CA.shape[0]

88.71428571428571

In [31]:
observed_2010_2016_CA.to_csv('CA_observed_2010_2016.csv', header = False)

In [14]:
av_2010_2016 = filter_state(av_2010_2016, min_lon, max_lon, min_lat, max_lat)
gbd_2010_2016 = filter_state(gbd_2010_2016, min_lon, max_lon, min_lat, max_lat)
cmaq_2010_2016 = filter_state(cmaq_2010_2016, min_lon, max_lon, min_lat, max_lat)

In [19]:
av_2010_2016.shape[0]/7, gbd_2010_2016.shape[0]/7, cmaq_2010_2016.shape[0]/7

(729301.0, 724588.0, 727962.0)

In [13]:
intersecting_av_gbd = pd.merge(av_2010_2016, gbd_2010_2016, how = 'inner', on=['time', 'lon', 'lat'])

In [14]:
intersecting_all = pd.merge(intersecting_av_gbd, cmaq_2010_2016, how = 'inner', on=['time', 'lon', 'lat'])

In [None]:
intersecting_av_scott = pd.merge(av_2010_2016, scott_2010_2016, how = 'inner', on=['time','lon', 'lat'])

In [None]:
intersecting_av_gbd_scott = pd.merge(intersecting_av_scott, gbd_2010_2016, how = 'inner', on=['time','lat','lon'])

In [None]:
intersecting_all = pd.merge(intersecting_av_gbd_scott, cmaq_2010_2016, how = 'inner', on=['time', 'lat', 'lon'])

In [16]:
intersecting_all.head()

Unnamed: 0,lon,lat,pred_AV,time,pred_GS,pred_CM
0,-95.145,49.375,5.1,2010,5.897139,7.13877
1,-95.145,49.365,5.1,2010,5.897139,7.13877
2,-95.135,49.365,5.1,2010,5.897139,7.13877
3,-95.125,49.365,5.2,2010,5.897139,7.13877
4,-95.115,49.365,5.2,2010,5.897139,7.13877


In [None]:
intersecting_av_gbd_scott.shape[0]/7

In [15]:
intersecting_all.shape[0]/7

8142295.0

In [17]:
intersecting_all.columns

Index(['lon', 'lat', 'pred_AV', 'time', 'pred_GS', 'pred_CM'], dtype='object')

In [None]:
intersecting_av_gbd_scott.shape[0]/7

In [None]:
intersecting_av_gbd_scott.head()

In [None]:
intersecting_av_gbd_scott.columns

In [18]:
all_pred_file = '{}USA_predictions_2010_2016_AV_GBD_CMAQ.csv'.format(data_dir)
intersecting_all.to_csv(all_pred_file, index = False)  

In [None]:
intersecting_av_gbd_scott.head()

In [None]:
intersecting_av_gbd_scott.shape

In [24]:
intersecting_av_gbd_scott = pd.read_csv('{}USA_predictions_2010_2016_AV_GBD_CMAQ.csv'.format(data_dir))

In [25]:
intersecting_av_gbd_scott.shape[0]/7

8142295.0

In [23]:
intersecting_all.shape[0]/7

8142295.0

In [51]:
intersecting_av_gbd_scott.columns

Index(['lon', 'lat', 'time', 'pred_AV', 'pred_GS', 'pred_CM'], dtype='object')

In [50]:
intersecting_av_gbd_scott.shape[0]/7

8142295.0

In [52]:
all_models = ['AV', 'GS', 'CM']
for m in all_models:
    pred_name = 'pred_{}'.format(m)
    pred_subset = intersecting_av_gbd_scott[['lon', 'lat', 'time', pred_name]]
    pred_subset = pred_subset.rename(columns={pred_name: "pm25"})
    new_file = 'USA_{}_2010_2016_align.csv'.format(m)
    print (new_file)
    pred_subset.to_csv(new_file, index = False)

USA_AV_2010_2016_align.csv
USA_GS_2010_2016_align.csv
USA_CM_2010_2016_align.csv


In [None]:
intersecting_av_gbd_scott.shape

In [None]:
intersecting_av_gbd_scott.shape

In [47]:
intersecting_av_gbd_scott.head()

Unnamed: 0,lon,lat,time,pred_AV,pred_GS,pred_CM
0,-124.195,42.005,2010,3.5,4.944805,5.367877
1,-124.185,42.005,2010,3.5,4.944805,5.367877
2,-124.175,42.005,2010,3.5,4.944805,5.367877
3,-124.165,42.005,2010,3.5,4.944805,5.367877
4,-124.155,42.005,2010,3.5,4.944805,5.367877


In [None]:
intersecting_av_gbd_scott.head()

In [36]:
one_year = intersecting_av_gbd_scott.loc[intersecting_av_gbd_scott.time == 2010]

In [31]:
one_year.shape[0]

722912

In [37]:
min_lon = min(one_year.lon)
max_lon = max(one_year.lon)
min_lat = min(one_year.lat)
max_lat = max(one_year.lat)

In [38]:
X_valid = np.asarray(one_year[["lon", "lat"]].values.tolist()).astype(np.float32)
X_scale = np.max(X_valid, axis=0) - np.min(X_valid, axis=0)

In [39]:
def round_nearest(x, a):
    return round(round(x / a) * a, -int(math.floor(math.log10(a))))

In [40]:
def round_location(locations):
    round_loc = [round_nearest(x, 0.005) for x in locations]
    round_loc = [x + 0.005 if (str(x)[-1] != '5' or len(str(x).split('.')[1]) !=  3) else x for x in round_loc]
    round_loc =  [round(x, -int(math.floor(math.log10(0.005)))) for x in round_loc]
    return round_loc

In [41]:
def get_splits(X_val, num_splits):
    split_range = [X_scale[0]/num_splits, X_scale[1]/num_splits]
    split_range = [round(elem, 4) for elem in split_range ]
    split_lon_range = split_range[0]
    split_lat_range = split_range[1]

    start_split_lon = []
    for i in range(num_splits):
        start_split_lon.append(min_lon + i*split_lon_range)

    start_split_lat = []
    for i in range(num_splits):
        start_split_lat.append(min_lat + i*split_lat_range)

    start_split_lon = round_location(start_split_lon)
    start_split_lat = round_location(start_split_lat)
    
    end_split_lon = start_split_lon[1:]
    end_split_lon.append(max_lon)
    start_split_lon = [x - 0.5 for x in start_split_lon[1:]]
    start_split_lon.insert(0, min_lon)

    end_split_lat = start_split_lat[1:]
    end_split_lat.append(max_lat)
    start_split_lat = [x - 0.5 for x in start_split_lat[1:]]
    start_split_lat.insert(0, min_lat)
    
    lon_range = list(zip(start_split_lon, end_split_lon))
    lat_range = list(zip(start_split_lat, end_split_lat))
    
    return lon_range, lat_range

In [42]:
lon_range, lat_range = get_splits(X_valid, 50)

In [43]:
intersecting_av_gbd_scott = intersecting_av_gbd_scott[['lon', 'lat', 'time', 'pred_AV', 'pred_GS', 'pred_CM']]

In [48]:
full_data = pd.DataFrame(columns=['lon', 'lat', 'time', 'pred_AV', 'pred_GS', 'pred_CM'])
model = intersecting_av_gbd_scott
all_models = ['AV', 'GS', 'CM']
subset_num = 0
subset_shapes = []

most_recent_subset = pd.DataFrame(columns=['lon', 'lat', 'pred_AV', 'pred_GS', 'pred_CM'])

for i in range(len(lat_range)):
    for j in range(len(lon_range)):
        df_subset = model.loc[(model.lon >= lon_range[j][0]) & (model.lon <= lon_range[j][1]) & (model.lat >= lat_range[i][0]) & (model.lat <= lat_range[i][1])]
        df_subset = df_subset[['lon', 'lat', 'time', 'pred_AV', 'pred_GS', 'pred_CM']]
        if (df_subset.shape[0] != 0):
            if (df_subset.shape[0] > 800):
                most_recent_subset = df_subset
                print (most_recent_subset.shape)
                full_data = full_data.append(most_recent_subset)
                subset_shapes.append(most_recent_subset.shape[0])
                subset_num += 1
                for m in all_models:
                    pred_name = 'pred_{}'.format(m)
                    pred_subset = most_recent_subset[['lon', 'lat', 'time', pred_name]]
                    print (pred_subset.columns)
                    pred_subset = pred_subset.rename(columns={pred_name: "pm25"})
                    print (pred_subset.columns)
                    new_file = '{}{}_2010_2016_align.{}.csv'.format(save_dir, m, subset_num)
                    print (new_file)
                    print (pred_subset.head())
#                     pred_subset.to_csv(new_file, index = False)
            else:
                print ("DF Subset: " + str(df_subset.shape))
                most_recent_subset = most_recent_subset.append(df_subset)
                print (most_recent_subset.shape)
                try:
                    subset_shapes.pop()
                except:
                    pass
                subset_shapes.append(most_recent_subset.shape[0])
                full_data = full_data.append(most_recent_subset)
                for m in all_models:
                    pred_name = 'pred_{}'.format(m)
                    pred_subset = most_recent_subset[['lon', 'lat', 'time', pred_name]]
                    pred_subset = pred_subset.rename(columns={pred_name: "pm25"})
                    print (pred_subset.columns)
                    new_file = '{}{}_2010_2016_align.{}.csv'.format(save_dir, m, subset_num)
#                     print (new_file)
                    print (pred_subset.head())
#                     pred_subset.to_csv(new_file, index = False) 

(917, 6)
Index(['lon', 'lat', 'time', 'pred_AV'], dtype='object')
Index(['lon', 'lat', 'time', 'pm25'], dtype='object')
../Data/nationwide_subsets/AV_2010_2016_align.1.csv
            lon     lat  time  pm25
8142164 -81.565  24.695  2010   7.7
8142165 -81.555  24.695  2010   7.7
8142166 -81.545  24.695  2010   7.7
8142167 -81.535  24.695  2010   7.7
8142168 -81.525  24.695  2010   7.7
Index(['lon', 'lat', 'time', 'pred_GS'], dtype='object')
Index(['lon', 'lat', 'time', 'pm25'], dtype='object')
../Data/nationwide_subsets/GS_2010_2016_align.1.csv
            lon     lat  time      pm25
8142164 -81.565  24.695  2010  6.099124
8142165 -81.555  24.695  2010  6.099124
8142166 -81.545  24.695  2010  6.099124
8142167 -81.535  24.695  2010  6.099124
8142168 -81.525  24.695  2010  6.099124
Index(['lon', 'lat', 'time', 'pred_CM'], dtype='object')
Index(['lon', 'lat', 'time', 'pm25'], dtype='object')
../Data/nationwide_subsets/CM_2010_2016_align.1.csv
            lon     lat  time      pm25
814216

KeyboardInterrupt: 

In [None]:
print (len(subset_shapes))
print (np.min(subset_shapes))

In [None]:
full_data.drop_duplicates(inplace = True)
# print (av_2010_2016.shape[0]/7)
print (full_data.shape)

# 1605 subsets or 1590?