-
Notifications
You must be signed in to change notification settings - Fork 1
/
iii_check_dupl_from_csv.py
262 lines (206 loc) · 10.3 KB
/
iii_check_dupl_from_csv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
# USED
# Check for duplicates in data for climatology
import pandas as pd
import numpy as np
from tqdm import trange
from copy import deepcopy
import os
variable_name = 'Sal' # 'Sal' 'Oxy'
# Now with the one big csv file
infile = 'C:\\Users\\HourstonH\\Documents\\NEP_climatology\\data\\' \
'profile_data_tables\\ALL_Profiles_{}_1991_2020.csv'.format(
variable_name)
duplicate_folder = 'C:\\Users\\HourstonH\\Documents\\NEP_climatology\\data\\' \
'profile_data_tables\\duplicates_flagged\\'
# infile = 'C:\\Users\\HourstonH\\Documents\\NEP_climatology\\data\\' \
# 'profile_data_tables\\Argo\\NODC_noCAD_PFL_Profiles_Oxy_1991_2020.csv'
df_all = pd.read_csv(infile)
print(df_all.head())
print(df_all.shape)
df_all.drop(columns=['Unnamed: 0'], inplace=True)
###############################################
# # Check for NaN values
# nan_ind = np.where(pd.isna(df_all.Date_string))
# np.where(pd.isna(df_all))
# # These are equal, so only these rows contain NaNs
#
# print(df_all.loc[nan_ind, ('Latitude', 'Longitude')])
#
# # Check MEDS file for NaN times
# fmeds = 'C:\\Users\\HourstonH\\Documents\\NEP_climatology\\data\\meds_data_extracts\\' \
# 'bo_extracts\\MEDS_19940804_19930816_BO_DOXY_profiles_source.csv'
# dmeds = pd.read_csv(fmeds)
#
# dmeds['Hour'] = dmeds.Time.astype(str).apply(lambda x: ('000' + x)[-4:][:-2])
# dmeds['Minute'] = dmeds.Time.astype(str).apply(lambda x: ('000' + x)[-4:][-2:])
#
# dmeds['Timestring'] = pd.to_datetime(
# dmeds[['Year', 'Month', 'Day', 'Hour', 'Minute']]).dt.strftime(
# '%Y%m%d%H%M%S')
############################################
# Find duplicates
# Exact duplicates
df_all['Exact_duplicate_row'] = df_all.duplicated(
subset=['Instrument_type', 'Date_string', 'Latitude', 'Longitude'])
# Accounting statistics
print(len(df_all['Exact_duplicate_row'].iloc[(df_all['Exact_duplicate_row'] == True).values]))
print(len(df_all['Exact_duplicate_row'].iloc[(df_all['Exact_duplicate_row'] == False).values]))
# edf: Exact duplicates flagged
# edf_name = 'C:\\Users\\HourstonH\\Documents\\NEP_climatology\\data_extracts\\' \
# 'duplicates_flagged\\ALL_Profiles_Oxy_1991_2020_edf.csv'
edf_name = duplicate_folder + os.path.basename(infile).replace('.csv', '_edf.csv')
df_all.to_csv(edf_name)
# ------------------------------------------------------------------------------
# How to specify to keep CTD data over BOT data regardless of order in the df?
# Sort df by instrument type before running duplicated(); sort reverse order to
# have CTD before BOT
# Check for exact duplicates between CTD and bottle data
# df_all['CTD_BOT_duplicate_row'] = df_all.sort_values(
# by=['Instrument_type'], ascending=False).duplicated(
# subset=['Date_string', 'Latitude', 'Longitude'])
# Alternative method attempt:
df_all['CTD_BOT_duplicate_row'] = np.repeat(False, len(df_all))
# for i in trange(len(df_all)):
# if df_all.Instrument_type == 'BOT':
# bot_row_info = df_all.loc[i, ['Date_string', 'Latitude', 'Longitude']]
# for j in range(len(df_all)):
# # Check all the other rows
# row_info = df_all.loc[j, ['Date_string', 'Latitude', 'Longitude']]
# if df_all.loc[j, 'Instrument_type'] == 'CTD' and bot_row_info == row_info:
# df_all.loc[i, 'CTD_BOT_duplicate_row'] = True
for i in trange(len(df_all)):
if df_all.loc[i, 'Instrument_type'] == 'BOT':
mask = (df_all.loc[:, 'Instrument_type'] == 'CTD') & \
(df_all.loc[:, 'Date_string'] == df_all.loc[i, 'Date_string']) & \
(df_all.loc[:, 'Latitude'] == df_all.loc[i, 'Latitude']) & \
(df_all.loc[:, 'Longitude'] == df_all.loc[i, 'Longitude'])
if len(mask) > 0:
df_all.loc[i, 'CTD_BOT_duplicate_row'] = True
# Exclude exact duplicates from previous step by subsetting by the negation of
# Exact_duplicate_row
# [mask, column_name]
df_all.loc[df_all['Exact_duplicate_row'].values, 'CTD_BOT_duplicate_row'] = False
# Accounting statistics
print(len(df_all))
print(len(df_all.iloc[df_all['Exact_duplicate_row'].values]))
print(len(df_all.iloc[df_all['CTD_BOT_duplicate_row'].values]))
# Instrument types of rows that were flagged as a CTD-bottle duplicate - should be bot??
print(np.unique(df_all.loc[df_all['CTD_BOT_duplicate_row'].values, 'Instrument_type']))
print(np.unique(df_all.loc[:, 'Instrument_type']))
# CTD-BOT exact duplicates flagged
cb_edf_name = edf_name.replace('edf', 'cb_edf')
df_all.to_csv(cb_edf_name, index=False)
############################################
# For speeding up inexact duplicate checking
def pdt_inexact_dupl(df):
# "|" represents the inclusive "or"
subsetter = (
~(df['Exact_duplicate_row'] | df['CTD_BOT_duplicate_row'])).values
# How many rows subsetter removes
print(len(subsetter[subsetter]), len(subsetter[~subsetter]))
# deepcopy to help avoid memory problems
df_copy = deepcopy(df.loc[subsetter, :])
# Convert time to pandas datetime format
df_copy['Time_pd'] = pd.to_datetime(df_copy.loc[:, 'Date_string'],
format='%Y%m%d%H%M%S')
# Initialize column for inexact duplicates
df_copy['Inexact_duplicate_row'] = np.repeat(False, len(df_copy))
# Get array of indices of dataframe, since they are not linearly spaced
# e.g., 1,2,3,5,6,7,11,...
ind = df_copy.index.values
# Set ranges (limits) for inexact duplicate checking
latlon_lim = 0.01 # decimal degrees
t_lim = pd.Timedelta(1, unit='h') # hours
# Iterate through dataframe to check validity of inexact duplicate checking
# Partner index is the index of the row that other row(s) are flagged as a
# duplicate of
df_copy['Partner_index'] = np.repeat(-1, len(df_copy))
for i in trange(len(df_copy)):
# Create masks to check for values in between selected ranges
mask_lat = df_copy.loc[:, 'Latitude'].between(
df_copy.loc[ind[i], 'Latitude'] - latlon_lim,
df_copy.loc[ind[i], 'Latitude'] + latlon_lim,
inclusive='both')
mask_lon = df_copy.loc[:, 'Longitude'].between(
df_copy.loc[ind[i], 'Longitude'] - latlon_lim,
df_copy.loc[ind[i], 'Longitude'] + latlon_lim,
inclusive='both')
mask_time = df_copy.loc[:, 'Time_pd'].between(
df_copy.loc[ind[i], 'Time_pd'] - t_lim,
df_copy.loc[ind[i], 'Time_pd'] + t_lim,
inclusive='both')
# Mask to check for same instrument
instrument_type = df_copy.loc[ind[i], 'Instrument_type']
mask_inst = df_copy.loc[:, 'Instrument_type'] == instrument_type
# Perform intersection of masks (set 'and')
mask_llt = mask_lat & mask_lon & mask_time & mask_inst
# Exclude the first True occurrence and flag its inexact duplicates
# Make sure that the change "sticks"
# Find index of first occurrence of "True"
# IndexError: index 0 is out of bounds for axis 0 with size 0 for row 50165
# Need to search for rows that have Time_pd == NaT (b/c of Date_string == NaN)
# print(ind[i], len(mask_llt.loc[(mask_llt == True).values]))
if len(mask_llt.loc[(mask_llt == True).values]) > 1:
# Note down partner index?
first_true_ind = mask_llt.loc[mask_llt == True].index[0]
remainder_true_ind = mask_llt.loc[mask_llt == True].index[1:]
df_copy.loc[remainder_true_ind, 'Partner_index'] = first_true_ind
# Union intersect (set inclusive "or") with the "all" mask
df_copy.Inexact_duplicate_row = df_copy.Inexact_duplicate_row | mask_llt
else:
# Set non-duplicate row flag to False
# Don't need to intersect with the Inexact_duplicate_row column
# because there's nothing new to add
first_true_ind = mask_llt.loc[mask_llt == True].index[0]
mask_llt.loc[first_true_ind] = False
# Accounting statistics
print(len(df_copy.Inexact_duplicate_row))
print(len(df_copy.Inexact_duplicate_row.iloc[df_copy.Inexact_duplicate_row.values]))
# Print the number of non-first occurrences of nonexact duplicates
print(len(df_copy.iloc[(df_copy.Partner_index != -1).values]))
# Remove column
df_copy = df_copy.drop(columns='Time_pd')
# Intersect the ie_subset dataframe with the rows that weren't part of the subset
# Get inverse of subsetter
df_copy_inv = df[~subsetter]
# Add columns to df_copy_inv that are in df_copy
df_copy_inv.insert(len(df_copy_inv.columns), 'Inexact_duplicate_row',
np.repeat(False, len(df_copy_inv)))
df_copy_inv.insert(len(df_copy_inv.columns), 'Partner_index',
np.repeat(-1, len(df_copy_inv)))
df_all_out = pd.concat([df_copy, df_copy_inv])
return df_all_out
# # ie_subs stands for inexact subset
# # pi stands for PARTNER INDEX (all duplicate rows are flagged including first occurrence
# # Will need to intersect these rows and the rows of the *cb_edr.csv dataframe
# df_copy_name = 'C:\\Users\\HourstonH\\Documents\\NEP_climatology\\data_extracts\\' \
# 'duplicates_flagged\\ALL_Profiles_Oxy_1991_2020_ie_subs_001ll_pi.csv'
#
# df_out = pdt_inexact_dupl(df_all)
#
# df_out.to_csv(df_copy_name)
# cb_edf_name = 'C:\\Users\\HourstonH\\Documents\\NEP_climatology\\data\\' \
# 'profile_data_tables\\duplicates_flagged\\' \
# 'ALL_Profiles_Oxy_1991_2020_cb_edf.csv'
duplicate_folder = 'C:\\Users\\HourstonH\\Documents\\NEP_climatology\\data\\' \
'profile_data_tables\\duplicates_flagged\\'
# df_all = pd.read_csv(cb_edf_name)
#
# df_out = pdt_inexact_dupl(df_all)
#
# which = 'ALL' # 'PFL' # ALL
#
# df_all_out_name = duplicate_folder + '{}_Profiles_{}_1991_2020_ie_001ll_pi.csv'.format(
# which, variable_name)
#
# df_out.to_csv(df_all_out_name)
for varname in ['Temp', 'Sal']: #
cb_edf_name = 'C:\\Users\\HourstonH\\Documents\\NEP_climatology\\data\\' \
'profile_data_tables\\duplicates_flagged\\' \
'ALL_Profiles_{}_1991_2020_cb_edf.csv'.format(varname)
df_all = pd.read_csv(cb_edf_name)
df_out = pdt_inexact_dupl(df_all)
df_all_out_name = duplicate_folder + 'ALL_Profiles_{}_1991_2020_ie_001ll_pi.csv'.format(
varname)
df_out.to_csv(df_all_out_name) # Include index!
##################################