From 64144ccd38925742fe8ab045ef32e72178da59f4 Mon Sep 17 00:00:00 2001 From: Matthew Gidden Date: Mon, 9 Jul 2018 15:33:30 +0200 Subject: [PATCH 1/4] add ability to read rcp db data --- pyam/utils.py | 24 +++++++++++++----- .../data/test_RCP_database_raw_download.xlsx | Bin 0 -> 6450 bytes tests/test_core.py | 12 +++++++-- 3 files changed, 27 insertions(+), 9 deletions(-) create mode 100644 tests/data/test_RCP_database_raw_download.xlsx diff --git a/pyam/utils.py b/pyam/utils.py index f6f27582a..3d82a1eca 100644 --- a/pyam/utils.py +++ b/pyam/utils.py @@ -146,25 +146,35 @@ def read_files(fnames, *args, **kwargs): for fname in fnames: logger().info('Reading `{}`'.format(fname)) df = read_pandas(fname, *args, **kwargs) - dfs.append(format_data(df)) + df = format_data(df) + dfs.append(df) return pd.concat(dfs) def format_data(df): """Convert an imported dataframe and check all required columns""" + # all lower case + df.rename(columns={c: str(c).lower() for c in df.columns}, inplace=True) + + if 'notes' in df.columns: # this came from the database + logger().info('Ignoring notes column in dataframe') + df.drop(columns='notes', inplace=True) + # model for SSPs, scenario for RCPs + col = 'model' if 'model' in df else 'scenario' + df = df[~df[col].str.contains('database', case=False)] + if 'scenario' in df.columns and 'model' not in df.columns: + # model and scenario are jammed together in RCP data + scen = df['scenario'] + df['model'] = scen.apply(lambda s: s.split('-')[0].strip()) + df['scenario'] = scen.apply( + lambda s: '-'.join(s.split('-')[1:]).strip()) # format columns to lower-case and check that all required columns exist - df.rename(columns={c: str(c).lower() for c in df.columns}, inplace=True) if not set(IAMC_IDX).issubset(set(df.columns)): missing = list(set(IAMC_IDX) - set(df.columns)) raise ValueError("missing required columns `{}`!".format(missing)) - if 'notes' in df.columns: - logger().info('Ignoring notes column in dataframe') - df.drop(columns='notes', inplace=True) - df = df[~df.model.str.contains('database', case=False)] - # check whether data in IAMC style or year/value layout if 'value' not in df.columns: numcols = sorted(set(df.columns) - set(IAMC_IDX)) diff --git a/tests/data/test_RCP_database_raw_download.xlsx b/tests/data/test_RCP_database_raw_download.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..5ccad2310c55c944b3aa33dfa7b63c32c0427f8a GIT binary patch literal 6450 zcmaJ_1z1#Fx2D6PbLdo%ZjkP7h8)6S7&?arkq!}Q5D^4PsR2PsN`{b`j|GhFj@R=SYm2uU=y{fLqkWx_;by$h@6oth~zUNI{&cHXacK>Qv4LHhld{| zRdUf3j>n048j0#&l~%k-$rr%T;sTP>Zwwu7<{vml&9pg5p+l9^z3xY>>7JPN3_|nP zMo0@=>6QDXjIHqwXot`k4;wq|pAMTR2^A-c&Wqn9w%poVZSZ(l)( zx3@r$hkJgifm@FdQQz)4RrgWuJRz97&*vjhhPp{bA^?D>dBc3AQJr~&8bcNC1WU@v z?PnaF1^6^cb!Y1CF;(!Y-`eAGUL|D;-nAPNze}NATT09<4Ubc8T1oEGV?)N^8K)7- z`+rFQN@m{i$Kf)`@+im`*44Mqo{Eg~5xeywI+rP^gc!;ni`(rs4r|94WbyGu{#5oi zR5&e7xndMG*kiVSL!smy!haqydD0(wzcq~fLRj4`vo|kc{@Eo?Z5QvRimAEBOm#s2 z*zjJfXZ%TL3mxzPU{{zrJMwyF%Dq~ELEp4vDgO%)p7u4#CH`AWQ(90aA!6?9`kuZ0W7AqDNEOtIt{#l&thiu7PwjjcyCbzDLk)6DnM`1I2otje5^k9wVrn zhr5GDHR+L{#HLxEsMioM*))zm8&hk=o5EQooBL`XY56fyklu;77P%%mKG4moZt{FS zUkQuKd;xHD9)w#d;tigLQp}hYz$&MT)M{P%Zi^gG3(wA=Qkc*KvRQwoX&+wyON}#+ z{dRYa<5qXGJ`C7k1vyg85YW0oq-l&KEntr7J7Tw16F9aQ-g{;(h>_Z*sDE# z0--gRcAFRV_92OLK~%RvG_EwItc@+VKOsd^uR=a*$ANr@(&^j0HI}>wSdvFqwGq3T zdGMc%F3KyZNvn_3LZX}Me)KHCY?+6Hm7jj^QTQo*j(4+1 z6)~V>*!BLP{?VD~ulFd(UGVQf5A=e&*?W1p-R#rfL7%2~Ln%uEdt|DNP066Pe#PA2+vpu5!Qlg~sXWXN z)Fw4x>^GjghHclrTnC*aCb26HEf5+YCj0==g7Pp0`MY4)fhh9zTiAOXnv+vQ@eY+Gj)SEP>2+S0K9AxKEKK#DH~}RFzqfiBWn<~2S z)KM>7MfSXP7u6n>i+bs)XjC#?wys(B@vkP~cK|Tvef~>sf;$gC#M(0Woj_VCj~PwD417yj8AGqIhienEqhN|4YDZ!3#LHzg--^sr2rx%5A?WDro{<5v`;T z)`sssQKzYFLtdft)&HwFwqGN|9A;y*K4*kl$;ToNi!=MhPE=s`1%j-Z)KPebLF;bvSuq|5M5% z{@ZDM{es#jZ{5st&nOi)xV{=KDpfqfJmUALv>c*p&d$JsXS=F!JGi2T7piIwq#3`li^q%<~7U|{J;etw;OKrG5f7CYL%43V6d z^RbzWkv2uV(~$y~GAaUaUOd1#O>j*V$$QC&Iw@t9u#AE$Kc(4bU>!owj?h@2#v@>mCZ0n+|XzoHbDG{!q*zp9Hxo?wuBRQ z&lb>5WBpk(tHFj4pScoSsUJ6Zdb*5i(ltOhqnJG{a(iF7a8LR2%PmWqnk}V5`^xTq zRpg4u>+c!QShqcgR3`o0o+ql~70SNt@!=QYc+I?!=tnb{Yw~@le6dsBRC?}-YmZ0y z*Wu;Mge{gnck+fn52ilGe;?Z%j@}`ffkR5Wh0{bN+Il}zr<0Z}lBR4sQGE~Gn8KTO zMINTfnl8>BDsSz2=yiQ)x*}=Xd5CP5#lJ+o(ZsfRBR6Fbj_)?V^)OEETkAf@k47Dv z&c&pIt08z$-iO7-rVlee_N`IYMbVN4lP8BGWg#BJd9JvVA+__>>gsi|SceEyyuyeE z#sb_!f(t}$)>za!{oV)m+6>{JOE~5WYB-8x%AInWj9B_!5a&Aml3p&_RRG6vq8=js zO^Eo#*z|<|B18t2ze5l31-K{lGZlRfD4NkWDX?}rj=XyYxQVcDyjmVy*U?pa{Z0DM z)fezrTK08@!Tfv$|Me$)6KoykI)MvPK=fGc-hNRPox3fe9~Nc0BQCgzC?NcK&Tw1YVa))p#=@5 zy6_Mj)Po%92YDoDS{`&tu|kv_?U=|*(7pStPgNw1DqHqs>F$Ve)b{~$d-O-t)T#LT z`ZHn+IYr{@r@x(DJ$p)?-8PGiJ)>rjO}5nJlKtc;WbIYGM?uhh?6%yE)_ce`luFzySc+?nTaGNDj^czhFW7hXw$Yy@Ox-3gdkk> z?bAjlr?G>CUGpejdL9#Px5+fZB?God5A836LHrhC(0jO`M?07gmd}L^yb5*4)BfsL z0PS(G&fkd~cM>zx$VjYaak(|>0#a!HR7sU5+v(u;f|*!V><$XFMfQ$~_I779D9#h{ zEk8=j)|nQXTP7VoM<{+uvt4j(y^(deEzEfrRJ(KDc#LFx{|a%K)V^0!F$Xaxoya|O zB6ex({QhE4d9H&sWSPXMg#^=2!$WLUKg=UV`r!E%)ix7D1@TV0)Eg%L*ZKlTPkJKRNj6(`DQ#4jnoJIenRp+dVbV znrhbcpxX7HGT+VgaieqeBif_G4g}^v?k7j$;$;IZAj_S2LE~4eM1BtsiTrdK(N=wi zpKEiIM8&Z6`tRjN-hoCq7LE=q<-Vza9k~)wz4xCQ4xJ7g-ZV$Qf5(L92+V=rkBq#H zX9jE$Y%;aodD=$CXFiM^Fy_LP!w-2o8U9@R9D>v9^07Jwa5ChpHYx0E!{eHWjnifi z@$G526{z1E7r8pkt=i=;FSX0!0Byv9z;MF5d>>v!3#N;k3rTqGaH?mU;n|b8Hd}>i z_;>Itwo!v*>&?mJc3!P8Jg@dts}ePVW*aBb(# zm|O3p0C>2pX5O^c+)6<1zM2wVn{b{xW{al&w&k1;I!B6Y2~9N6yQ)#sC4L0y@Db6= zD7HrW(F);veUYmIdbjY*%{MHX%9WLEge}Z9I|y!?9Td{tA>HxX ze(IrvhZQ_;-+Y40*E)kPKhMh^Q}{VzY#~(IN=2I$aYlWgJzbb~P;G5%e?g`uLITt) zmK9|KGSfAex4Q~JG-Ar~9I3%Pbr&n<{KcH;W>nX8m!Cfa(TTT!v}`sKCoIP3L@lkn z&aSlNimgGc>9In*?aio(%VX3z-Ukv7yE@NIIeiHZ7R!p9;4!>TZ zGvx&ml%>xoY2ck6_Yg(%SXAF%pMF;&8ArJSO~GXsPNd()>YX+$OpHv$TL;06_w4xa zIQFTq^+fSZD(R|$rivW58w0$oA!42kSj}D=c?yU$d0~@?iZ6alg!ceCFUo{qimO16 z#>}XTc=pSQ3^BP)#*5?f9`yTa#>CI1N1`In%MWxr9kz1V5%NR~}rjX-#J zF}*CAl>KYOg7swpu_qPOUckdxGACc%Y6()(iy4LJ=g)_1)63R73pnslGEYz;O6|#K z`q#apV?gZ$c`F@kB0a!PIHFkKu*MF*WPvqo|mdxcJ@EzF{AYyrU3ht!M z6HBi05(iVh>Gij{SXQ20W{%V&SXrsW)~BZHqasXtB|vsAdG6rHX6J(<5w)hR z_-v~z?(k{PK}9haAqJ-`@QKk69tzL$l3-F?a0S$dN=9*Jnc%EBcLq3LJ_)$&8`nw8 zc~$pXH~0WlAgAHXXs9mMou}7 z=MbZpr&k2b8f^c~Mo2&UWT$iew40%lB96940&))}#hid^Zfv37Vv59t4w-Q~&n{Q* z;O6u_#0E3ipBPu#rje-FKPQL=a;a7`Aw%g0w}#}BTf5z~?`077Y}LVcUU+}fYwY=W zhs=}hspfDL}&Io~kXs9WVcxs;ozeP?Uc zkupww;?7Y=Zu>UhzGn6d&4!Dqwq$q4ndMk^7ghTbMNH+cc(w3bZQ^;l`u+$Gw)N;T zwr(qwD$8I35n=ltC#))LZ5~_zFJB458=re0_%W{VR{D29aa&kt(l547qk-Jki zCkq%-wQW(iPZH`792+m*ih|Cf$rCjk7ubO$_f-mbTa*`rR9rP4z24qb(PYX%hFYk6 zJxMi*qI=YKiYA^*i1kxn78o|lyA5zRivL7{t2%qW5TrpNnGxS4WuPf|Zdb(dT%Xce z!YZFt$|K|q`c2JzT&gPTkd3%7AD==L}@O^QbC0F1E} zih2x*^bIIJVL-jgAq19mZNIEnQn^bpCX}Vr#02+BfA1Dr<2hnCk;8$A?Fb*T4G}tf zRz9gEmA;VKUa(RsvD#AiVJ)EUGrFOJcJn!pKG9zIXF=ecj>OOG zVnLkUyy%+hr&`_NeA{eP-vAFY63itHx2wRGy^b=*v(-&mnu~W5)u?t%uoUw5zQ<+q zd$aqy`vmYr`5BMsC&Cd6iMR8zVE(v^NklR!v!^h=V6Hu_h~Q~|xxEwvFmuza3}AtH z*L@@URWi%*({SvYq86X*^X+C_3=9gQe@gt6*NGpj;^pZF^YpU=1$)DMZT?lyW-94i zYp$`^@{10l{&@{P`9}}ZC=Gs#F^Y^W&rMGXM|<>DD3@CnyE53S-GSiQY3E>cNiJ#F zQxA;65c9__Mu|juC;e~BXatAjFU&@qGF)4U73of;E7cgFEg9*4fQ}+0zW0OrgZZif zu)Bj{NDY1J94wflF_7GP1bRT$@b!_8GqD38cHijO5bz^$JdJOXr~X_}$$nNjcHFXt zeT++T^rC1@#@IMnZhAq{{1fY!CC=4jtT5KWezmVMgd66(RgrRwq1jYIb(0Z;*kCMm z!28#Y8d6MGVRI$&=~8}e&I+=KrFJ{6ui?ET%VA$E@0utY##u!S^rSRyX8GfvQ8hD- zmu?krkxR1Ra=sVo*dtT8il=am#;o&$nka61;<5N;>YZXtBtr<-4Y zZ4GrWv8XV9$=ko{yEpRo|Hf}(_n(Hp>zz09?=R3@zv@ph_)qKKmAD&C^cR$`6{!EQ z{%?i!PxIg9svEiP7uK)4rT;A6{W;F>3dW7N^&74o^m?4%rLI4Xe{ariM2=q&$N!h{ zFUjLi)8A{Ko9^uwp5FSO_x_(=?$1$v52Twq=@)#j7jr#(-#@FSKS%i8Gj0Oz7u>1; Y-|#ck!Nt3Ii17MQxZWu=&CT6^0h%HmEC2ui literal 0 HcmV?d00001 diff --git a/tests/test_core.py b/tests/test_core.py index 28e2aea99..3fc4740ac 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -236,7 +236,8 @@ def test_check_aggregate_pass(meta_df): 'region': 'World', 'variable': 'Primary Energy|Gas', 'unit': 'EJ/y', 'year': [2005, 2010], 'value': [.5, 3]}) meta_df.data = meta_df.data.append(df, ignore_index=True) - obs = meta_df.filter(scenario='a_scenario').check_aggregate('Primary Energy') + obs = meta_df.filter( + scenario='a_scenario').check_aggregate('Primary Energy') assert obs is None @@ -258,7 +259,7 @@ def test_category_pass(meta_df): exp = pd.DataFrame(dct).set_index(['model', 'scenario'])['category'] meta_df.categorize('category', 'foo', {'Primary Energy': - {'up': 6, 'year': 2010}}) + {'up': 6, 'year': 2010}}) obs = meta_df['category'] pd.testing.assert_series_equal(obs, exp) @@ -294,6 +295,13 @@ def test_load_SSP_database_downloaded_file(test_df): pd.testing.assert_frame_equal(obs_df.as_pandas(), test_df.as_pandas()) +def test_load_RCP_database_downloaded_file(test_df): + obs_df = IamDataFrame(os.path.join( + TEST_DATA_DIR, 'test_RCP_database_raw_download.xlsx') + ) + pd.testing.assert_frame_equal(obs_df.as_pandas(), test_df.as_pandas()) + + def test_append(test_df): df2 = test_df.append(other=os.path.join( TEST_DATA_DIR, 'testing_data_2.csv')) From 102f318e2f7d403a9c90a7c49fe7afcb25b99150 Mon Sep 17 00:00:00 2001 From: Matthew Gidden Date: Tue, 10 Jul 2018 08:27:02 +0200 Subject: [PATCH 2/4] fix database column, always first --- pyam/utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pyam/utils.py b/pyam/utils.py index 3d82a1eca..f66f806fa 100644 --- a/pyam/utils.py +++ b/pyam/utils.py @@ -160,8 +160,7 @@ def format_data(df): if 'notes' in df.columns: # this came from the database logger().info('Ignoring notes column in dataframe') df.drop(columns='notes', inplace=True) - # model for SSPs, scenario for RCPs - col = 'model' if 'model' in df else 'scenario' + col = df.columns[0] # first column has database notes df = df[~df[col].str.contains('database', case=False)] if 'scenario' in df.columns and 'model' not in df.columns: # model and scenario are jammed together in RCP data From ca65e0a40354b3164b3d3acf92741820b477fc2f Mon Sep 17 00:00:00 2001 From: Matthew Gidden Date: Tue, 10 Jul 2018 08:28:52 +0200 Subject: [PATCH 3/4] add release notes and update docs --- RELEASE_NOTES.md | 1 + pyam/core.py | 23 ++++++++++++++--------- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index 8f79b0517..e3ee214b3 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -1,5 +1,6 @@ # Next Release +- (#70)[https://github.com/IAMconsortium/pyam/pull/70] Support reading of both SSP and RCP data files downloaded from the IIASA database. - (#66)[https://github.com/IAMconsortium/pyam/pull/66] Fixes a bug in the `interpolate()` function (duplication of data points if already defined) - (#65)[https://github.com/IAMconsortium/pyam/pull/65] Add a `filter_by_meta()` function to filter/join a pd.DataFrame with an IamDataFrame.meta table diff --git a/pyam/core.py b/pyam/core.py index b88aecf40..f9e5f8c0e 100644 --- a/pyam/core.py +++ b/pyam/core.py @@ -51,7 +51,12 @@ def __init__(self, data, **kwargs): ---------- data: ixmp.TimeSeries, ixmp.Scenario, pd.DataFrame or data file an instance of an TimeSeries or Scenario (requires `ixmp`), - or pd.DataFrame or data file with IAMC-format data columns + or pd.DataFrame or data file with IAMC-format data columns. + + Special support is provided for data files downloaded directly from + IIASA SSP and RCP databases. If you run into any problems loading + data, please make an issue at: + https://github.com/IAMconsortium/pyam/issues """ # import data from pd.DataFrame or read from source if isinstance(data, pd.DataFrame): @@ -298,11 +303,11 @@ def set_meta(self, meta, name=None, index=None): # reduce index dimensions to model-scenario only _meta = ( - _meta - .reset_index() - .reindex(columns=META_IDX + [name]) - .set_index(META_IDX) - ) + _meta + .reset_index() + .reindex(columns=META_IDX + [name]) + .set_index(META_IDX) + ) # raise error if index is not unique if _meta.index.duplicated().any(): @@ -499,9 +504,9 @@ def check_aggregate(self, variable, components=None, units=None, # filter and groupby data, use `pd.Series.align` for machting index df_variable, df_components = ( - _aggregate_by_variables(self.data, variable, units) - .align(_aggregate_by_variables(self.data, components, units)) - ) + _aggregate_by_variables(self.data, variable, units) + .align(_aggregate_by_variables(self.data, components, units)) + ) # use `np.isclose` for checking match diff = df_variable[~np.isclose(df_variable, multiplier * df_components, From f3d6b4ba30983d4116a353df3b3bdad043a5d662 Mon Sep 17 00:00:00 2001 From: Matthew Gidden Date: Mon, 16 Jul 2018 12:01:36 +0200 Subject: [PATCH 4/4] update comment --- pyam/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyam/utils.py b/pyam/utils.py index f66f806fa..fcb20f955 100644 --- a/pyam/utils.py +++ b/pyam/utils.py @@ -160,7 +160,7 @@ def format_data(df): if 'notes' in df.columns: # this came from the database logger().info('Ignoring notes column in dataframe') df.drop(columns='notes', inplace=True) - col = df.columns[0] # first column has database notes + col = df.columns[0] # first column has database copyright notice df = df[~df[col].str.contains('database', case=False)] if 'scenario' in df.columns and 'model' not in df.columns: # model and scenario are jammed together in RCP data