# Collect data:
To acquire the Dst data I used davitpy (https://github.com/vtsuperdarn/davitpy), specifically its plotGMagIndices script. <br>
For the OMNI bx data I selected data from https://omniweb.gsfc.nasa.gov/form/dx1.html <br>
Some of the data in the second source is missing, so I only selected the data from 2000 to 2016.

DST

In [None]:
%pylab inline
import datetime as dt
import numpy as np
import matplotlib.pyplot as mp
import pickle

from davitpy import pydarn
from davitpy import gme

In [None]:
from davitpy.gme.base.gmeBase import gmeData

class dstRec(gmeData):
    """A class to represent a single record of Dst data.

    Extends class gme.base.gmeBase.gmeData. Note that Dst data is available
    from 1980-present day (or whatever the latest WDC has uploaded is).
    **The data are 1-hour values**.  Information about Dst can be found
    here: http://wdc.kugi.kyoto-u.ac.jp/dstdir/dst2/onDstindex.html

    Parameters
    ----------
    webLine : Optional[str]
        an ASCII line from the datafile from WDC. If this is provided, the
        object is initialized from it.  default = None
    dbDict : Optional[dict]
        a dictionary read from the mongodb. If this is provided, the
        object is initialized from it.  default = None

    Attributes
    ----------
    time : datetime
        an object identifying which time these data are for
    dataSet : str
        a string indicating the dataset this is from
    info : str
        information about where the data come from.  *Please be courteous
        and give credit to data providers when credit is due.*
    dst : float
        the actual dst value

    Notes
    -----
    If any of the members have a value of None, this means that they
    could not be read for that specific time.

    In general, users will not need to worry about this.

    Methods
    -------
    parseWeb

    Example
    -------
        emptyDstObj = gme.ind.dstRec()

    or

        myDstObj = dstRec(webLine=awebLine)

    written by AJ, 20130131
    """

    def __init__(self, webLine=None, dbDict=None):
        """The initialization function for a :class:`gme.ind.dst.dstRec` object.

        Parameters
        ----------
        webLine : Optional[str]
            an ASCII line from the WDC data file; parsed with parseWeb
            when given.
        dbDict : Optional[dict]
            a database dictionary; handed to parseDb when given.

        written by AJ, 20130131

        """
        self.dataSet = 'Dst'
        self.time = None
        # note about where data came from
        self.info = 'These data were downloaded from WDC For Geomagnetism, Kyoto.  *Please be courteous and give credit to data providers when credit is due.*'
        self.dst = None

        # if we're initializing from an object, do it!
        # Identity test: only an explicit None means "argument not supplied".
        if webLine is not None:
            self.parseWeb(webLine)
        if dbDict is not None:
            # parseDb is not defined in this class; presumably inherited
            # from gmeData -- TODO confirm.
            self.parseDb(dbDict)

    def parseWeb(self, line):
        """Fill this dstRec object from one line of WDC dst data.

        Parameters
        ----------
        line : str
            the ASCII line from the WDC data file

        Returns
        -------
        Nothing

        Notes
        -----
        The first whitespace-separated column is read as a date
        (characters 0-3 year, 5-6 month, 8-9 day), the second as a time
        (characters 0-1 hour, 3-4 minute, 6-7 second) and the fourth as
        the dst value.  A value of 99999 is treated as "no data" and
        stored as None.

        In general, users will not need to worry about this.

        Belongs to class gme.ind.dst.dstRec

        Example
        -------
            myDstObj.parseWeb(webLine)

        written by AJ, 20130131

        """
        cols = line.split()
        date, clock = cols[0], cols[1]
        self.time = dt.datetime(int(date[0:4]), int(date[5:7]), int(date[8:10]),
                                int(clock[0:2]), int(clock[3:5]), int(clock[6:8]))

        value = float(cols[3])
        # Reset explicitly so that re-parsing on an existing object never
        # leaves a stale dst value behind when the new line has no data.
        self.dst = None if value == 99999.0 else value
        
def _rejectDstRequest(message):
    """Log *message* as an error and raise it as an AssertionError."""
    import logging

    logging.error(message)
    # AssertionError is kept so callers written against the original
    # assert-based checks keep working; raising explicitly means the check
    # is not stripped when Python runs with -O.
    raise AssertionError(message)


def _splitYearDigits(year):
    """Split a year into the (century, decade digit, year digit) strings the WDC form expects."""
    # divmod uses floor division, so the parts stay integers on both
    # Python 2 and Python 3 (plain "/" would send e.g. '20.0' on Python 3).
    century, remainder = divmod(year, 100)
    tens, units = divmod(remainder, 10)
    return str(century), str(tens), str(units)


def readDstWeb(sTime, eTime=None):
    """Read dst data from the WDC Kyoto website.

    Parameters
    ----------
    sTime : datetime
        the earliest time you want data for
    eTime : Optional[datetime]
        the latest time you want data for.  if this is None, eTime will
        be equal to sTime.  default = None

    Returns
    -------
    list or None
        a list of ``[datetime, float]`` pairs, one per data line whose
        timestamp lies in ``[sTime, eTime]``, or None when no line matched.

    Raises
    ------
    AssertionError
        if sTime or eTime is not a datetime, or if eTime is earlier than
        sTime.

    Notes
    -----
    You should not use this. Use the general function readDst instead.

    The form only takes year and month, so the site is queried for the
    months containing sTime and eTime and the returned lines are then
    filtered to the exact time range.

    The fourth column is stored as-is; a 99999 value in the response is
    NOT converted to None here.

    Example
    -------
        import datetime as dt
        dstList = gme.ind.readDstWeb(dt.datetime(2011,1,1,1,50),eTime=dt.datetime(2011,1,1,10,0))

    written by AJ, 20130131

    """
    import datetime as dt

    if not isinstance(sTime, dt.datetime):
        _rejectDstRequest('sTime must be a datetime object')
    if eTime is None:
        eTime = sTime
    if not isinstance(eTime, dt.datetime):
        _rejectDstRequest('eTime must be a datetime object')
    if eTime < sTime:
        _rejectDstRequest('eTime < sTime')

    # Imported only after validation so bad arguments are reported before
    # any third-party import or network access is attempted.
    import mechanize

    sCent, sTens, sYear = _splitYearDigits(sTime.year)
    sMonth = sTime.strftime("%m")
    eCent, eTens, eYear = _splitYearDigits(eTime.year)
    eMonth = eTime.strftime("%m")

    br = mechanize.Browser()
    br.set_handle_robots(False)   # no robots
    br.set_handle_refresh(False)  # can sometimes hang without this
    br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
    br.open('http://wdc.kugi.kyoto-u.ac.jp/dstae/index.html')

    br.form = list(br.forms())[0]

    # fill out the page fields
    br.form.find_control('SCent').value = [sCent]
    br.form.find_control('STens').value = [sTens]
    br.form.find_control('SYear').value = [sYear]
    br.form.find_control('SMonth').value = [sMonth]
    br.form.find_control('ECent').value = [eCent]
    br.form.find_control('ETens').value = [eTens]
    br.form.find_control('EYear').value = [eYear]
    br.form.find_control('EMonth').value = [eMonth]

    br.form.find_control('Output').value = ['DST']
    br.form.find_control('Out format').value = ['IAGA2002']
    br.form.find_control('Email').value = "vt.sd.sw@gmail.com"

    response = br.submit()

    dstList = []
    for raw in response.readlines():
        # On Python 3 the response yields bytes; compare and split as text.
        if not isinstance(raw, str):
            raw = raw.decode('ascii', 'replace')

        # skip blank lines and headers
        if not raw.strip() or raw[0] == ' ' or raw[0:4] == 'DATE':
            continue

        cols = raw.split()
        date, clock = cols[0], cols[1]
        linedate = dt.datetime(int(date[0:4]), int(date[5:7]), int(date[8:10]),
                               int(clock[0:2]), int(clock[3:5]), int(clock[6:8]))

        if sTime <= linedate <= eTime:
            dstList.append([linedate, float(cols[3])])
        # stop at the first line past the requested range
        if linedate > eTime:
            break

    # callers expect None rather than an empty list when nothing matched
    return dstList or None

In [None]:
# Download Dst one calendar year at a time (1957 through 2016) and cache
# each year's result as a pickle under ./dst_yearly/<year>.txt.
for year in range(1957, 2017):
    sTime = dt.datetime(year, 1, 1, 0)
    eTime = dt.datetime(year, 12, 31, 23)

    # readDstWeb returns None when no line matched; that None is pickled as-is.
    dst = readDstWeb(sTime=sTime, eTime=eTime)

    # Pickle streams are bytes: binary mode is required on Python 3 and
    # avoids newline translation on platforms that perform it.
    with open('./dst_yearly/' + str(year) + '.txt', 'wb') as fd:
        pickle.dump(dst, fd)

In [None]:
# Sanity check: load the cached 2016 pickle and report how many records it holds.
# Binary mode, because pickle streams are bytes (required on Python 3).
with open('./dst_yearly/2016.txt', 'rb') as fd:
    dst_year_2016 = pickle.load(fd)
print(len(dst_year_2016))

In [None]:
# Concatenate the cached yearly Dst lists for 2000 through 2016 into one
# list (aux) and store it as a single pickle.
aux = []
for year in range(2000, 2017):
    # Binary mode, because pickle streams are bytes (required on Python 3).
    with open('./dst_yearly/' + str(year) + '.txt', 'rb') as fd:
        data = pickle.load(fd)
    aux.extend(data)

with open('./dst_yearly/alldata_2000-2016.txt', 'wb') as fd:
    pickle.dump(aux, fd)

bz <br>
Because the dst data prior to 2000 is missing values, we keep only the values from the year 2000 onward.

In [None]:
# Read the OMNI listing (five whitespace-separated columns per line:
# year, day of year, hour and two bz values) and keep rows from 2000 onward.
bz_e_m = []
with open('./omni_bxe_bxm_hourly_averaged/omni2_25423.lst', 'r') as fd:
    for line in fd:
        # a blank line would otherwise break the 5-way unpack below
        if not line.strip():
            continue

        year, DOY, hour, bze, bzm = line.split()
        year = int(year)
        DOY = int(DOY)
        hour = int(hour)
        bze = float(bze)
        bzm = float(bzm)

        if year >= 2000:
            bz_e_m.append((year, DOY, hour, bze, bzm))

print(len(bz_e_m))

# Replace (year, DOY, hour) by the timestamp of the Dst record at the same
# position in aux, giving [timestamp, bze, bzm] rows.
# NOTE(review): rows are paired purely by index, so this assumes both
# series start at the same hour and have no gaps -- confirm.
# Built as a new list so a length mismatch (IndexError) cannot leave
# bz_e_m half-converted.
bz_e_m = [[aux[idx][0], row[-2], row[-1]] for idx, row in enumerate(bz_e_m)]

print(aux[-1])
print(bz_e_m[-1])

# Binary mode, because pickle streams are bytes (required on Python 3).
with open('./omni_bxe_bxm_hourly_averaged/alldata_2000-2016.txt', 'wb') as fd:
    pickle.dump(bz_e_m, fd)
print("saved")