# Here I'll organize and generate all the databases derived from 5m data.

The following code concatenates the 4 Dukascopy files into a single one (Dukascopy only allows downloading 3 years at a time for candles shorter than 1h).

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import itertools 
from itertools import product
from scipy.stats import linregress
from datetime import datetime
# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8*" and
# later releases dropped the old names, so use whichever name this install provides.
plt.style.use("seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn")
# Show every row and column when a DataFrame is displayed (no truncation).
pd.set_option("display.max_rows", None, "display.max_columns", None)

#### First step: read the 4 files without assigning an index

In [98]:
# Symbol used to build every file name in the merge cells; edit it to process another market.
market = "CADJPY"

In [99]:
# The part files must be in the "organizing_data" folder. Dukascopy only allows
# 3 years per download for candles shorter than 1h, hence one file per date range.
part_paths = (
    f"{market}_1_5m.csv",  # 2010-01-01 to 2012-12-31
    f"{market}_2_5m.csv",  # 2013-01-01 to 2015-12-31
    f"{market}_3_5m.csv",  # 2016-01-01 to 2018-12-31
    f"{market}_4_5m.csv",  # 2019-01-01 to 2020-12-31
)
# Read each part as-is (no index column designated), in the order listed above.
x, y, z, w = map(pd.read_csv, part_paths)

#### Second step: join them

In [100]:
# Stack the four parts row-wise in the order they were read; each part keeps its own row labels.
file_5m = pd.concat((x, y, z, w), axis=0)

In [101]:
# Eyeball the first three rows of the merged frame.
file_5m.head(3)

Unnamed: 0,Gmttime,Open,High,Low,Close,Volume
0,01.01.2010 00:00:00.000,88.4,88.405,88.38,88.395,927.7454
1,01.01.2010 00:05:00.000,88.399,88.408,88.38,88.383,338.4967
2,01.01.2010 00:10:00.000,88.382,88.392,88.373,88.373,494.1476


In [102]:
# Eyeball the last three rows of the merged frame.
file_5m.tail(3)

Unnamed: 0,Gmttime,Open,High,Low,Close,Volume
149691,31.12.2020 21:45:00.000,81.119,81.133,81.104,81.104,179.68
149692,31.12.2020 21:50:00.000,81.104,81.11,81.088,81.1,225.13
149693,31.12.2020 21:55:00.000,81.105,81.123,81.066,81.074,269.81


#### Third step: save the file, without the index and with 5 decimals

In [103]:
# Write the merged frame to disk: the row index is not written and floats use 5 decimals.
output_path = f"{market}_5m_BID_10_20_EET_GMT.csv"
file_5m.to_csv(output_path, index=False, float_format="%.5f")

#### Fourth step: read the file back to validate it

In [73]:
# Read the saved file back and index it by "Gmttime" as datetimes.
# The column is converted with a single vectorized pd.to_datetime call rather than
# read_csv's `date_parser` hook (deprecated since pandas 2.0), which called
# datetime.strptime once per row.
t = pd.read_csv(f"{market}_5m_BID_10_20_EET_GMT.csv")
t["Gmttime"] = pd.to_datetime(t["Gmttime"], format="%d.%m.%Y %H:%M:%S.%f")
t = t.set_index("Gmttime")

# RESAMPLES
#### Let's Open the 5m databases and resample them into other timeframes
In the following code we resample the 5m datasets into 15m or 30m datasets.

In [106]:

# Markets whose 5-minute files are resampled by this cell.
SYMBOLS = ["GBPNZD", "NZDCAD", "GBPCAD", "AUDCAD", "EURNZD", "AUDNZD",
           "GBPAUD", "NZDCHF", "EURCAD", "GBPCHF", "CADCHF", "AUDCHF",
           "EURCHF", "CHFJPY", "NZDJPY", "EURAUD", "GBPJPY", "CADJPY"]

TARGET_TIMEFRAME = "30m"   # label written into the output file names
RESAMPLE_RULE = "30min"    # pandas offset alias matching TARGET_TIMEFRAME (e.g. "15min" for "15m")
GMTTIME_FORMAT = "%d.%m.%Y %H:%M:%S.%f"  # layout of the "Gmttime" strings in the CSV files

# Input (5m) and output (resampled) file names, one pair per symbol.
mkts = [f"{symbol}_5m_BID_10_20_EET_GMT.csv" for symbol in SYMBOLS]
mkts_save = [f"{symbol}_{TARGET_TIMEFRAME}_BID_10_20_EET_GMT.csv" for symbol in SYMBOLS]


def resample_5m_file(source_path, target_path, rule=RESAMPLE_RULE):
    """Resample one 5-minute CSV into wider bars and write the result to CSV.

    For every `rule`-wide bin (closed and labelled on its left edge) the output
    keeps the first "Gmttime" string, the first "Open" and the last "Close"
    found in that bin. Bins with a missing value in any of those columns
    (e.g. bins containing no rows at all) are dropped.

    Parameters
    ----------
    source_path : str
        CSV file with at least the columns "Gmttime", "Open" and "Close".
    target_path : str
        Destination CSV. It is written without an index column and with
        floats formatted to 5 decimals.
    rule : str, optional
        pandas offset alias of the target bar width.

    Returns
    -------
    pandas.DataFrame
        The resampled frame that was written, with columns
        "Gmttime", "Open" and "Close".
    """
    # One read is enough: the raw "Gmttime" strings stay as a column (so the output
    # keeps the source text format) and a parsed copy becomes the index to resample on.
    bars = pd.read_csv(source_path, usecols=["Gmttime", "Open", "Close"])
    stamps = pd.to_datetime(bars["Gmttime"], format=GMTTIME_FORMAT)
    bars.index = pd.DatetimeIndex(stamps.to_numpy())

    # Aggregate all three columns in a single pass so they stay aligned row by row;
    # one dropna() on the combined frame then removes incomplete bins consistently.
    resampled = (
        bars.resample(rule, closed="left", label="left")
        .agg({"Gmttime": "first", "Open": "first", "Close": "last"})
        .dropna()
    )

    resampled.to_csv(target_path, float_format="%.5f", index=False)
    return resampled


for source_path, target_path in zip(mkts, mkts_save):
    resample_5m_file(source_path, target_path)
                                                                                                            

In [3]:
def _read_bars(csv_path):
    """Load a bars CSV and return it indexed by its "Gmttime" column parsed as datetimes."""
    bars = pd.read_csv(csv_path)
    # One vectorized conversion instead of read_csv's `date_parser` hook, which is
    # deprecated since pandas 2.0 and called strptime once per row.
    bars["Gmttime"] = pd.to_datetime(bars["Gmttime"], format="%d.%m.%Y %H:%M:%S.%f")
    return bars.set_index("Gmttime")


# No longer used in this cell; kept defined in case another cell still passes it as `date_parser`.
custom_date_parser = lambda x: datetime.strptime(x, "%d.%m.%Y %H:%M:%S.%f")

# Resampled 15m file and the 15m file named as the 2019-2020 Dukascopy download, for comparison.
resampled_15m = _read_bars("USDCAD_15m_BID_10_20_EET_GMT.csv")
Original_15m_dukascopy = _read_bars("USDCAD_15m_2019-2020______.csv")

In [4]:
# Sum of all Close prices in the Dukascopy 15m file (reference value).
Original_15m_dukascopy.Close.sum()

66567.56491

In [5]:
# Same sum on the resampled file, restricted to the 2019-2020 date range.
resampled_15m.loc["2019-01-01":"2020-12-31", "Close"].sum()

66567.56491

In [6]:
# Mean of all Open prices in the Dukascopy 15m file (reference value).
Original_15m_dukascopy.Open.mean()

1.3340493184232107

In [7]:
# Same mean on the resampled file, restricted to the 2019-2020 date range.
resampled_15m.loc["2019-01-01":"2020-12-31", "Open"].mean()

1.3340493184232107