In [1]:
import pandas as pd
import sqlite3
import hashlib
import numpy as np
import datetime

In [2]:
# Notebook-wide pandas configuration.
# Never truncate rows or columns when a frame is displayed.
pd.options.display.max_rows = None
pd.options.display.max_columns = None
# Switch off the chained-assignment check (no warning, no error).
pd.set_option("mode.chained_assignment", None)

In [25]:
class DataVault:
    def __init__(self, db):
        self.db_name = db
        self.RS = 1 # changes later
        
        # list of our hubs
        self.hub_list = ["Player", "League", "Team"]       
        self.sat_dict = {"HSAT" : ["HSAT_Team", "HSAT_Team_Attributes", "HSAT_League","HSAT_Player","HSAT_Player_Attributes"],
                         "LSAT" : ["LSAT_Match_Attributes", "LSAT_Match_Bookmakers"],
                         "BSAT" : ['BSAT_Goal', 'BSAT_Shoton', 'BSAT_Shotoff', 'BSAT_Foulcommit', 'BSAT_Card', 
                                   'BSAT_Cross', 'BSAT_Corner', 'BSAT_Possession']}
        
    def connect_to_db(self):
        self.conn = sqlite3.connect(self.db_name)
        self.cursor = self.conn.cursor()
    
    def disconnect(self):
        self.conn.close()
        
    def get_table_names(self) -> list:
        """returns a list of all the names of the tables of a SQL database"""
        query = (
        'SELECT name FROM sqlite_master '
        'WHERE type IN ("table", "view") '
        'AND name NOT LIKE "sqlite_%"'
        ';'
        )
        table = pd.read_sql(query, con=self.conn)
        self.all_tables = table['name'].to_list()
        
        self.hub_tables, self.table_names = [], []
        for table in self.all_tables:
            
            if table[:4] == "HUB_":# or table in self.links:
                self.hub_tables.append(table)
            else:
                self.table_names.append(table)
        return self.hub_tables, self.table_names
                

    def get_column_names(self, tables) -> list:
        """returns a list of the column names of a table from a SQL database
        if index was created properly"""
        
        self.columns = {}
        for table in tables:
            df_column = pd.read_sql(f'PRAGMA table_info({table});', con=self.conn)
            self.columns[table] = df_column['name'].to_list()
            
        return self.columns
    
    def get_hub_primary_key(self, table) -> str:
        """returns the name of the primary keys of a table from a SQL database"""
        
        hub_pk = pd.read_sql(f"pragma table_info({table});", con = self.conn)["name"].values[1]
        return hub_pk
    
    def get_sat_primary_key(self, table) -> str:
        """returns the name of the primary keys of a table from a SQL database"""
        
        hub_pk = pd.read_sql(f"pragma table_info({table});", con = self.conn)["name"].values[2]
        return hub_pk
    
    def get_primary_keys(self) -> str:
        """returns the name of the primary keys of a table from a SQL database if index was created properly"""
        
        # get all columns
        cols = self.get_column_names(self.table_names)
        # dict for the keys
        self.p_keys = {}
        for table in self.table_names:
            try:
                info = pd.read_sql(f"PRAGMA table_info({table})", con = self.conn);
                ind = info.index[info.pk == 1].item()
                self.p_keys[table] = cols[table][ind]
            except:
                print(f"Table 'index_info' not found, can't get primary key of {table}")
        return self.p_keys
        
    def hash_row(self, column_values):
        """Return the MD5 hex digest identifying a sequence of values.

        Each value is converted with ``str``, stripped, upper-cased and has
        its spaces replaced by ``_`` (``None`` becomes the empty string); the
        pieces are then joined with ``_`` and the UTF-8 bytes are hashed.
        """
        pieces = []
        for value in column_values:
            text = "" if value is None else str(value).strip().upper()
            pieces.append(text.replace(" ", "_"))

        hasher = hashlib.new("MD5")
        hasher.update("_".join(pieces).encode("utf-8"))
        return hasher.hexdigest()
    
    def get_foreign_keyss(self):
        """returns the name(s) of the foreign key of a table from a SQL database if index was created properly"""
        
        self.f_keys = {}
        
        p_key_col_dict = {k:v for k, v in zip(self.p_keys.values(), self.columns.values())}
        
        for p_key, col_names, table in zip(p_key_col_dict.keys(), p_key_col_dict.values(), self.table_names):
            f_list = []
            for col_name in col_names:
                if col_name in p_key_col_dict.keys() and col_name!= p_key:
                    f_list.append(col_name)
            self.f_keys[table] = f_list
        return self.f_keys
    
    def create_hubs(self):
        """function to create a hub dataframe"""
        
        print("creating hubs...")
        
        # necesary functions
        self.connect_to_db()
        self.get_table_names()
        self.get_primary_keys()
        
        self.created_hubs = []
        for table in self.hub_list:
            try:
                # create hub table name
                hub_table = "HUB_" + table
                
                # append to list
                self.created_hubs.append(hub_table)
                print("Creating " + hub_table)
                
                # drop if exists
                self.cursor.execute(f"""DROP TABLE IF EXISTS {hub_table};""")
                # create table
                self.cursor.execute(f"""CREATE TABLE IF NOT EXISTS {hub_table}(
                HK VARYINGN CHARACTER(64) NOT NULL PRIMARY KEY,
                BK INTEGER NOT NULL,
                RS INTEGER NOT NULL,
                LDTS DATETIME NOT NULL
                
                );""")
                
                print(f"Successfully created {hub_table}")
                
            except:
                
                print(f"Could not create hub table {table}")
        
    def fill_hubs(self):
        
        print("filling hubs...")
        
        # necesary functions
        self.connect_to_db()
        self.get_table_names()
        self.get_primary_keys()
        
        for table, hub_table in zip(self.hub_list, self.created_hubs):
            try:
                
                print("filling " + hub_table)
                
                # get the primary key of that table
                p_key = self.get_hub_primary_key(table)
                print("pkey: " + p_key)
                self.cursor.execute(f"""INSERT INTO {hub_table}(HK, BK,LDTS,RS) select {p_key}, {p_key},
                date("now"), 1 from {table} ORDER BY {p_key} ASC;""")

                HUB = pd.read_sql(f"SELECT * FROM {hub_table}", con = self.conn)
                HUB["HK"] = [self.hash_row(HUB.loc[i]) for i in range(len(HUB))]
                
                HUB.to_sql(f'{hub_table}', self.conn, if_exists='replace',  index=False,
                       dtype={"HK": "TEXT NOT NULL PRIMARY KEY", "BK":"INTEGER", "LDTS": "DATETIME", "RS": "INTEGER"})
                
                print("Successfully filled " + hub_table)
            except:
                
                print("Could not fill " + hub_table)
             
    def create_links(self):
        """Function to create and fill links"""
        
        # necesary functions
        self.connect_to_db()
        self.get_table_names()
        self.get_primary_keys()
        
        self.created_links = []
            
        # create name
        link_table_1 = "LINK_Match"

        print("creating "+ link_table_1)
        # drop if exists
        self.cursor.execute(f"""DROP TABLE IF EXISTS {link_table_1};""")

        self.cursor.execute(f"""CREATE TABLE {link_table_1}(
            HK VARYINGN CHARACTER(64) NOT NULL PRIMARY KEY,
            HK_away_team VARYINGN CHARACTER(64) NOT NULL, 
            HK_home_team VARYINGN CHARACTER(64) NOT NULL, 
            HK_league VARYINGN CHARACTER(64) NOT NULL, 
            LDTS DATETIME NOT NULL,
            RS INTEGER NOT NULL,
            FOREIGN KEY(HK_away_team) REFERENCES HUB_Team(HK),
            FOREIGN KEY(HK_home_team) REFERENCES HUB_Team(HK),
            FOREIGN KEY(HK_league) REFERENCES HUB_league(HK)
            );""")
        print("created " + link_table_1)
        
        self.created_links.append(link_table_1)
        
        link_table_2 = "LINK_Match_Player"

        print("creating "+ link_table_2)
        # drop if exists
        self.cursor.execute(f"""DROP TABLE IF EXISTS {link_table_2};""")

        self.cursor.execute(f"""CREATE TABLE {link_table_2}(
            HK VARYINGN CHARACTER(64) NOT NULL PRIMARY KEY,
            HK_Link_Match VARYINGN CHARACTER(64) NOT NULL,
            HK_Player VARYINGN CHARACTER(64) NOT NULL,
            LDTS DATETIME NOT NULL,
            RS INTEGER NOT NULL,
            FOREIGN KEY(HK_Link_Match) REFERENCES LINK_Match(HK),
            FOREIGN KEY(HK_Player) REFERENCES HUB_Player(HK)
            );""")
        print("created " + link_table_2)
        self.created_links.append(link_table_2)
        
    def fill_links(self):
        
        
        print("filling links...")
        
        # necesary functions
        self.connect_to_db()
        self.get_table_names()
        self.get_primary_keys()
        #---------------------------------------------------------------------------------------------#
        # LINK 1
        # create name
        link_table_1 = "LINK_Match"
        print("filling " + link_table_1)
        # match_df
        match_df = pd.read_sql("select * from Match", con = self.conn)
        
        # get the hash keys
        teams_HK = pd.read_sql("select HK, BK from HUB_Team", con = self.conn)
        league_HK = pd.read_sql("select HK, BK from HUB_League", con = self.conn)
        
        hk_home_list, hk_away_list, hk_league_list = [], [], []
        for i in range(len(match_df)):
            # ids
            home_id = match_df.iloc[i]["home_team_api_id"]
            away_id = match_df.iloc[i]["away_team_api_id"]
            league_id = match_df.iloc[i]["league_id"]
            
            #hashkeys
            hk_home = teams_HK[teams_HK.BK == home_id]["HK"].item()
            hk_away = teams_HK[teams_HK.BK == away_id]["HK"].item()
            hk_league = league_HK[league_HK.BK == league_id]["HK"].item()
            
            # append to list
            hk_home_list.append(hk_home)
            hk_away_list.append(hk_away)
            hk_league_list.append(hk_league)
            
        link_hks = []
        # create the link_hash
        for home_hk, away_hk, league_hk, date in zip(hk_home_list, hk_away_list, hk_league_list, match_df.date.tolist()):
            var_list = [home_hk, away_hk, league_hk, date, self.RS]
            
            # calculate hash
            link_hk = self.hash_row(var_list)
            link_hks.append(link_hk)
        
        # insert into the link table
        match_link_df = pd.DataFrame({
            "HK" : link_hks,
            "HK_away_team" : hk_away_list,
            "HK_home_team" : hk_home_list,
            "HK_league" : hk_league_list,
            "LDTS" : match_df.date,
            "RS" : self.RS
        })
        
        match_link_df.to_sql(name = link_table_1,con=self.conn, if_exists='append', index = False,
                              dtype={"HK": "VARYINGN CHARACTER(64) NOT NULL PRIMARY KEY",
                                     "HK_away_team" : "VARYINGN CHARACTER(64) NOT NULL",
                                     "HK_home_team" : "VARYINGN CHARACTER(64) NOT NULL",
                                     "HK_league" : "VARYINGN CHARACTER(64) NOT NULL",
                                     "LDTS": "DATETIME NOT NULL",
                                     "RS": "INT NOT NULL"
                                     })
        print("succsessfully filled " + link_table_1)
        # LINK 2 --------------------------------------------------------------------------------------------#
        # create name
        link_table_2 = "LINK_Match_Player"
        print("filling " + link_table_2)
            
        hub_player = pd.read_sql("select * from HUB_Player", con = self.conn)
        link_match = pd.read_sql("select * from LINK_Match", con = self.conn)
        match_df = pd.read_sql("select * from Match", con = self.conn)

        player_columns = []
        for i in range(1, 12):
            player_columns.append("home_player_" + str(i))
            player_columns.append("away_player_" + str(i))

        player_columns.append("match_api_id")

        match_df = match_df[player_columns]

        melted_player_df = match_df.melt(id_vars = "match_api_id", var_name = "player", value_name="player_api_id")
        #melted_player_df = melted_player_df.drop_duplicates()
        
        player_hash_keys, match_hash_keys = [], [] 
        for index, match_id in enumerate(melted_player_df.match_api_id.unique()):
            # create a dataframe with that match
            match_df = melted_player_df[melted_player_df.match_api_id == match_id]
            #print(len(match_df))

            # get the match hash key from the link_match table
            match_hk = link_match.iloc[index]["HK"]
            for player_id in match_df.player_api_id:
                try:
                    player_hk = hub_player[hub_player.BK == player_id]["HK"].item()
                    player_hash_keys.append(player_hk)
                    match_hash_keys.append(match_hk)
                except:
                    pass

        # create the link_match_player_HK
        link_match_hash_keys = []
        for player_hk, match_hk in zip(player_hash_keys, match_hash_keys):
            match_link_HK = self.hash_row([player_hk, match_hk, self.RS])
            link_match_hash_keys.append(match_link_HK)
            
        LDTS = datetime.datetime.now()
        
        LINK_match_player_df = pd.DataFrame({
            "HK" : link_match_hash_keys,
            "HK_Link_Match" : match_hash_keys,
            "HK_Player" : player_hash_keys,
            "LDTS" : LDTS,
            "RS" : self.RS
        })
        
        LINK_match_player_df = LINK_match_player_df.drop_duplicates()
        
        # into the database
        LINK_match_player_df.to_sql(name = link_table_2,con=self.conn, if_exists='append', index = False,
                              dtype={"HK": "VARYINGN CHARACTER(64) NOT NULL PRIMARY KEY",
                                     "HK_Link_Match" : "VARYINGN CHARACTER(64) NOT NULL",
                                     "HK_Player" : "VARYINGN CHARACTER(64) NOT NULL",
                                     "LDTS": "DATETIME NOT NULL",
                                     "RS": "INT NOT NULL"
                                     })
        
        print("succsessfully filled " + link_table_2)
                         
                         
    def create_and_fill_hub_sats(self):
            
        # necesary functions
        self.connect_to_db()
        self.get_table_names()
        self.get_primary_keys()
                         
        # HUB_Satelites-----------------------------------------------------------------------------------------------#
        hub_sat_list = self.sat_dict["HSAT"]
        
        # loop over sat_list
        for sat in hub_sat_list:
            print("creating " + sat)
            # get the corresponding name of the sat
            sat_df_split = sat.split("_")
            sat_df_name = "_".join(sat_df_split[1:])
            hub_name = sat_df_split[1]
            print(sat_df_name)
            
            # read original table and hub_table
            sat_df = pd.read_sql(f"SELECT * from {sat_df_name}", con = self.conn) 

            # get the colums from the original table that I need for the satelite
            info = pd.read_sql(f"PRAGMA table_info({sat_df_name})", con = self.conn)
            col_names = info["name"]
            col_type = info["type"]
            rel_cols = {}
            for col_name, col_type in zip(col_names, col_type):
                if "id" not in col_name:
                    rel_cols[col_name] = " " + col_type + "," # add comma
            # dict to string so I can add it to my create table
            var_string = ""
            for c, t in rel_cols.items():
                var_string += c + t
                
            # create sat_table
            # drop if exists
            self.cursor.execute(f"""DROP TABLE IF EXISTS HSAT_{sat_df_name};""")
            
            # create the LDTS now
            LDTS = datetime.datetime.now()
            self.cursor.execute(f"""CREATE TABLE HSAT_{sat_df_name}(
                HK VARYINGN CHARACTER(64) NOT NULL PRIMARY KEY,
                HK_{hub_name} VARYINGN CHARACTER(64) NOT NULL,
                LDTS DATETIME NOT NULL,
                EDTS DATETIME NOT NULL,
                RS INTEGER NOT NULL,
                HD VARYINGN CHARACTER(64) NOT NULL,
                {var_string}
                FOREIGN KEY(HK_{hub_name}) REFERENCES HUB_{sat_df_name}(HK)
                
            );""")
            
            print("Successfully created " + sat)
            
            # Filling HSATS--------------------------------------------------------------------#
            
            print("filling " + sat)
            
            # the sat_df with the relevant cols
            
            # if "attributes is in the table"
            if "Attributes" in sat_df_name:
                # corresponding hub df
                corr_hub_df = pd.read_sql(f"select * from HUB_{hub_name}", con = self.conn)
                # attributes df
                attr_df = pd.read_sql(f"select * from {sat_df_name}", con = self.conn)
                p_k = self.get_sat_primary_key(sat_df_name)
                
                # relevant cols only
                corr_hub_df = corr_hub_df[["HK", "BK"]]
                attr_df = attr_df[[p_k, "date"]]
                sat_fk = attr_df.merge(corr_hub_df, how = "inner" , right_on= "BK",left_on=p_k)["HK"].values
                                                                                                    
                sat_fk_df = attr_df.merge(corr_hub_df, how = "inner" , right_on= "BK",left_on=p_k)[["HK", "date"]]
            else:
                # read in corresponding hub table
                hub_df = pd.read_sql(f"SELECT * from HUB_{hub_name}", con = self.conn) 
                # hash key from hub
                sat_fk = hub_df["HK"].values
                sat_fk_df = hub_df["HK"]
                
            # make the sat table ready t be hashed
            sat_attr_df = sat_df[rel_cols.keys()]
            # hash the rows
            sat_HD = [self.hash_row(sat_attr_df.iloc[i]) for i in range(len(sat_attr_df))]

            # compute the Sat_HK
            sat_HK = [self.hash_row([sat_fk_df.iloc[i], LDTS, self.RS]) for i in range(len(sat_fk_df))]
            
            EDTS = datetime.datetime.now()
            
            #return pd.DataFrame(sat_HK, sat_fk, LDTS, EDTS, 
            # create the sat df
            print(len(sat_HK), len(sat_attr_df))
            sat_attr_df["HK"] = sat_HK
            sat_attr_df["HK_" + hub_name] = sat_fk
            sat_attr_df["LDTS"] = LDTS
            sat_attr_df["EDTS"] = EDTS
            sat_attr_df["RS"] = self.RS
            sat_attr_df["HD"] = sat_HD

            # into the database
            sat_attr_df.to_sql(name = sat,con=self.conn, if_exists='append', index = False)
        
            print("successfully filled " + sat)
            
    def create_and_fill_link_sats(self):
            
        # necesary functions
        self.connect_to_db()
        self.get_table_names()
        self.get_primary_keys()

        # CREATING LSATS ----------------------------------------------------------#
        link_table = "Match"
        link_name = "LINK_Match"
        link_sat_1 = "LSAT_Match_Statistics"
        link_sat_2 = "LSAT_Match_Bets"
        
        info = pd.read_sql(f"PRAGMA table_info ({link_table})", con = self.conn)
        col_names = info["name"]
        col_type = info["type"]

        match_attr_cols = col_names[:85]
        match_attr_types = col_type[:85]

        betting_cols = col_names[85:]
        betting_types = col_type[:85]


        rel_attr_cols = {}
        for col_name, col_type in zip(match_attr_cols, match_attr_types):
            if "id" not in col_name and len(col_name):
                rel_attr_cols[col_name] = " " + col_type + "," # add comma
        # dict to string so I can add it to my create table
        var_string_attr = ""
        for c, t in rel_attr_cols.items():
            var_string_attr += c + t


         # create link_table 1
        # drop if exists
        self.cursor.execute(f"""DROP TABLE IF EXISTS {link_sat_1};""")

        # create the LDTS now
        LDTS = datetime.datetime.now()
        self.cursor.execute(f"""CREATE TABLE {link_sat_1}(
            HK VARYINGN CHARACTER(64) NOT NULL PRIMARY KEY,
            HK_{link_name} VARYINGN CHARACTER(64) NOT NULL,
            LDTS DATETIME NOT NULL,
            RS INTEGER NOT NULL,
            HD VARYINGN CHARACTER(64) NOT NULL,
            {var_string_attr}
            FOREIGN KEY(HK_{link_name}) REFERENCES {link_name}(HK)

        );""")
            
        
        rel_bet_cols = {}
        for col_name, col_type in zip(betting_cols, betting_types):
            if "id" not in col_name and len(col_name):
                rel_bet_cols[col_name] = " " + col_type + "," # add comma
        # dict to string so I can add it to my create table
        var_string_bets = ""
        for c, t in rel_bet_cols.items():
            var_string_bets += c + t
                        
        # drop if exists
        self.cursor.execute(f"""DROP TABLE IF EXISTS {link_sat_2};""")
        
        # create the LDTS now
        LDTS = datetime.datetime.now()
        self.cursor.execute(f"""CREATE TABLE {link_sat_2}(
            HK VARYINGN CHARACTER(64) NOT NULL PRIMARY KEY,
            HK_{link_name} VARYINGN CHARACTER(64) NOT NULL,
            LDTS DATETIME NOT NULL,
            RS INTEGER NOT NULL,
            HD VARYINGN CHARACTER(64) NOT NULL,
            {var_string_bets}
            FOREIGN KEY(HK_{link_name}) REFERENCES {link_name}(HK)

        );""")
        
        # FILLING LSATS ------------------------------------------------------------------------------#
        
        # read in corresponding link table
        match_df = pd.read_sql(f"SELECT * from {link_table}", con = self.conn) 
        link_df = pd.read_sql(f"SELECT * from {link_name}", con = self.conn) 
        LDTS = datetime.datetime.now()
        
        # FILL MATCH STATISTICS LSAT
        rel_attr_df = match_df[rel_attr_cols.keys()]
        
        # hash key from hub
        sat_fk = link_df["HK"]
                
        # hash the rows
        sat_HD = [self.hash_row(rel_attr_df.iloc[i]) for i in range(len(rel_attr_df))]

        # compute the Sat_HK
        sat_HK = [self.hash_row([sat_fk.iloc[i], LDTS, self.RS]) for i in range(len(sat_fk))]

        # create the sat df
        rel_attr_df["HK"] = sat_HK
        rel_attr_df["HK_" + link_name] = sat_fk
        rel_attr_df["LDTS"] = LDTS
        rel_attr_df["RS"] = self.RS
        rel_attr_df["HD"] = sat_HD

        # into the database
        rel_attr_df.to_sql(name = link_sat_1,con=self.conn, if_exists='append', index = False)
        
        # FILL MATCH BET LSAT
        rel_attr_df = match_df[rel_bet_cols.keys()]
        
        # hash key from hub
        sat_fk = link_df["HK"]
                
        # hash the rows
        sat_HD = [self.hash_row(rel_attr_df.iloc[i]) for i in range(len(rel_attr_df))]

        # compute the Sat_HK
        sat_HK = [self.hash_row([sat_fk.iloc[i], LDTS, self.RS]) for i in range(len(sat_fk))]

        # create the sat df
        rel_attr_df["HK"] = sat_HK
        rel_attr_df["HK_" + link_name] = sat_fk
        rel_attr_df["LDTS"] = LDTS
        rel_attr_df["RS"] = self.RS
        rel_attr_df["HD"] = sat_HD

        # into the database
        rel_attr_df.to_sql(name = link_sat_2,con=self.conn, if_exists='append', index = False)
        
    def create_and_fill_business_sats(self):
        # necesary functions
        self.connect_to_db()
        self.get_table_names()
        self.get_primary_keys()
        
        # IMPORT DATA ------------------------------------------------------------------#
        # import the business satelites dfs
        card_df = pd.read_csv("../Business_Satelite_Data/card_df.csv", index_col = 0, low_memory=False)
        corner_df = pd.read_csv("../Business_Satelite_Data/corner_df.csv", index_col = 0, low_memory=False)
        cross_df = pd.read_csv("../Business_Satelite_Data/cross_df.csv", index_col = 0, low_memory=False)
        foulcommit_df = pd.read_csv("../Business_Satelite_Data/foulcommit_df.csv", index_col = 0, low_memory=False)
        goal_df = pd.read_csv("../Business_Satelite_Data/goal_df.csv", index_col = 0, low_memory=False)
        possession_df = pd.read_csv("../Business_Satelite_Data/possession_df.csv", index_col = 0, low_memory=False)
        shotoff_df = pd.read_csv("../Business_Satelite_Data/shotoff_df.csv", index_col = 0, low_memory=False)
        shoton_df = pd.read_csv("../Business_Satelite_Data/shoton_df.csv", index_col = 0, low_memory=False)

        match_df = pd.read_sql("select * from Match", con = self.conn)
        link_match_df = pd.read_sql("select * from LINK_Match", con = self.conn)

        match_ids = match_df["match_api_id"].values
        match_hks = link_match_df["HK"].values

        match_id_hk = pd.DataFrame({"match_api_id" : match_ids, 
                           "HK_Link_Match" : match_hks})

        # MERGING ------------------------------------------------------------------------------------#
        
        df_dict = {}
        # loop over the business satelite data
        business_sat_data = {"card_df" : card_df,
                             "cross_df" : cross_df,
                             "corner_df" : corner_df,
                             "foulcommit_df" : foulcommit_df,
                             "goal_df" : goal_df,
                             "possession_df" : possession_df,
                             "shotoff_df" : shotoff_df,
                             "shoton_df" : shoton_df}

        for name, df in business_sat_data.items():
            merged_df = df.merge(match_id_hk, how = "inner" , right_on= "match_api_id",left_on="match_api_id")
            df_dict[name] = merged_df

        #hash keys -----------------------------------------------------
        for name, df in df_dict.items():
            hk_df = df[[name.split("_")[0] + "_id", "HK_Link_Match"]]
            hk_df["RS"] = 1
            sat_hk = [self.hash_row(hk_df.iloc[i]) for i in range(len(hk_df))]
            sat_attr_df = df.drop(columns = ["HK_Link_Match", name.split("_")[0] + "_id"])
            sat_HD = [self.hash_row(sat_attr_df.iloc[i]) for i in range(len(sat_attr_df))]
            
            # add new cols
            df.insert(0, "HK", sat_hk)
            df.insert(1, "LDTS", datetime.datetime.now())
            df.insert(2, "RS", self.RS)
            df.insert(3, "HD", sat_HD)
            # to sql
            df.to_sql(name = "B_LSAT_" + name.split("_")[0] ,con=self.conn, if_exists='replace', index = False,
                              dtype={"HK": "VARYINGN CHARACTER(64) NOT NULL PRIMARY KEY",
                                     "HK_Link_Match" : "VARYINGN CHARACTER(64) NOT NULL",
                                     "LDTS": "DATETIME NOT NULL",
                                     "HD" : "VARYINGN CHARACTER(64) NOT NULL",
                                     "RS": "INT NOT NULL"
                                     })


In [26]:
# Build the complete vault: hubs first, then links, then the three kinds of
# satellites. The order matters because each step reads tables written by
# the steps before it.
path = "../../Databases/test_2.sqlite"

data_vault = DataVault(path)
try:
    data_vault.create_hubs()
    data_vault.fill_hubs()
    data_vault.create_links()
    data_vault.fill_links()
    data_vault.create_and_fill_hub_sats()
    data_vault.create_and_fill_link_sats()
    data_vault.create_and_fill_business_sats()
finally:
    # release the database handle even when a step fails
    if getattr(data_vault, "conn", None) is not None:
        data_vault.disconnect()

creating hubs...
Creating HUB_Player
Successfully created HUB_Player
Creating HUB_League
Successfully created HUB_League
Creating HUB_Team
Successfully created HUB_Team
filling hubs...
filling HUB_Player
pkey: player_api_id
Successfully filled HUB_Player
filling HUB_League
pkey: country_id
Successfully filled HUB_League
filling HUB_Team
pkey: team_api_id
Successfully filled HUB_Team
creating LINK_Match
created LINK_Match
creating LINK_Match_Player
created LINK_Match_Player
filling links...
filling LINK_Match
succsessfully filled LINK_Match
filling LINK_Match_Player
succsessfully filled LINK_Match_Player
creating HSAT_Team
Team
Successfully created HSAT_Team
filling HSAT_Team
299 299
successfully filled HSAT_Team
creating HSAT_Team_Attributes
Team_Attributes
Successfully created HSAT_Team_Attributes
filling HSAT_Team_Attributes
1458 1458
successfully filled HSAT_Team_Attributes
creating HSAT_League
League
Successfully created HSAT_League
filling HSAT_League
11 11
successfully filled HSA