In [20]:
import config.config as cf
import sqlite3
import pandas as pd
import numpy as np
import os
import sys

# Lift pandas' display limits so DataFrames are shown with all columns and all rows
# (None means "no limit" for both options).
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [21]:
# Location of the CSV export that the read_csv cell below loads into df_tv.
file_path = '../data/tv/tv_live_1.csv'

In [22]:
# Replacement values applied to df_tv['Brand'] after it has been stripped and upper-cased;
# brands not listed here are kept as they are.
names = {
    'NEONET.PL': 'NEONET AGD RTV',
    'EURO.COM.PL': 'EURO RTV AGD',
}

# Replacement values applied to df_tv['Dayparts']; labels not listed here are kept as they are.
# NOTE(review): a later cell rebinds `dayparts` to the array of unique daypart values,
# so this cell has to be re-run before the loading cell is executed again.
dayparts = {
    'Off stacji': 'off',
    'Prime stacji': 'prime',
}

In [None]:
def get_index_val(table_name: str, cur: sqlite3.Cursor) -> int:
    """
    Return the next free id for the selected table.

    Reads ``MAX(id)`` from the table and returns it increased by one.
    When the table is empty, returns 1.

    :param table_name: Name of the table to query. The name is interpolated directly
    into the SQL text (identifiers cannot be bound as parameters), so it must come
    from trusted configuration.
    :param cur: Cursor object created for the connection to the DB
    :raise sqlite3.ProgrammingError: If there are any errors raised by the DB-API
    :raise sqlite3.OperationalError: If any exceptions on the DB side are raised, e.g. DB being locked
    :return: Number representing the max id value of the selected table increased by 1
    :rtype: int
    """

    cur.execute(f'SELECT MAX(id) FROM {table_name};')
    max_id = cur.fetchone()[0]
    # MAX() over an empty table yields NULL, which sqlite3 returns as None.
    if max_id is None:
        return 1
    return max_id + 1
    
def iter_over_inputs(data_set:list[dict[list,str,str]], con: sqlite3.Connection, 
                        cur: sqlite3.Cursor)-> None:
    """
    Main loop for iteration over one column tables.

    :param data_set: List containing dicts with data, table name and field/column name.
    List contains strings or integers representing the data to be added into selected tables.
    :param table_name: String representing name of the table into which data is going to be added
    :param con: Is a connection object, pointing to a DB
    :param cur: Is a cursor object created for con object
    :param avoid_adding: List of tablet which doesn't need to be updated
    :raise KeyError: If key name does not match the pattern
    :raise sqlite3.ProgrammingError: If there are any error raised by the DB-API
    :daise sqlite3.OperationalError: If any eceptions on the DB side are raised, i.g. DB being locked
    :return: None
    """
    
    for elem in data_set:
        data = elem['data']
        table = elem['table']
        field = elem['field']
        new_data, data = check_for_data_1_field(data, table, field, cur, avoid_adding)
        if new_data:
            add_1_field(data, table, field, con, cur)

def add_1_field(data: list, table_name: str, field_name: str,
                con: sqlite3.Connection, cur: sqlite3.Cursor) -> None:
    """
    Insert every value from ``data`` as a new row of a single column table.

    Each value is converted to ``str`` and bound as a named parameter; the table and
    column names are placed directly into the statement text. All rows are committed
    together once the last insert has been executed.

    :param data: List of strings or integers representing table contents
    :param table_name: String representing name of the table into which data is going to be added
    :param field_name: String representing name of the field/column that receives the values
    :param con: Connection object, pointing to a DB
    :param cur: Cursor object created for the con object
    :raise sqlite3.ProgrammingError: If there are any errors raised by the DB-API
    :raise sqlite3.OperationalError: If any exceptions on the DB side are raised, e.g. DB being locked
    :return: None
    """

    insert_stmt = f'INSERT INTO {table_name} ({field_name}) VALUES(:name);'
    for value in data:
        cur.execute(insert_stmt, {'name': str(value)})
    # Single commit after the whole batch.
    con.commit()

def check_for_data_1_field(data_: list, table_name: str, field_name: str,
                           cur: sqlite3.Cursor, avoid_adding: list[str]) -> tuple[bool, list[str]]:
    """
    Skeleton function for checking which entries are still missing from a one column table.

    Reads the values already stored in the table and compares them with ``data_``.
    If the table is empty, returns immediately with ``data_`` unchanged.
    If the table has data and is listed in ``avoid_adding``, nothing is reported as new.
    Otherwise only the entries not yet present in the table are returned.

    For the 'kody_rek' table the entries have the form '<code>@|@<description>';
    the stored ``kod_rek`` values are compared with the integer code in front of
    the separator, and new entries are returned as '<int code>@|@<description>'.

    :param data_: Data to be checked and added (list, array or Series). Data is of str or int types.
    :param table_name: String representing name of the table into which data is going to be added
    :param field_name: String representing name of the field/column name
    :param cur: Cursor object created for the con object
    :param avoid_adding: List of tables which don't need to be updated
    :raise KeyError: If a 'kody_rek' entry set contains no '@|@' separator at all
    :raise sqlite3.ProgrammingError: If there is an error raised by the DB-API
    :raise sqlite3.OperationalError: If any exceptions on the DB side are raised, e.g. DB being locked
    :return: A tuple containing a bool for logic purposes, and the data set to be added
    :rtype: tuple[bool, list[str/int]]
    """

    is_ad_codes = table_name == 'kody_rek'
    # The ad codes table is always compared on its kod_rek column.
    column = 'kod_rek' if is_ad_codes else field_name
    cur.execute(f"SELECT {column} FROM {table_name};")
    in_db = [row[0] for row in cur.fetchall()]

    # Empty table: everything is new, hand the input back untouched.
    if not in_db:
        return (True, data_)

    if avoid_adding is not None and table_name in avoid_adding:
        print(f'>>> Not adding to {table_name}. No new data found.')
        return (False, [])

    # Materialise the input first: a named Series would otherwise keep its own
    # name/index and break positional column lookups.
    values = list(data_)

    if not values:
        new_data = []
    elif is_ad_codes:
        parts = pd.Series(values).str.split(pat='@|@', expand=True, regex=False)
        codes = parts[0].astype('int32')
        known_codes = pd.Series(in_db).astype('int32')
        is_new = ~codes.isin(known_codes)
        # Rebuild the combined key from the integer code and the description.
        new_data = [f'{code}@|@{desc}'
                    for code, desc in zip(codes[is_new], parts.loc[is_new, 1])]
    else:
        candidates = pd.Series(values)
        # Boolean indexing on a Series keeps the original dtype (ints stay ints);
        # missing values are never reported as new.
        new_data = list(candidates[~candidates.isin(in_db)].dropna())

    if new_data:
        print(f'>>> Adding to {table_name}. New data found.')
        return (True, new_data)

    print(f'>>> Not adding to {table_name}. No new data found.')
    return (False, [])

In [23]:
# Opens the connection to the DB and creates the cursor used by the cells below.
# sqlite3.connect() is the documented entry point for creating a Connection.
print('Oppening connection.')
con = sqlite3.connect(cf.DB_PATH)
cur = con.cursor()

Oppening connection.


In [24]:
# Explicit dtypes for the columns of the CSV export.
csv_dtypes = {
    'Month': 'category',
    'Week': 'category',
    'Weekday': 'category',
    'Dayparts': 'object',
    'Channel Groups': 'object',
    'Channel': 'object',
    'PIB pos': 'int8',
    'PIB (real) rel': 'object',
    'PIB count': 'int16',
    'Dur rounded,sp': 'int8',
    'Spot Class': 'object',
    'Block Code': 'object',
    'Syndicate': 'object',
    'Producer': 'object',
    'Brand': 'object',
    'Film Code': 'object',
    'Prog Campaign': 'object',
    'Prog Before': 'object',
    'Prog After': 'object',
    'Film Code/2': 'object',
}

# Semicolon separated file, space as thousands separator, comma as decimal mark.
# NOTE(review): 'Time' is listed in parse_dates, yet the info() output below reports
# it as object, and the DateTime concatenation relies on it staying text.
df_tv = pd.read_csv(
    file_path,
    delimiter=';',
    thousands=' ',
    decimal=',',
    dtype=csv_dtypes,
    parse_dates=['Date', 'Time'],
    date_format='%d.%m.%Y',
)

# Normalise the naming columns: Brand is trimmed and upper-cased, the others upper-cased only.
df_tv['Brand'] = df_tv['Brand'].str.strip().str.upper()
for text_col in ('Producer', 'Syndicate'):
    df_tv[text_col] = df_tv[text_col].str.upper()

# Apply the replacement mappings; values without a mapping keep their original text.
for mapped_col, mapping in (('Brand', names), ('Dayparts', dayparts)):
    df_tv[mapped_col] = df_tv[mapped_col].map(mapping).fillna(df_tv[mapped_col])

# Full timestamp built from the ISO formatted date text and the time text.
df_tv['DateTime'] = pd.to_datetime(
    df_tv['Date'].astype('str') + ' ' + df_tv['Time'], format='ISO8601'
)

# Combined keys: the parts are glued together with the '@|@' / '#|#' separators.
df_tv['Kod Opis'] = df_tv['Film Code/2'] + '@|@' + df_tv['Film Code']
df_tv['Kanal Grupa'] = df_tv['Channel'] + '@|@' + df_tv['Channel Groups']
df_tv['Brand Prod Synd'] = df_tv['Brand'] + '@|@' + df_tv['Producer'] + '#|#' + df_tv['Syndicate']

# Missing values in these text columns are replaced with the 'brak danych' placeholder.
placeholder_cols = ['Dayparts', 'Prog Before', 'Prog After']
df_tv[placeholder_cols] = df_tv[placeholder_cols].fillna('brak danych', axis=1)
df_tv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 972606 entries, 0 to 972605
Data columns (total 27 columns):
 #   Column                                      Non-Null Count   Dtype         
---  ------                                      --------------   -----         
 0   Month                                       972606 non-null  category      
 1   Week                                        972606 non-null  category      
 2   Weekday                                     972606 non-null  category      
 3   Date                                        972606 non-null  datetime64[ns]
 4   Time                                        972606 non-null  object        
 5   Dayparts                                    972606 non-null  object        
 6   Channel Groups                              972606 non-null  object        
 7   Channel                                     972606 non-null  object        
 8   PIB pos                                     972606 non-null  int8         

In [25]:
# Unique dates as 'YYYY-MM-DD' text.
dates = df_tv['Date'].dt.strftime('%Y-%m-%d').unique()

# For the combined keys only the first row per key column is kept, then the
# combined '@|@' column is selected as a Series.
ad_codes = (
    df_tv.loc[:, ['Film Code/2', 'Film Code', 'Kod Opis']]
    .drop_duplicates(subset=['Film Code/2'], keep='first', ignore_index=True)['Kod Opis']
)
brands = (
    df_tv.loc[:, ['Brand', 'Producer', 'Syndicate', 'Brand Prod Synd']]
    .drop_duplicates(subset=['Brand'], keep='first', ignore_index=True)['Brand Prod Synd']
)
channels = (
    df_tv.loc[:, ['Channel', 'Channel Groups', 'Kanal Grupa']]
    .drop_duplicates(subset=['Channel'], keep='first', ignore_index=True)['Kanal Grupa']
)

# Plain unique values of the remaining single columns.
# NOTE(review): this rebinds `dayparts`, which an earlier cell defines as the mapping
# used while loading df_tv; re-run that cell before loading the data again.
(dayparts, pib_real_rels, durations, spot_classes,
 block_codes, prog_campaign, prog_before, prog_after) = (
    df_tv[column_name].unique()
    for column_name in ('Dayparts', 'PIB (real) rel', 'Dur rounded,sp', 'Spot Class',
                        'Block Code', 'Prog Campaign', 'Prog Before', 'Prog After')
)

# One entry per table: the values plus the table/field/type settings from the config.
data_set = [
    {'data': values, 'table': spec['table'], 'field': spec['field'], 'type': spec['type']}
    for values, spec in (
        (dates, cf.DATES),
        (ad_codes, cf.AD_CODE),
        (brands, cf.BRANDS),
        (channels, cf.CHANNELS),
        (dayparts, cf.DAYPARTS),
        (pib_real_rels, cf.PIB_R),
        (durations, cf.DUR),
        (spot_classes, cf.SPOT_CLASS),
        (block_codes, cf.BLOCK_CODE),
        (prog_campaign, cf.PR_CAMP),
        (prog_before, cf.PR_BEF),
        (prog_after, cf.PR_AFT),
    )
]

In [None]:
# Run the import. Both DB error types are handled the same way: the connection is
# closed, the error is reported and execution stops, because nothing further can
# be done with a closed connection.
try:
    iter_over_inputs(data_set, con, cur)
except (sqlite3.ProgrammingError, sqlite3.OperationalError) as e:
    con.close()
    print('Failed to input the data.')
    print(f'Error: {e}')
    exit()
    
# TODO CHANGE iter_over_inputs table for new key handling, and create logic for tables populated by triggers!