In [1]:
# Logging is configured first, ahead of the remaining imports.
# NOTE(review): presumably so handlers are in place before project modules
# emit log records — confirm before reordering these lines.
from lib import log_setup
log_setup()


from config import Config
import logging
from jobs.linguist_pipeline import Linguist_Pipeline

# Notebook-level logger; the helper functions defined in the next cell write to it.
log = logging.getLogger(__name__)
# NOTE(review): `config` is not referenced by any cell visible here — confirm it is still needed.
config = Config()

In [2]:
from collections.abc import Callable
import pandas as pd

def create_dict(
        ref_list: list[str]
        ,translation: Callable[[int], str]
        ) -> dict[str, str]:
    '''Builds a lookup that pairs each constant with a generated reference id.

    Args:
        ref_list (list[str]): Constants that become the dictionary keys.
        translation (Callable[[int], str]): Called with each constant's
            1-based position; its result is stored as the value.

    Returns:
        dict[str, str]: Constant -> reference id. A constant that appears
            more than once keeps the id of its last occurrence.
    '''
    log.debug('Creating dictionary for reference table.')

    lookup: dict[str, str] = {}
    # Positions start at 1 so the first constant receives translation(1)
    for position, constant in enumerate(ref_list, start = 1):
        lookup[constant] = translation(position)
    return lookup


def create_ref_table(
        mapping: dict[str, str]
        ,target_col: str
    ) -> pd.DataFrame:
    '''Turns a constant -> id mapping into a two-column reference table.

    Args:
        mapping (dict[str, str]): Mapping from `create_dict()`; keys are the
            constants, values are their reference ids.
        target_col (str): Name of the column holding the constants. The id
            column is named by appending `_id` to it.

    Returns:
        pd.DataFrame: Two columns in the order `<target_col>_id`, `<target_col>`,
            one row per mapping entry.
    '''
    log.debug('Creating reference table.')

    # The id column takes the target column's name plus an '_id' suffix
    id_col = f'{target_col}_id'

    # Ids come from the mapping's values, constants from its keys; the id column goes first
    columns = {
        id_col: mapping.values(),
        target_col: mapping.keys()
    }
    return pd.DataFrame(columns)

In [3]:
languages = Linguist_Pipeline()
# Iterating the pipeline yields pairs; only the first element of each pair is printed.
for first, _second in languages:
    print(first)

04-10-25 15:49:21 jobs.linguist_pipeline.extract: DEBUG - Reading github-linguist YAML from cache.
04-10-25 15:49:22 lib.etl_tools: DEBUG - Creating dictionary for reference table.
04-10-25 15:49:22 lib.etl_tools: DEBUG - Creating reference table.


1C Enterprise
2-Dimensional Array
4D
ABAP
ABAP CDS
ABNF
AGS Script
AIDL
AL
AMPL
ANTLR
API Blueprint
APL
ASL
ASN.1
ASP.NET
ATS
ActionScript
Ada
Adblock Filter List
Adobe Font Metrics
Agda
Alloy
Alpine Abuild
Altium Designer
AngelScript
Answer Set Programming
Ant Build System
Antlers
ApacheConf
Apex
Apollo Guidance Computer
AppleScript
Arc
AsciiDoc
AspectJ
Assembly
Astro
Asymptote
Augeas
AutoHotkey
AutoIt
Avro IDL
Awk
B4X
BASIC
BQN
Ballerina
Batchfile
Beef
Befunge
Berry
BibTeX
BibTeX Style
Bicep
Bikeshed
Bison
BitBake
Blade
BlitzBasic
BlitzMax
Bluespec
Bluespec BH
Boo
Boogie
Brainfuck
BrighterScript
Brightscript
Browserslist
C
C#
C++
C-ObjDump
C2hs Haskell
CAP CDS
CIL
CLIPS
CMake
COBOL
CODEOWNERS
COLLADA
CSON
CSS
CSV
CUE
CWeb
Cabal Config
Caddyfile
Cadence
Cairo
Cairo Zero
CameLIGO
Cap'n Proto
Carbon
CartoCSS
Ceylon
Chapel
Charity
Checksums
ChucK
Circom
Cirru
Clarion
Clarity
Classic ASP
Clean
Click
Clojure
Closure Templates
Cloud Firestore Security Rules
CoNLL-U
CodeQL
CoffeeScript
ColdF

In [4]:
# Bare expression: the pipeline object's repr becomes the cell output.
languages

<LinguistPipeline(rows=762, cols=11)>

In [5]:
# Bare expression: displays the pipeline's `table` attribute as the cell output.
languages.table

Unnamed: 0,name_id,name
0,L1,1C Enterprise
1,L2,2-Dimensional Array
2,L3,4D
3,L4,ABAP
4,L5,ABAP CDS
...,...,...
757,L758,sed
758,L759,templ
759,L760,vCard
760,L761,wisp
