In [10]:
from pathlib import Path

import duckdb
import pandas as pd
from IPython.core.magic import register_cell_magic

# -----------------------------
# ## SAFE DUCKDB INITIALISATION ##
# -----------------------------

"""
 ------##### OBJECTIVES ######---------

    - parse fee to 'transfer_type'
        ##### DONE ######

    - translate fee to 'M's
        ##### DONE ######

    - find unique values for
        - 'club_name'
            ##### DONE ######
        - 'player_name'
            ##### DONE ######
        - 'nationality'
        - 'position'
        - 'club_involved_name'
        - 'club_involved_country'

    - find a way to parse
        - 'position'
        - 'nationality'
        - 'market value'
        - 'club_involved_name'
        - 'club_involved_country'
"""


# Path of the on-disk database file. The connection opened further down is
# in-memory, so in this cell the path is only used for the stale-lock cleanup
# and the parent-directory check.
DB_PATH = Path("./plt.duckdb")

# Close any connection left over from an earlier run of this cell, so that
# re-running the cell does not leave an open handle behind.
try:
    con.close()
    print("Closed previous DuckDB connection.")
except NameError:
    pass  # con not defined yet (first run on a fresh kernel)
except Exception as e:
    print("No previous connection to close or already closed:", e)

# Remove stale lock file if present
# NOTE(review): this only matters for a file-backed connection — confirm it is
# still wanted while the session runs in memory.
lock_path = DB_PATH.with_suffix(".duckdb.lock")
if lock_path.exists():
    lock_path.unlink()
    print("Removed stale DuckDB lock file.")

# Ensure folder exists
DB_PATH.parent.mkdir(parents=True, exist_ok=True)

# Connect fresh. The database is in-memory, so the message says so instead of
# claiming a connection to the file at DB_PATH.
con = duckdb.connect(database=":memory:")
print(f"✅ Connected to in-memory DuckDB (not the file at {DB_PATH.resolve()})")

# --- Optional: setup pretty pandas display ---
# None removes the respective display limit, so frames are rendered in full.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", None)
pd.set_option("display.expand_frame_repr", False)


# --- Register %%ducksql magic for SQL cells ---
@register_cell_magic
def ducksql(line, cell):
    """Run the body of a ``%%ducksql`` cell on the notebook's DuckDB connection.

    Args:
        line: Text that follows ``%%ducksql`` on the magic line; unused.
        cell: SQL text of the cell, passed unchanged to ``con.sql``.

    The result is rendered as a pandas DataFrame. When ``con.sql`` returns
    ``None`` (statements that produce no result set, e.g. a cell that ends
    with DDL), a short confirmation is printed instead of calling ``.df()``
    on ``None``. Errors are printed rather than raised, so a failing SQL cell
    does not produce a traceback.
    """
    # Imported here so the magic does not depend on `display` having been
    # injected into the builtins by the front end.
    from IPython.display import display

    try:
        res = con.sql(cell)
        if res is None:
            print("Statement executed (no result set).")
        else:
            display(res.df())
    except Exception as e:
        print("SQL error:", e)

Closed previous DuckDB connection.
✅ Connected to DuckDB at /Users/mohammed/repos/GreatLockIn2025/dbt_projects/dbt_football_project/DBT_PFL_Statistics/python_scripts/other_file/plt.duckdb


In [7]:
%%ducksql
-- 1) Create the staging schema if it does not exist yet.
CREATE SCHEMA IF NOT EXISTS staging;

-- 2) Point directly at your file (absolute path).
--    The view reads the CSV through read_csv_auto with HEADER=TRUE, so the
--    file's first row supplies the column names.
--    NOTE(review): the absolute path is specific to one machine — confirm
--    before running elsewhere.
CREATE OR REPLACE VIEW staging.transfers_all AS
SELECT *
FROM read_csv_auto('/Users/mohammed/repos/GreatLockIn2025/dbt_projects/dbt_football_project/DBT_PFL_Statistics/dbt/seeds/premier-league.csv', HEADER=TRUE);

-- 3) Preview the first 50 rows of the view.
SELECT * FROM staging.transfers_all LIMIT 50;

Unnamed: 0,club_name,player_name,age,position,club_involved_name,fee,transfer_movement,transfer_period,fee_cleaned,league_name,year,season
0,Middlesbrough FC,Tommy Wright,26,Left Winger,Leicester,€910Th.,in,Summer,0.91,Premier League,1992,1992/1993
1,Middlesbrough FC,Jonathan Gittens,28,defence,Southampton,€250Th.,in,Summer,0.25,Premier League,1992,1992/1993
2,Middlesbrough FC,Chris Morris,28,Right-Back,Celtic,?,in,Summer,,Premier League,1992,1992/1993
3,Middlesbrough FC,Ben Roberts,17,Goalkeeper,Boro U18,-,in,Summer,,Premier League,1992,1992/1993
4,Middlesbrough FC,Andy Todd,17,Centre-Back,Boro U18,-,in,Summer,,Premier League,1992,1992/1993
5,Middlesbrough FC,Stuart Ripley,24,Right Winger,Blackburn,€2.00m,out,Summer,2.0,Premier League,1992,1992/1993
6,Middlesbrough FC,Gary Parkinson,24,Right-Back,Southend United,loan transfer,out,Summer,,Premier League,1992,1992/1993
7,Middlesbrough FC,Bernie Slaven,31,Centre-Forward,Port Vale,?,out,Summer,,Premier League,1992,1992/1993
8,Middlesbrough FC,Andy Payton,24,attack,Celtic,?,out,Summer,,Premier League,1992,1992/1993
9,Middlesbrough FC,Ian Arnold,20,Centre-Forward,Carlisle United,?,out,Summer,,Premier League,1992,1992/1993


In [8]:
from pathlib import Path

import duckdb
import pandas as pd

"""
----------------- ## OBJECTIVES ## ------------------

               #### CLUB NAME #####
 - To Do:
    - find unique values for:
        -"club_name" <--------
            - writes these clubs to a file
                ------ ###### DONE #####------
"""

# 1) Where to write the seed
# NOTE(review): this relative path is resolved against the kernel's working
# directory — confirm that this is the intended dbt seeds folder.
SEED_DIR = Path("./dbt/seeds")
SEED_DIR.mkdir(parents=True, exist_ok=True)
OUT = (SEED_DIR / "premier-league_clubs.csv").as_posix()

# 2) Build canonical club table
#    norm  : strips a set of invisible / formatting code points from club_name.
#    clean : collapses whitespace for the display name and builds a lowercase
#            [a-z0-9 ] key used for de-duplication.
#    final : one row per key, keeping the alphabetically smallest spelling.
# Every REGEXP_REPLACE passes the 'g' (global) option: without it DuckDB
# replaces only the first match, which would leave later invisible characters,
# whitespace runs and punctuation in place.
# The character class uses RE2's \x{...} code-point escapes.
con.execute(r"""
CREATE OR REPLACE TEMP VIEW final_fpl_clubs AS
WITH base AS (
  SELECT club_name
  FROM staging.transfers_all
),
norm AS (
  SELECT
    club_name,
    REGEXP_REPLACE(
      club_name,
      '[\x{00A0}\x{2000}-\x{200F}\x{202A}-\x{202E}\x{2060}-\x{2069}\x{FEFF}]',
      '',
      'g'
    ) AS s0
  FROM base
),
clean AS (
  SELECT
    TRIM(REGEXP_REPLACE(s0, '\s+', ' ', 'g')) AS club_name_canonical,
    TRIM(
      REGEXP_REPLACE(
        REGEXP_REPLACE(LOWER(s0), '[^a-z0-9 ]+', ' ', 'g'),
        '\s+', ' ', 'g'
      )
    ) AS club_key
  FROM norm
)
SELECT
  MIN(club_name_canonical) AS club_name,
  club_key
FROM clean
GROUP BY club_key
ORDER BY club_name;
""")

# 3) Fetch and display result to verify
df = con.sql("SELECT * FROM final_fpl_clubs;").df()
print(f"\nPreview ({len(df)} rows):")
display(df.head(30))  # shows first 30 clubs

# 4) Write only if data is present, so an empty result never overwrites the
#    seed file with a header-only CSV.
if df.empty:
    print("⚠️ No rows returned — check your season filter or source table.")
else:
    con.execute(f"""
    COPY (SELECT * FROM final_fpl_clubs)
    TO '{OUT}'
    (HEADER, DELIMITER ',');
    """)
    print(f"✅ Wrote {len(df)} rows to {OUT}")


Preview (50 rows):


Unnamed: 0,club_name,club_key
0,AFC Bournemouth,afc bournemouth
1,Arsenal FC,arsenal fc
2,Aston Villa,aston villa
3,Barnsley FC,barnsley fc
4,Birmingham City,birmingham city
5,Blackburn Rovers,blackburn rovers
6,Blackpool FC,blackpool fc
7,Bolton Wanderers,bolton wanderers
8,Bradford City,bradford city
9,Brentford FC,brentford fc


✅ Wrote 50 rows to dbt/seeds/premier-league_clubs.csv
