In [0]:
%pip install GitPython

In [0]:
%restart_python

In [0]:
import os
from git import Repo
import tempfile

In [0]:
# Reset the demo environment so the notebook starts from a clean slate on
# every run.
# IF EXISTS makes the statement a no-op when the catalog is absent, and CASCADE
# drops the contained schemas together with their objects (volumes included),
# so no SHOW ... lookups or per-object drops are needed beforehand.
spark.sql("DROP CATALOG IF EXISTS mycatalog CASCADE")

In [0]:
%sql
-- Provision the catalog, schema and volume, each only when it is missing.
CREATE CATALOG IF NOT EXISTS MyCatalog;
CREATE SCHEMA IF NOT EXISTS MyCatalog.MySchema;
CREATE VOLUME IF NOT EXISTS MyCatalog.MySchema.MyVolume;

In [0]:
repo_url = "https://github.com/Gowtham-n-db/databricks-learning.git"

# File suffixes that are collected from the cloned repository.
DATA_FILE_SUFFIXES = ('.csv', '.json', '.xml', '.parquet')

# The clone lives in a temporary directory that is removed when the `with`
# block exits, so every matching file is read fully into memory here.
# `repo_path` stays bound (as a plain string) after the block ends.
with tempfile.TemporaryDirectory() as repo_path:
    # Shallow, single-branch clone: only the tip of branch 'Gowtham' is fetched.
    Repo.clone_from(repo_url, repo_path, branch='Gowtham', single_branch=True, depth=1)

    # Maps the absolute path of each collected file inside the clone to its raw bytes.
    file_data = {}
    for root, dirs, files in os.walk(repo_path):
        # Prune in place: os.walk only skips a directory when the list it
        # yielded is mutated. Rebinding `dirs` to a new list has no effect and
        # would leave the .git directory in the traversal.
        dirs[:] = [item for item in dirs if item != '.git']
        for file in files:
            # str.endswith accepts a tuple and matches any of its suffixes.
            if file.endswith(DATA_FILE_SUFFIXES):
                file_path = os.path.join(root, file)
                with open(file_path, 'rb') as f:
                    file_data[file_path] = f.read()
            


In [0]:
# Write the collected files into the volume, preserving the repository's
# directory layout beneath `output_path`.
output_path = "/Volumes/mycatalog/myschema/myvolume/repofiles"

# exist_ok=True creates the directory when missing and is a no-op when it is
# already there, avoiding the check-then-create race of a separate exists() test.
os.makedirs(output_path, exist_ok=True)

for file_path, data in file_data.items():
    # Keys of file_data are absolute paths inside the clone directory; strip
    # that prefix to get the path relative to the repository root.
    relative_path = os.path.relpath(file_path, repo_path)
    output_file_path = os.path.join(output_path, relative_path)
    output_dir = os.path.dirname(output_file_path)
    # Make sure nested sub-directories exist before writing the file.
    os.makedirs(output_dir, exist_ok=True)
    with open(output_file_path, 'wb') as f:
        f.write(data)