# Auto install python libraries for Data Science

In [None]:
# -*- coding: utf-8 -*-

import subprocess
import sys

def install_libs(packages):
    for package in packages:
        try:
            __import__(package["import_name"])
            print(f"✅ {package['name']} already installed")
        except ImportError:
            print(f"⬇️ Installing {package['name']}...")

            # ✅ Handle both string and list pip_name
            pip_args = package["pip_name"]
            if isinstance(pip_args, str):
                pip_args = [pip_args]

            try:
                subprocess.check_call([sys.executable, "-m", "pip", "install", *pip_args])
                print(f"✅ {package['name']} installed")
            except subprocess.CalledProcessError as e:
                print(f"❌ Failed to install {package['name']}: {e}")



In [None]:
def upgrade_pip():
    """
    Upgrade pip to the latest version using subprocess.
    """
    try:
        subprocess.check_call([sys.executable, "-m", "pip", "install", "--upgrade", "pip"])
        print("✅ pip upgraded successfully.")
    except subprocess.CalledProcessError as e:
        print(f"❌ Failed to upgrade pip: {e}")


upgrade_pip() #upgrade pip first

Collecting pip
  Using cached pip-25.1.1-py3-none-any.whl.metadata (3.6 kB)
Using cached pip-25.1.1-py3-none-any.whl (1.8 MB)
Installing collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 25.0.1
    Uninstalling pip-25.0.1:
      Successfully uninstalled pip-25.0.1
Successfully installed pip-25.1.1
✅ pip upgraded successfully.


# Mac specific pre-setup by installing require brew packages.

In [3]:
import shutil
import platform

def is_brew_available():
    return shutil.which("brew") is not None

def is_package_installed(package):
    result = subprocess.run(["brew", "list", "--formula", package], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    return result.returncode == 0

def brew_install(packages):
    if not is_brew_available():
        print("❌ Homebrew is not available in PATH.")
        return

    for package in packages:
        if is_package_installed(package):
            print(f"✅ {package} is already installed.")
        else:
            print(f"⬇️ Installing {package}...")
            try:
                subprocess.check_call(["brew", "install", package])
                print(f"✅ {package} installed successfully.\n")
            except subprocess.CalledProcessError as e:
                print(f"❌ Failed to install {package}: {e}\n")

# brew packages to install
libs = ["unixodbc", # ODBC driver for Snowflake
        "snowflake", # Snowflake CLI
        "libomp", # OpenMP support for XGBoost & LightGBM
        'cmake', # CMake build system
        "llvm", # Modern compiler toolchain used by some ML libraries
        'openssl@3', # OpenSSL 3.x for compatibility
        ]


# Brew install if on macOS
if platform.system() == 'Darwin':  # macOS returns 'Darwin'
    brew_install(libs)
else:
    print("Not on macOS, skipping brew install")


✅ unixodbc is already installed.
✅ snowflake is already installed.
✅ libomp is already installed.
✅ cmake is already installed.
✅ llvm is already installed.
✅ openssl@3 is already installed.


# Data Science Libraries
## 1. Data Wraggling Libs

In [4]:
dw = [
    # ✅ Data Wrangling
    {"name": "Pandas", "pip_name": "pandas", "import_name": "pandas"},
    {"name": "Polars", "pip_name": "polars", "import_name": "polars"},
    {"name": "Dask", "pip_name": "dask[dataframe]", "import_name": "dask"},
    {"name": "PyArrow", "pip_name": "pyarrow", "import_name": "pyarrow"},
    {"name": "PyTables", "pip_name": "tables", "import_name": "tables"},
    {"name": "Feather", "pip_name": "pyarrow", "import_name": "pyarrow.feather"},
    {"name": "Parquet", "pip_name": "pyarrow", "import_name": "pyarrow.parquet"},
    {"name": "HDF5", "pip_name": "tables", "import_name": "tables"},
    {"name": "CSV", "pip_name": "pandas", "import_name": "pandas"},
    {"name": "JSON", "pip_name": "pandas", "import_name": "pandas"},
    {"name": "Excel", "pip_name": "openpyxl", "import_name": "openpyxl"},
    {"name": "HDF5", "pip_name": "tables", "import_name": "tables"},
]

install_libs(dw)

✅ Pandas already installed
✅ Polars already installed
✅ Dask already installed
✅ PyArrow already installed
✅ PyTables already installed
✅ Feather already installed
✅ Parquet already installed
✅ HDF5 already installed
✅ CSV already installed
✅ JSON already installed
✅ Excel already installed
✅ HDF5 already installed


## 2. Data Connectors

In [5]:
dc = [
    # 🔌 Data Connections – Databases
    {"name": "SQLAlchemy", "pip_name": "sqlalchemy", "import_name": "sqlalchemy"},
    {"name": "PyODBC", "pip_name": "pyodbc", "import_name": "pyodbc"},
    {"name": "psycopg2 (PostgreSQL)", "pip_name": "psycopg2-binary", "import_name": "psycopg2"},
    {"name": "MySQL Connector", "pip_name": "mysql-connector-python", "import_name": "mysql.connector"},
    {"name": "MongoDB", "pip_name": "pymongo", "import_name": "pymongo"},
    {"name": "SQLite", "pip_name": "sqlite3", "import_name": "sqlite3"},

    # 🔌 Data Connections – Cloud
    {"name": "Snowflake Connector", "pip_name": "snowflake-connector-python", "import_name": "snowflake.connector"},
    {"name": "Google BigQuery", "pip_name": "google-cloud-bigquery", "import_name": "google.cloud.bigquery"},
    {"name": "AWS SDK (boto3)", "pip_name": "boto3", "import_name": "boto3"},
    {"name": "Azure SDK", "pip_name": "azure-storage-blob", "import_name": "azure.storage.blob"},
    {"name": "Databricks SQL Connector", "pip_name": "databricks-sql-connector", "import_name": "databricks.sql"},
    {"name": "Apache Kafka", "pip_name": "kafka-python", "import_name": "kafka"},

    # 📂 Microsoft SharePoint (via Office365 REST API)
    {"name": "Office365-REST-Python-Client", "pip_name": "Office365-REST-Python-Client", "import_name": "office365.sharepoint.client_context"},

    # 📊 Tableau / Power BI Integration
    {"name": "Tableau Hyper API", "pip_name": "tableauhyperapi", "import_name": "tableauhyperapi"},
    {"name": "Power BI REST Client", "pip_name": "powerbiclient", "import_name": "powerbiclient"},

    # 🌐 API Libraries
    {"name": "Requests", "pip_name": "requests", "import_name": "requests"},
    {"name": "HTTPX", "pip_name": "httpx", "import_name": "httpx"},
    {"name": "FastAPI", "pip_name": "fastapi", "import_name": "fastapi"},
    {"name": "Uvicorn (for FastAPI)", "pip_name": "uvicorn", "import_name": "uvicorn"},
    {"name": "Flask", "pip_name": "flask", "import_name": "flask"},
    {"name": "Django", "pip_name": "django", "import_name": "django"},
    {"name": "Tornado", "pip_name": "tornado", "import_name": "tornado"},
]

install_libs(dc)

✅ SQLAlchemy already installed
✅ PyODBC already installed
✅ psycopg2 (PostgreSQL) already installed
✅ MySQL Connector already installed
✅ MongoDB already installed
✅ SQLite already installed


  warn_incompatible_dep(


✅ Snowflake Connector already installed
✅ Google BigQuery already installed
✅ AWS SDK (boto3) already installed
✅ Azure SDK already installed
✅ Databricks SQL Connector already installed
✅ Apache Kafka already installed
✅ Office365-REST-Python-Client already installed
✅ Tableau Hyper API already installed
✅ Power BI REST Client already installed
✅ Requests already installed
✅ HTTPX already installed
✅ FastAPI already installed
✅ Uvicorn (for FastAPI) already installed
✅ Flask already installed
✅ Django already installed
✅ Tornado already installed


## 3. Stat and Visualization Libs

In [6]:
sv = [
    # 🔍 Search & Text Processing
    {"name": "Whoosh", "pip_name": "whoosh", "import_name": "whoosh"},
    {"name": "NLTK", "pip_name": "nltk", "import_name": "nltk"},
    {"name": "spaCy", "pip_name": "spacy", "import_name": "spacy"},
    {"name": "TextBlob", "pip_name": "textblob", "import_name": "textblob"},
    {"name": "Gensim", "pip_name": "gensim", "import_name": "gensim"},

    # 🧭 Data exploration and profiling
    {"name": "DataExplorer", "pip_name": "dataexplorer", "import_name": "dataexplorer"},
    {"name": "DataProfiler", "pip_name": "dataprofiler", "import_name": "dataprofiler"},
    {"name": "Autoviz", "pip_name": "autovizwidget", "import_name": "autovizwidget"},
    {"name": "Pandas Profiling", "pip_name": "pandas-profiling", "import_name": "pandas_profiling"},
    {"name": "YData Profiling", "pip_name": "ydata-profiling", "import_name": "ydata_profiling"},
    {"name": "Sweetviz", "pip_name": "sweetviz", "import_name": "sweetviz"},
    {"name": "D-Tale", "pip_name": "d-tale", "import_name": "dtale"},

    # 📊 Statistics & Math
    {"name": "NumPy", "pip_name": "numpy", "import_name": "numpy"},
    {"name": "SciPy", "pip_name": "scipy", "import_name": "scipy"},
    {"name": "Statsmodels", "pip_name": "statsmodels", "import_name": "statsmodels"},

    # 📈 Visualization
    {"name": "Matplotlib", "pip_name": "matplotlib", "import_name": "matplotlib"},
    {"name": "Seaborn", "pip_name": "seaborn", "import_name": "seaborn"},
    {"name": "Plotly", "pip_name": "plotly", "import_name": "plotly"},
    {"name": "Altair", "pip_name": "altair", "import_name": "altair"},
    {"name": "Bokeh", "pip_name": "bokeh", "import_name": "bokeh"},
    {"name": "Geopandas", "pip_name": "geopandas", "import_name": "geopandas"},
]

install_libs(sv)

✅ Whoosh already installed
✅ NLTK already installed
✅ spaCy already installed
✅ TextBlob already installed
⬇️ Installing Gensim...
Collecting gensim
  Using cached gensim-4.3.3.tar.gz (23.3 MB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Using cached numpy-1.26.4-cp313-cp313-macosx_15_0_arm64.whl
Collecting scipy<1.14.0,>=1.7.0 (from gensim)
  Using cached scipy-1.13.1.tar.gz (57.2 MB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Installing backend dependencies: started
  Installing backend

  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mPreparing metadata [0m[1;32m([0m[32mpyproject.toml[0m[1;32m)[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m [31m[54 lines of output][0m
  [31m   [0m [36m[1m+ meson setup /private/var/folders/z0/fnx5vqm50lgchltnb7sh25680000gq/T/pip-install-vcb2bj6t/scipy_4fbb9db18e634f3ea774030f83751e1b /private/var/folders/z0/fnx5vqm50lgchltnb7sh25680000gq/T/pip-install-vcb2bj6t/scipy_4fbb9db18e634f3ea774030f83751e1b/.mesonpy-15u76ox0 -Dbuildtype=release -Db_ndebug=if-release -Db_vscrt=md --native-file=/private/var/folders/z0/fnx5vqm50lgchltnb7sh25680000gq/T/pip-install-vcb2bj6t/scipy_4fbb9db18e634f3ea774030f83751e1b/.mesonpy-15u76ox0/meson-python-native-file.ini[0m
  [31m   [0m The Meson build system
  [31m   [0m Version: 1.8.2
  [31m   [0m Source dir: /private/var/folders/z0/fnx5vqm50lgchltnb7sh25680000gq/T/pip-install-vcb2bj6t/scipy_4fbb9db18e634f3ea774030f83751e1b
  [

❌ Failed to install Gensim: Command '['/Users/nhan.tran/Library/CloudStorage/OneDrive-Interpublic/Documents/GitHub/andis-all-stars/Nhan/env/bin/python', '-m', 'pip', 'install', 'gensim']' returned non-zero exit status 1.
⬇️ Installing DataExplorer...
✅ DataExplorer installed


  import pkg_resources


⬇️ Installing DataProfiler...
Collecting numpy<2.0.0 (from dataprofiler)
  Using cached numpy-1.26.4-cp313-cp313-macosx_15_0_arm64.whl
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.3.0
    Uninstalling numpy-2.3.0:
      Successfully uninstalled numpy-2.3.0
Successfully installed numpy-1.26.4


[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but you have numpy 1.26.4 which is incompatible.
ml-dtypes 0.5.1 requires numpy>=2.1.0; python_version >= "3.13", but you have numpy 1.26.4 which is incompatible.[0m[31m
[0m

✅ DataProfiler installed
✅ Autoviz already installed
⬇️ Installing Pandas Profiling...
Collecting pandas-profiling
  Using cached pandas_profiling-3.2.0-py2.py3-none-any.whl.metadata (21 kB)
Collecting joblib~=1.1.0 (from pandas-profiling)
  Using cached joblib-1.1.1-py2.py3-none-any.whl.metadata (5.2 kB)
Collecting markupsafe~=2.1.1 (from pandas-profiling)
  Using cached MarkupSafe-2.1.5.tar.gz (19 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting visions==0.7.4 (from visions[type_image_path]==0.7.4->pandas-profiling)
  Using cached visions-0.7.4-py3-none-any.whl.metadata (5.9 kB)
Collecting htmlmin>=0.1.12 (from pandas-profiling)
  Using cached htmlmin-0.1.12.tar.gz (19 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'error'


  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m [31m[19 lines of output][0m
  [31m   [0m Traceback (most recent call last):
  [31m   [0m   File [35m"<string>"[0m, line [35m2[0m, in [35m<module>[0m
  [31m   [0m     [31mexec[0m[1;31m(compile('''[0m
  [31m   [0m     [31m~~~~[0m[1;31m^^^^^^^^^^^^[0m
  [31m   [0m     [1;31m# This is <pip-setuptools-caller> -- a caller that pip uses to run setup.py[0m
  [31m   [0m     [1;31m^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^[0m
  [31m   [0m     ...<32 lines>...
  [31m   [0m     [1;31mexec(compile(setup_py_code, filename, "exec"))[0m
  [31m   [0m     [1;31m^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^[0m
  [31m   [0m     [1;31m''' % ('/private/var/folders/z0/fnx5vqm50lgchltnb7sh25680000gq/T/pip-install-8xb97fq9/htmlmin_ac4e7bd322b248c78577

❌ Failed to install Pandas Profiling: Command '['/Users/nhan.tran/Library/CloudStorage/OneDrive-Interpublic/Documents/GitHub/andis-all-stars/Nhan/env/bin/python', '-m', 'pip', 'install', 'pandas-profiling']' returned non-zero exit status 1.
⬇️ Installing YData Profiling...


[31mERROR: Ignored the following versions that require a different python version: 4.0.0 Requires-Python >=3.7, <3.11; 4.1.0 Requires-Python >=3.7, <3.12; 4.1.1 Requires-Python >=3.7, <3.12; 4.1.2 Requires-Python >=3.7, <3.12; 4.10.0 Requires-Python <3.13,>=3.7; 4.11.0 Requires-Python <3.13,>=3.7; 4.12.0 Requires-Python <3.13,>=3.7; 4.12.1 Requires-Python <3.13,>=3.7; 4.12.2 Requires-Python <3.13,>=3.7; 4.13.0 Requires-Python <3.13,>=3.7; 4.14.0 Requires-Python <3.13,>=3.7; 4.15.0 Requires-Python <3.13,>=3.7; 4.15.1 Requires-Python <3.13,>=3.7; 4.16.0 Requires-Python <3.13,>=3.7; 4.16.1 Requires-Python <3.13,>=3.7; 4.2.0 Requires-Python >=3.7, <3.12; 4.3.0 Requires-Python >=3.7, <3.12; 4.3.1 Requires-Python >=3.7, <3.12; 4.3.2 Requires-Python >=3.7, <3.12; 4.4.0 Requires-Python >=3.7, <3.12; 4.5.0 Requires-Python >=3.7, <3.12; 4.5.1 Requires-Python >=3.7, <3.12; 4.6.0 Requires-Python >=3.7, <3.12; 4.6.1 Requires-Python >=3.7, <3.12; 4.6.2 Requires-Python >=3.7, <3.12; 4.6.3 Requires-P

❌ Failed to install YData Profiling: Command '['/Users/nhan.tran/Library/CloudStorage/OneDrive-Interpublic/Documents/GitHub/andis-all-stars/Nhan/env/bin/python', '-m', 'pip', 'install', 'ydata-profiling']' returned non-zero exit status 1.
✅ Sweetviz already installed
⬇️ Installing D-Tale...


[31mERROR: Could not find a version that satisfies the requirement d-tale (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for d-tale[0m[31m
[0m

❌ Failed to install D-Tale: Command '['/Users/nhan.tran/Library/CloudStorage/OneDrive-Interpublic/Documents/GitHub/andis-all-stars/Nhan/env/bin/python', '-m', 'pip', 'install', 'd-tale']' returned non-zero exit status 1.
✅ NumPy already installed
✅ SciPy already installed
✅ Statsmodels already installed
✅ Matplotlib already installed
✅ Seaborn already installed
✅ Plotly already installed
✅ Altair already installed
✅ Bokeh already installed
✅ Geopandas already installed


## 4. ML and Other Libs

In [7]:
# Define required packages
import os
os.environ["DYLD_LIBRARY_PATH"] = "/opt/homebrew/opt/libomp/lib"

ml = [
    # 🤖 Machine Learning
    {"name": "Scikit-learn", "pip_name": "scikit-learn", "import_name": "sklearn"},
    {"name": "CatBoost", "pip_name": "catboost", "import_name": "catboost"},
    {"name": "Category Encoders", "pip_name": "category_encoders", "import_name": "category_encoders"},
    {"name": "PyTorch", "pip_name": ["torch", "torchvision", "torchaudio"], "import_name": "torch"},
    {"name": "Keras", "pip_name": "keras", "import_name": "keras"},
    {"name": "Hugging Face Transformers", "pip_name": "transformers", "import_name": "transformers"},
    {"name": "Fastai", "pip_name": "fastai", "import_name": "fastai"},
    {"name": "TPOT", "pip_name": "tpot", "import_name": "tpot"},
    {"name": "MLflow", "pip_name": "mlflow", "import_name": "mlflow"},

    # 🧪 Notebook Environment
    {"name": "JupyterLab", "pip_name": "jupyterlab", "import_name": "jupyterlab"},
    {"name": "Notebook", "pip_name": "notebook", "import_name": "notebook"},
    {"name": "Voila", "pip_name": "voila", "import_name": "voila"},
    
]

install_libs(ml)


✅ Scikit-learn already installed
✅ CatBoost already installed
✅ Category Encoders already installed
✅ PyTorch already installed
⬇️ Installing Keras...
Collecting numpy (from keras)
  Using cached numpy-2.3.0-cp313-cp313-macosx_14_0_arm64.whl.metadata (62 kB)
Using cached numpy-2.3.0-cp313-cp313-macosx_14_0_arm64.whl (5.1 MB)
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.26.4
    Uninstalling numpy-1.26.4:
      Successfully uninstalled numpy-1.26.4


[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
dataprofiler 0.13.3 requires numpy<2.0.0, but you have numpy 2.3.0 which is incompatible.[0m[31m
[0m

Successfully installed numpy-2.3.0
✅ Keras installed
✅ Hugging Face Transformers already installed
✅ Fastai already installed
✅ TPOT already installed
✅ MLflow already installed
✅ JupyterLab already installed
✅ Notebook already installed
✅ Voila already installed


In [8]:
ml2 = [
    # 🤖 Machine Learning 2 - separating out these bc they tend to encounter trouble when installing
    {"name": "XGBoost", "pip_name": "xgboost", "import_name": "xgboost"},
    {"name": "LightGBM", "pip_name": "lightgbm", "import_name": "lightgbm"},
    # {"name": "TensorFlow", "pip_name": "tensorflow", "import_name": "tensorflow"},
    ]

install_libs(ml2)

✅ XGBoost already installed
✅ LightGBM already installed


In [9]:
# !pip uninstall xgboost -y

# Pip Upgrade all Lib (optional)

In [None]:
import pkg_resources
def upgrade_all_packages():
    """
    Upgrade all installed pip packages to their latest version.
    """
    print("🔍 Collecting installed packages...")
    packages = [dist.project_name for dist in pkg_resources.working_set]
    
    print("🚀 Upgrading packages...")
    for package in packages:
        try:
            subprocess.check_call([sys.executable, "-m", "pip", "install", "--upgrade", package])
            print(f"✅ Upgraded: {package}")
        except subprocess.CalledProcessError as e:
            print(f"❌ Failed to upgrade {package}: {e}")

# upgrade_all_packages()
