```bash
# Create the baseline Dataproc cluster "cluster-ad99" from the stock
# 2.2-debian12 image: 2 workers, component gateway enabled, --no-address set.
gcloud dataproc clusters create cluster-ad99 \
  --project=werner-staging \
  --region=europe-west1 \
  --zone=europe-west1-c \
  --image-version=2.2-debian12 \
  --enable-component-gateway \
  --no-address \
  --num-workers=2 \
  --master-machine-type=n4-standard-2 \
  --master-boot-disk-type=hyperdisk-balanced \
  --master-boot-disk-size=100 \
  --worker-machine-type=n4-standard-2 \
  --worker-boot-disk-type=hyperdisk-balanced \
  --worker-boot-disk-size=200
```

```bash
# Print the cluster's configuration. --project matches the project the cluster
# is created in, so the lookup does not depend on the active gcloud config.
gcloud dataproc clusters describe cluster-ad99 \
  --region=europe-west1 \
  --project=werner-staging
```

Extrait de la sortie (section `masterConfig`) :

```yaml
instanceNames:
    - cluster-ad99-m # master
```

```bash
# Open an SSH session on the master node of cluster-ad99.
# --project matches the project the cluster is created in.
# NOTE(review): the cluster is created with --no-address, so this presumably
# relies on IAP tunnelling or internal network access — confirm.
gcloud compute ssh cluster-ad99-m \
  --zone=europe-west1-c \
  --project=werner-staging
```

## Créer une VM d'installation

```bash
# Create the Debian 12 VM "image-vm" (20GB boot disk) that is later used as the
# source disk of the custom image. The local file startup-script.sh is attached
# as the VM's startup-script metadata.
# NOTE(review): unlike the cluster commands, no --project is given here —
# presumably the active gcloud configuration selects the right project; confirm.
gcloud compute instances create image-vm \
  --zone=europe-west1-c \
  --image-family=debian-12 \
  --image-project=debian-cloud \
  --boot-disk-size=20GB \
  --metadata-from-file startup-script=startup-script.sh

```


## On vérifie les logs d'installation (à exécuter sur la VM)

```bash
# Show the startup-script output (run this on the VM itself).
# grep reads the file directly, so no `cat |` is needed.
# NOTE(review): Debian 12 images may not ship rsyslog, in which case
# /var/log/syslog does not exist and the output is only in the systemd journal;
# fall back to the journal of the startup-script unit.
if [[ -r /var/log/syslog ]]; then
  grep startup-script /var/log/syslog
else
  sudo journalctl -u google-startup-scripts.service --no-pager
fi
```

## On se connecte à l'instance 

```bash
# Open an SSH session on image-vm in zone europe-west1-c.
# NOTE(review): no --project is given — presumably the active gcloud
# configuration selects the project the VM was created in; confirm.
gcloud compute ssh image-vm --zone=europe-west1-c
```

## On installe Miniconda

```bash
# Download the installation script
# Download the Miniconda installer
wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh

# Install in batch mode (-b) into an explicit prefix (-p) so that the
# ~/miniconda3 path used below is guaranteed to exist, whatever would have been
# answered at the interactive prompt. Running the installer through bash makes
# the chmod step unnecessary.
# NOTE: -b accepts the licence terms without prompting.
bash ~/miniconda.sh -b -p "$HOME/miniconda3"

# Load conda into the current shell, then register it for future bash sessions
source ~/miniconda3/bin/activate
conda init bash

# Do not activate the "base" environment automatically in new shells
conda config --set auto_activate_base false
```

## On crée l'environnement conda

```bash
# Create the "pipeline-prod" environment pinned to Python 3.10.9,
# then make it the active environment of the current shell.
conda create --name pipeline-prod python==3.10.9
conda activate pipeline-prod
```

## On installe le projet

```bash
# Clone the pipeline repository into ./werner-data-pipeline
git clone https://github.com/Erwin-Labs/werner-data-pipeline.git
```

## On installe les bibliothèques

```bash
# Install the dependencies listed in production/requirements.txt of the cloned repository.
# NOTE(review): presumably meant to run with the pipeline-prod conda
# environment active, so that its pip is the one used — confirm.
cd werner-data-pipeline
pip install -r ./production/requirements.txt
```

## On crée une image à partir de la VM

```bash
gcloud compute instances stop image-vm --zone=europe-west1-c
gcloud compute images create dataproc-conda310-image \
  --source-disk=image-vm \
  --source-disk-zone=europe-west1-c
```

## Créer un cluster Dataproc avec cette image

```bash
# Create the cluster from the custom image dataproc-conda310-image.
# --image and --image-version are mutually exclusive in
# `gcloud dataproc clusters create`, so --image-version is not passed here.
# NOTE(review): Dataproc documents custom images as being built with its
# generate_custom_image.py tool — confirm that an image captured directly from
# a plain Debian VM is accepted.
# NOTE(review): the name cluster-ad99 is also used by the first cluster created
# in this guide; delete that cluster first or choose another name.
gcloud dataproc clusters create cluster-ad99 \
  --image dataproc-conda310-image \
  --region europe-west1 \
  --zone europe-west1-c \
  --enable-component-gateway \
  --no-address \
  --master-machine-type n4-standard-2 \
  --master-boot-disk-type hyperdisk-balanced \
  --master-boot-disk-size 100 \
  --num-workers 2 \
  --worker-machine-type n4-standard-2 \
  --worker-boot-disk-type hyperdisk-balanced \
  --worker-boot-disk-size 200 \
  --project werner-staging
```