In [2]:
!pip install pandas numpy matplotlib seaborn scikit-learn


Defaulting to user installation because normal site-packages is not writeable
Collecting matplotlib
  Downloading matplotlib-3.10.0-cp312-cp312-win_amd64.whl.metadata (11 kB)
Collecting seaborn
  Downloading seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp312-cp312-win_amd64.whl.metadata (15 kB)
Collecting contourpy>=1.0.1 (from matplotlib)
  Downloading contourpy-1.3.1-cp312-cp312-win_amd64.whl.metadata (5.4 kB)
Collecting cycler>=0.10 (from matplotlib)
  Downloading cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib)
  Downloading fonttools-4.56.0-cp312-cp312-win_amd64.whl.metadata (103 kB)
Collecting kiwisolver>=1.3.1 (from matplotlib)
  Downloading kiwisolver-1.4.8-cp312-cp312-win_amd64.whl.metadata (6.3 kB)
Collecting pillow>=8 (from matplotlib)
  Downloading pillow-11.1.0-cp312-cp312-win_amd64.whl.metadata (9.3 kB)
Collecting pyparsing>=2.3.1 (from matplotlib)
  Downloading py

In [None]:
import os
import glob
import gdown 
import zipfile
import shutil

def download_data(save_folder, dataset_name="salaries"):
    """Download a named dataset archive from Google Drive and store its CSV.

    The archive is fetched with ``gdown``, unpacked into a scratch directory
    inside ``save_folder``, and the CSV it contains is moved to
    ``<save_folder>/<dataset_name>.csv``. Extracting into a scratch directory
    keeps files that already live in ``save_folder`` (for example CSVs from
    earlier downloads) from being mistaken for the freshly extracted one.

    Args:
        save_folder: Directory that receives the CSV; created if missing.
        dataset_name: One of "salaries", "exams", "college", "cars", "mall".

    Returns:
        None. Progress and error conditions are reported with ``print``.

    Raises:
        AssertionError: If ``dataset_name`` is not one of the known datasets.
    """
    # Local import: only this function needs it, and the notebook's import
    # cell does not provide it.
    import tempfile

    dataset_list = ["salaries", "exams", "college", "cars", "mall"]
    assert dataset_name in dataset_list, f"Choose one of the available datasets: {dataset_list}"

    # Google Drive file id of the ZIP archive for each dataset.
    file_ids = {
        "college": "1vwfMpQ4ikAI91zn1bWxP_Iqz7DTFUA9F",
        "salaries": "1p-XtX29fgXT9CzBfpHm3t8r028gQPRhe",
        "exams": "1TYN_sRmauaDgNYgQ-0VSHVAJvLoxKx2R",
        "cars": "1Fi5IPdfEktnKyf3dyHmnh84a2jiXl33A",
        "mall": "1eGWJVRNmGjfaH0o3dczBbNe_-RrW0_Jm",
    }

    file_id = file_ids[dataset_name]
    zip_path = os.path.join(save_folder, f"{dataset_name}.zip")
    csv_path = os.path.join(save_folder, f"{dataset_name}.csv")

    os.makedirs(save_folder, exist_ok=True)

    # Skip the download entirely when the target CSV is already present.
    if os.path.isfile(csv_path):
        print(f"{dataset_name}.csv is already downloaded.")
        return

    print(f"⬇️ Downloading dataset: {dataset_name}...")

    url = f"https://drive.google.com/uc?id={file_id}"
    gdown.download(url, zip_path, quiet=False)

    if not os.path.exists(zip_path):
        print("Error: ZIP file was not downloaded!")
        return
    if not zipfile.is_zipfile(zip_path):
        print("Error: The downloaded file is not a valid ZIP! Check the Google Drive link.")
        return

    # The scratch directory lives inside save_folder so the final move stays
    # within one directory tree; it is deleted when the block exits.
    with tempfile.TemporaryDirectory(dir=save_folder) as extract_dir:
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(extract_dir)

        os.remove(zip_path)

        if not os.listdir(extract_dir):
            print("Error: No files were extracted! Check if the ZIP contains a subfolder.")
            return

        # Search the extracted tree recursively so a CSV nested in a
        # subfolder is found regardless of what else the archive holds.
        csv_files = []
        for root, dirs, names in os.walk(extract_dir):
            # Skip macOS archive metadata, which can contain "._*.csv" stubs.
            dirs[:] = [d for d in dirs if d != "__MACOSX"]
            for name in names:
                if name.startswith("."):
                    continue
                if name.lower().endswith(".csv"):
                    csv_files.append(os.path.join(root, name))

        if not csv_files:
            print("Error: No CSV files found after extraction!")
            return

        # If the archive holds several CSVs, keep the most recently created.
        latest_csv = max(csv_files, key=os.path.getctime)
        shutil.move(latest_csv, csv_path)

    print(f" {dataset_name}.csv successfully downloaded and saved.")

# Fetch each dataset into the local "datasets" folder. Names must match the
# ones download_data accepts: the valid key is "mall" ("malls" would trip its
# assertion).
for dataset in ("exams", "college", "cars", "mall"):
    download_data("datasets", dataset)

exams.csv is already downloaded.
college.csv is already downloaded.
