In [32]:
import os
import glob
import gdown 
import zipfile
import shutil

def download_data(save_folder, dataset_name="salaries"):
    dataset_list = ["salaries", "exams", "college", "cars", "mall"]
    assert dataset_name in dataset_list, f"Choose one of the available datasets: {dataset_list}"

    file_ids = {
        "college": "1vwfMpQ4ikAI91zn1bWxP_Iqz7DTFUA9F",
        "salaries": "1p-XtX29fgXT9CzBfpHm3t8r028gQPRhe",
        "exams": "1TYN_sRmauaDgNYgQ-0VSHVAJvLoxKx2R",
        "cars": "1Fi5IPdfEktnKyf3dyHmnh84a2jiXl33A",
        "mall": "1eGWJVRNmGjfaH0o3dczBbNe_-RrW0_Jm",
    }

    file_id = file_ids[dataset_name]
    zip_path = os.path.join(save_folder, f"{dataset_name}.zip")
    csv_path = os.path.join(save_folder, f"{dataset_name}.csv")

    os.makedirs(save_folder, exist_ok=True)

    if os.path.isfile(csv_path):
        print(f"{dataset_name}.csv is already downloaded.")
        return

    print(f"⬇️ Downloading dataset: {dataset_name}...")

    url = f"https://drive.google.com/uc?id={file_id}"
    gdown.download(url, zip_path, quiet=False)

    if not os.path.exists(zip_path):
        print("Error: ZIP file was not downloaded!")
        return
    if not zipfile.is_zipfile(zip_path):
        print("Error: The downloaded file is not a valid ZIP! Check the Google Drive link.")
        return

    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(save_folder)

    os.remove(zip_path)

    extracted_files = glob.glob(os.path.join(save_folder, "*"))
    
    if not extracted_files:
        print("Error: No files were extracted! Check if the ZIP contains a subfolder.")
        return

    if len(extracted_files) == 1 and os.path.isdir(extracted_files[0]):
        extracted_files = glob.glob(os.path.join(extracted_files[0], "*"))

    csv_files = [f for f in extracted_files if f.endswith(".csv")]

    if not csv_files:
        print("Error: No CSV files found after extraction!")
        return

    latest_csv = max(csv_files, key=os.path.getctime)
    shutil.move(latest_csv, csv_path)

    print(f" {dataset_name}.csv successfully downloaded and saved.")

download_data("datasets", "college")
download_data("datasets", "exams")

college.csv is already downloaded.
exams.csv is already downloaded.


In [33]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [42]:
df = pd.read_csv("datasets/exams.csv")


In [43]:
# Check data
print(df.head())

   gender race/ethnicity parental level of education         lunch  \
0  female        group B           bachelor's degree      standard   
1  female        group C                some college      standard   
2  female        group B             master's degree      standard   
3    male        group A          associate's degree  free/reduced   
4    male        group C                some college      standard   

  test preparation course  math score  reading score  writing score  
0                    none          72             72             74  
1               completed          69             90             88  
2                    none          90             95             93  
3                    none          47             57             44  
4                    none          76             78             75  


In [44]:
print(df.columns)


Index(['gender', 'race/ethnicity', 'parental level of education', 'lunch',
       'test preparation course', 'math score', 'reading score',
       'writing score'],
      dtype='object')


In [46]:
X = df.drop(columns=['gender', 'race/ethnicity', 'parental level of education', 'lunch','test preparation course', 'reading score', 'writing score'])
y = df['math score']

In [47]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


NameError: name 'train_test_split' is not defined