In [1]:
#importing library
# Install required libraries (if needed)
!pip install pandas numpy scikit-learn

# Import libraries
import pandas as pd
import numpy as np
from sklearn.utils import shuffle




In [2]:
def divide_and_shuffle_dataset(file_path, dependent_var, independent_vars, ratios=(0.7, 0.2, 0.1), random_state=42):
    """
    Shuffle a tabular dataset and split it into training, validation, and testing sets.

    Parameters:
        file_path (str or path-like): Path to the dataset file. Supported
            extensions are .csv, .xls and .xlsx (matched case-insensitively).
        dependent_var (str): The name of the dependent (target) column.
        independent_vars (list): Names of the independent (feature) columns.
            Any sequence of names is accepted; a single name may be given as a
            plain string.
        ratios (tuple): Three non-negative fractions (train, validation, test)
            that sum to 1.
        random_state (int): Seed forwarded to the shuffle so the split is
            reproducible. Defaults to 42.

    Returns:
        dict: Keys 'train', 'val' and 'test', each mapping to an ``(X, y)``
            tuple of NumPy arrays. The train and validation sizes are
            ``floor(n * ratio)``; the test split receives every remaining row,
            so it absorbs any rounding remainder.

    Raises:
        ValueError: If ``ratios`` is malformed, the file extension is not
            supported, or a requested column is missing from the dataset.
    """
    # Validate the ratios up front, before any file I/O is attempted.
    if len(ratios) != 3:
        raise ValueError("ratios must contain exactly three values: (train, val, test).")
    if any(ratio < 0 for ratio in ratios):
        raise ValueError("ratios must be non-negative.")
    if abs(sum(ratios) - 1.0) > 1e-6:
        raise ValueError("ratios must sum to 1.")

    # Load the dataset; the extension check ignores case so 'DATA.CSV' works too.
    path_lower = str(file_path).lower()
    if path_lower.endswith('.csv'):
        data = pd.read_csv(file_path)
    elif path_lower.endswith(('.xls', '.xlsx')):
        data = pd.read_excel(file_path)
    else:
        raise ValueError("Unsupported file format. Please use .csv or .xls/.xlsx files.")

    # Normalise the feature names to a list so tuples and single strings work.
    if isinstance(independent_vars, str):
        independent_vars = [independent_vars]
    feature_cols = list(independent_vars)

    # Select only the required columns, naming any that are absent.
    columns = feature_cols + [dependent_var]
    missing = [col for col in columns if col not in data.columns]
    if missing:
        raise ValueError(
            f"One or more specified columns are not found in the dataset: {missing}"
        )

    data = data[columns]

    # Shuffle the rows reproducibly and discard the old index.
    data = shuffle(data, random_state=random_state).reset_index(drop=True)

    # Split into features (X) and labels (y)
    X = data[feature_cols].values
    y = data[dependent_var].values

    # Compute split indices. The tiny epsilon guards against float error such
    # as 100 * 0.29 == 28.999999999999996, which int() would truncate to 28.
    total_size = len(data)
    train_end = int(total_size * ratios[0] + 1e-9)
    val_end = train_end + int(total_size * ratios[1] + 1e-9)

    # Split the data; the test set takes whatever rows remain.
    X_train, y_train = X[:train_end], y[:train_end]
    X_val, y_val = X[train_end:val_end], y[train_end:val_end]
    X_test, y_test = X[val_end:], y[val_end:]

    return {
        'train': (X_train, y_train),
        'val': (X_val, y_val),
        'test': (X_test, y_test)
    }


In [3]:
# Column names used for the split: six predictor columns and one target column.
independent_vars = [
    'Urban Population', 'Rural Population',
    'Fatalities involving high blood alcohol',
    'Licensed drivers (thousands)', 'Registered vehicles (thousands)',
    'Vehicle-miles traveled (millions)',
]
dependent_var = 'Traffic fatalities'  # target column


In [4]:
# Split the accidents dataset 70/20/10 and report how many rows each split holds.
file_path = '/content/Accidents_with_Label.csv - Accidents_with_Label.csv.csv'  # Update with your file's name

results = divide_and_shuffle_dataset(
    file_path,
    dependent_var,
    independent_vars,
    ratios=(0.7, 0.2, 0.1),
)

# Each entry is an (X, y) pair; show (n_feature_rows, n_labels) per split.
print({
    split_name: (len(features), len(labels))
    for split_name, (features, labels) in results.items()
})



{'train': (35, 35), 'val': (10, 10), 'test': (6, 6)}


In [5]:
# Peek at the first five rows and labels of the training and testing splits.
(X_train, y_train), (X_test, y_test) = results['train'], results['test']

# Training split
print("Training Features:\n", X_train[:5])
print("Training Labels:\n", y_train[:5])

# Testing split
print("Testing Features:\n", X_test[:5])
print("Testing Labels:\n", y_test[:5])


Training Features:
 [[7.4195240e+06 2.5189200e+06 3.6700000e+02 7.1034040e+03 8.6274770e+03
  1.0332600e+05]
 [4.8470750e+06 3.2022380e+06 4.9600000e+02 6.1221370e+03 6.2988360e+03
  9.5903000e+04]
 [1.0910332e+07 1.5089610e+06 5.1700000e+02 8.0576830e+03 9.5076630e+03
  1.0913500e+05]
 [8.5910400e+05 4.3484900e+05 8.1000000e+01 9.4298300e+02 1.3937020e+03
  1.4729000e+04]
 [2.2545660e+06 1.1960880e+06 2.4500000e+02 2.3696210e+03 3.2356400e+03
  4.6443000e+04]]
Training Labels:
 [1159 1557 1356  260  774]
Testing Features:
 [[1.388560e+06 1.456098e+06 3.170000e+02 1.896008e+03 1.991650e+03
  3.943100e+04]
 [4.874650e+05 4.147300e+05 1.000000e+02 7.128800e+02 1.056668e+03
  1.120700e+04]
 [7.938850e+06 4.755000e+05 2.270000e+02 5.799532e+03 6.374167e+03
  7.284400e+04]
 [8.333780e+05 9.749660e+05 1.140000e+02 1.292036e+03 1.415954e+03
  2.030200e+04]
 [2.465539e+06 1.981561e+06 3.940000e+02 3.613138e+03 4.588837e+03
  5.903500e+04]]
Testing Labels:
 [ 900  229  731  411 1154]
