# Data Science X Logistic Regression

This school project is an introduction to data cleaning, visualization and model training.

## Data retrieval

In [None]:
# First, download the archive containing the dataset that we will be working with.
!wget https://cdn.intra.42.fr/document/document/12379/datasets.tgz

# Then, extract it (-x: extract, -v: list each extracted file, -f: read from the given archive).
!tar -xvf datasets.tgz

# Finally, remove the downloaded archive now that its content has been extracted.
!rm datasets.tgz

## Data preprocessing

Preprocessing tasks:  
-  remove unnecessary data
-  fill in missing numerical values with the median of their respective columns
-  fill in missing categorical values with the most frequent value of their respective columns
-  move the target values into a separate array

In [None]:
import numpy as np  # Importing the numpy library, which provides support for working with arrays and matrices.

# Preprocess the training set:
#   1. drop the identification columns that carry no signal,
#   2. fill missing "Best Hand" values with the most frequent one and encode the column numerically,
#   3. move the "Hogwarts House" target into its own array (houses_array),
#   4. convert the remaining features to floats and fill missing values with the column median.
# Results: houses_array (targets, one row per student) and data_float (features).

# Single source of truth for the dataset location, so the header and the data are always read from the same file.
dataset_path = './datasets/dataset_train.csv'

# Load the header of the CSV file.
header = np.genfromtxt(dataset_path, delimiter=',', dtype=str, max_rows=1)  # Reads only the first row (column names).

# Define dictionaries for mapping values.
hand_mapping = {'Left': '0', 'Right': '1'}  # Converts "Best Hand" values to numerical (string-encoded) values.
house_mapping = {'Ravenclaw': '0', 'Slytherin': '1', 'Gryffindor': '2', 'Hufflepuff': '3'}  # Converts "Hogwarts House" values to numerical (string-encoded) values.

# Determine indices of columns to be dropped.
columns_to_drop = ['Index', 'First Name', 'Last Name', 'Birthday']  # List of column names to be dropped.
indices_to_drop = [np.where(header == col_name)[0][0] for col_name in columns_to_drop]  # Position of each dropped column in the original header.

# Load the CSV without the header and drop the specified columns.
data = np.genfromtxt(dataset_path, delimiter=',', dtype=str, skip_header=1)  # Reads the CSV file, skipping the header.
data_without_dropped_cols = np.delete(data, indices_to_drop, axis=1)  # Deletes the specified columns.

# Dropping columns shifts the position of the remaining ones, so every later
# lookup must be done against a header that went through the same deletion.
header_without_dropped_cols = np.delete(header, indices_to_drop)

# Fill "Best Hand" missing values by the most frequent best hand value.
best_hand_index = np.where(header_without_dropped_cols == 'Best Hand')[0][0]  # Position of "Best Hand" after the drop.
missing_hand = data_without_dropped_cols[:, best_hand_index] == ''  # Rows where the value is missing.
# Empty strings are excluded from the count so that "missing" can never be elected as the most frequent value.
hand_values, counts = np.unique(data_without_dropped_cols[~missing_hand, best_hand_index], return_counts=True)
most_frequent_value = hand_values[np.argmax(counts)]
data_without_dropped_cols[missing_hand, best_hand_index] = most_frequent_value

# Convert "Best Hand" values into numerical values (unknown labels are left untouched, as before).
data_without_dropped_cols[:, best_hand_index] = [
    hand_mapping.get(hand, hand) for hand in data_without_dropped_cols[:, best_hand_index]
]

# Extract Hogwarts house values and convert them to numerical values.
house_index = np.where(header_without_dropped_cols == 'Hogwarts House')[0][0]  # Position of "Hogwarts House" after the drop.
# A missing or unknown house raises KeyError on purpose: a training row without a target cannot be used.
houses_array = np.array([[house_mapping[house]] for house in data_without_dropped_cols[:, house_index]])  # 2D array (one column) of encoded houses.

# Remove the Hogwarts house column from the main dataset.
data_without_houses = np.delete(data_without_dropped_cols, house_index, axis=1)  # Delete the "Hogwarts House" column.
feature_names = np.delete(header_without_dropped_cols, house_index)  # Names of the columns kept in data_float, in order.

# Convert the data into floats and handle missing values.
data_without_houses[data_without_houses == ''] = 'nan'  # The array holds strings: 'nan' is parsed as np.nan by the float conversion below.
data_float = data_without_houses.astype(np.float64)  # Convert the data to float type.
for col in range(data_float.shape[1]):  # Loop through each column.
    median = np.nanmedian(data_float[:, col])  # Calculate the median of the column, ignoring nan values.
    nan_indices = np.isnan(data_float[:, col])  # Boolean mask of the nan values in the column.
    data_float[nan_indices, col] = median  # Replace nan values with the median.

# Sanity check: features and targets must describe the same students.
assert houses_array.shape[0] == data_float.shape[0], "features and targets have a different number of rows"

