In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import copy
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler


def load_data(file_name):
    """
    Loads the data from a student dataset file_name and converts it to a training set (x_train, y_train).
    The input x_train includes the features ["sex", "age", "Pstatus", "Mjob", "Fjob", "higher", "activities"],
    the output y_train contains the final grade G3.

    Parameters:
        file_name (string): path to a student dataset

    Returns:
        x_train (ndarray): Shape(m, 7), m - number of training examples (students) Input to the model
        y_train (ndarray): Shape(m,) Output of the model
    """
    # importing the dataset
    data = pd.read_csv(file_name)

    # Editing the raw dataset to get x_train and y_train
    data = data[["school", "sex", "age", "Mjob", "Fjob", "higher", "activities", "G3"]]

    # Turning categorical features into numbers
    # Dummy matrices + Label Encoding
    non_num = data.select_dtypes(include="object")
    encoder = LabelEncoder()
    for column in non_num.columns:
        if len(non_num[column].unique()) == 2:
            data[column] = encoder.fit_transform(data[column])
        
        else:
            non_num[column] = non_num[column].apply(lambda x: column[0].lower() + "_" + x)
            dummies = pd.get_dummies(non_num[column])
            dummies = dummies.drop([dummies.columns[-1]], axis = 1)
            data = pd.concat([data, dummies], axis=1)
            data = data.drop([column], axis=1)

    # Extracting x_train and y_train from the table
    x_train = data.drop(["G3"], axis=1)
    y_train = data["G3"]

    return x_train, y_train

In [27]:
def pd_to_np(x,y):
    """
    
    Converts a Dataframe to a Numpy array.
    
    Parameters:
        x (pandas dataframe): Training set as DataFrame
        y (pandas dataframe): Output set as DataFrame
    
    Returns:
        x (ndarray): Training set as Numpy array
        y (ndarray): Output set as Numpy array
    """
        
    x = x.to_numpy()
    y = y.to_numpy()
    
    x = x.astype('float64')
    y = y.astype('float64')
    
    return x, y

In [28]:
def normalize(x):
    """
    
    Performs feature scaling in the range [0,1] by division of each feature by its maximum value.
    
    Parameters:
        x (ndarray): Training set (features of students)
    
    Returns:
        x (ndarray): Training set exposed to feature scaling (input to the model)
    """
    
    x = x.astype('float64')
    for column in range(x.shape[1]):
        x[:,column] = x[:,column]/x[:,column].max()
        
    return x


In [29]:
ex = np.array([[1,2,3],
               [4,5,6],
               [7,8,9],
               [100,100,100]])
normalize(ex)

array([[0.01, 0.02, 0.03],
       [0.04, 0.05, 0.06],
       [0.07, 0.08, 0.09],
       [1.  , 1.  , 1.  ]], dtype=float32)

In [30]:
ex_norm = normalize(ex)
ex_norm[0][0].dtype

dtype('float32')

In [31]:
x_train, y_train = load_data("student-mat.csv")

type(x_train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  non_num[column] = non_num[column].apply(lambda x: column[0].lower() + "_" + x)


pandas.core.frame.DataFrame

In [32]:
x_train, y_train = pd_to_np(x_train, y_train)
x_train

  x.dtype = "f"


array([[0.0e+00, 0.0e+00, 2.5e-44, ..., 0.0e+00, 0.0e+00, 0.0e+00],
       [0.0e+00, 0.0e+00, 0.0e+00, ..., 0.0e+00, 0.0e+00, 0.0e+00],
       [0.0e+00, 0.0e+00, 2.4e-44, ..., 0.0e+00, 1.4e-45, 0.0e+00],
       ...,
       [0.0e+00, 0.0e+00, 0.0e+00, ..., 0.0e+00, 0.0e+00, 0.0e+00],
       [1.4e-45, 1.4e-45, 2.7e-44, ..., 0.0e+00, 0.0e+00, 0.0e+00],
       [0.0e+00, 0.0e+00, 0.0e+00, ..., 0.0e+00, 0.0e+00, 0.0e+00]],
      dtype=float32)

In [21]:
x_train.dtype

dtype('float32')

In [25]:
normalize(x_train)[:,2]

array([0.8181818 , 0.        , 0.77272725, 0.        , 0.6818182 ,
       0.        , 0.6818182 , 0.        , 0.72727275, 0.        ,
       0.72727275, 0.        , 0.72727275, 0.        , 0.77272725,
       0.        , 0.6818182 , 0.        , 0.6818182 , 0.        ,
       0.6818182 , 0.        , 0.6818182 , 0.        , 0.6818182 ,
       0.        , 0.6818182 , 0.        , 0.6818182 , 0.        ,
       0.72727275, 0.        , 0.72727275, 0.        , 0.72727275,
       0.        , 0.77272725, 0.        , 0.72727275, 0.        ,
       0.6818182 , 0.        , 0.6818182 , 0.        , 0.72727275,
       0.        , 0.72727275, 0.        , 0.6818182 , 0.        ,
       0.72727275, 0.        , 0.6818182 , 0.        , 0.6818182 ,
       0.        , 0.72727275, 0.        , 0.72727275, 0.        ,
       0.6818182 , 0.        , 0.6818182 , 0.        , 0.6818182 ,
       0.        , 0.6818182 , 0.        , 0.72727275, 0.        ,
       0.6818182 , 0.        , 0.6818182 , 0.        , 0.72727