In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import copy
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler


def load_data(file_name):
    """
    Loads the data from a student dataset file_name and converts it to a training set (x_train, y_train).
    The input x_train includes the features ["sex", "age", "Pstatus", "Mjob", "Fjob", "higher", "activities"],
    the output y_train contains the final grade G3.

    Parameters:
        file_name (string): path to a student dataset

    Returns:
        x_train (ndarray): Shape(m, 7), m - number of training examples (students) Input to the model
        y_train (ndarray): Shape(m,) Output of the model
    """
    # importing the dataset
    data = pd.read_csv(file_name)

    # Editing the raw dataset to get x_train and y_train
    data = data[["school", "sex", "age", "Mjob", "Fjob", "higher", "activities", "G3"]]

    # Turning categorical features into numbers
    # Dummy matrices + Label Encoding
    non_num = data.select_dtypes(include="object")
    encoder = LabelEncoder()
    for column in non_num.columns:
        if len(non_num[column].unique()) == 2:
            data[column] = encoder.fit_transform(data[column])
        
        else:
            non_num[column] = non_num[column].apply(lambda x: column[0].lower() + "_" + x)
            dummies = pd.get_dummies(non_num[column])
            dummies = dummies.drop([dummies.columns[-1]], axis = 1)
            data = pd.concat([data, dummies], axis=1)
            data = data.drop([column], axis=1)

    # Extracting x_train and y_train from the table
    x_train = data.drop(["G3"], axis=1)
    y_train = data["G3"]

    return x_train, y_train

In [2]:
    # Normalizing the data
    scaler = StandardScaler()
    x_train = pd.DataFrame(scaler.fit_transform(x_train), columns=x_train.columns)

    x_train = x_train.to_numpy()
    y_train = y_train.to_numpy()

NameError: name 'x_train' is not defined

In [17]:
def pd_to_np(x,y):
    x = x.to_numpy()
    y = y.to_numpy()
    
    x.dtype = "f"
    y.dtype = "f"
    
    return x, y
    

    
def normalize(x):
    for column in range(x.shape[1]):
        x[:,column] = x[:,column]/x[:,column].max()
    return x


In [30]:
x_train, y_train = load_data("student-mat.csv")

type(x_train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  non_num[column] = non_num[column].apply(lambda x: column[0].lower() + "_" + x)


pandas.core.frame.DataFrame

In [19]:
x_train, y_train = pd_to_np(x_train,y_train)
x_train.dtype
y_train.dtype
    

  x.dtype = "f"


dtype('float32')

In [20]:
x_train = normalize(x_train)
x_train

array([[0.        , 0.        , 0.8181818 , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.77272725, ..., 0.        , 1.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [1.        , 1.        , 0.8636364 , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]], dtype=float32)

In [21]:
type(x_train)

numpy.ndarray

In [22]:
normalize(x_train)


array([[0.        , 0.        , 0.8181818 , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.77272725, ..., 0.        , 1.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [1.        , 1.        , 0.8636364 , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]], dtype=float32)

In [23]:
example = np.array([[1,2.234,3],
                   [4.35645,5,6],
                   [7.4535,8,9],
                   [10,10,10]])
normalize(example)


array([[0.1     , 0.2234  , 0.3     ],
       [0.435645, 0.5     , 0.6     ],
       [0.74535 , 0.8     , 0.9     ],
       [1.      , 1.      , 1.      ]])

In [24]:
1/10

0.1

In [25]:
example[0][0]

0.1

In [26]:
ex = np.array([[1,2,3],
               [4,5,6],
               [7,8,9],
               [100,100,100]], dtype='f')
normalize(ex)

array([[0.01, 0.02, 0.03],
       [0.04, 0.05, 0.06],
       [0.07, 0.08, 0.09],
       [1.  , 1.  , 1.  ]], dtype=float32)

In [27]:
normalize(ex).dtype

dtype('float32')