In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import copy
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler


def load_data(file_name):
    """
    Loads the data from a student dataset file_name and converts it to a training set (x_train, y_train).
    The input x_train includes the features ["sex", "age", "Pstatus", "Mjob", "Fjob", "higher", "activities"],
    the output y_train contains the final grade G3.

    Parameters:
        file_name (string): path to a student dataset

    Returns:
        x_train (ndarray): Shape(m, 7), m - number of training examples (students) Input to the model
        y_train (ndarray): Shape(m,) Output of the model
    """
    # importing the dataset
    data = pd.read_csv(file_name)

    # Editing the raw dataset to get x_train and y_train
    data = data[["school", "sex", "age", "Mjob", "Fjob", "higher", "activities", "G3"]]

    # Turning categorical features into numbers
    # Dummy matrices + Label Encoding
    non_num = data.select_dtypes(include="object")
    encoder = LabelEncoder()
    for column in non_num.columns:
        if len(non_num[column].unique()) == 2:
            data[column] = encoder.fit_transform(data[column])
        
        else:
            non_num[column] = non_num[column].apply(lambda x: column[0].lower() + "_" + x)
            dummies = pd.get_dummies(non_num[column])
            dummies = dummies.drop([dummies.columns[-1]], axis = 1)
            data = pd.concat([data, dummies], axis=1)
            data = data.drop([column], axis=1)

    # Extracting x_train and y_train from the table
    x_train = data.drop(["G3"], axis=1)
    y_train = data["G3"]

    return x_train, y_train

In [2]:
def pd_to_np(x,y):
    """
    
    Converts a Dataframe to a Numpy array.
    
    Parameters:
        x (pandas dataframe): Training set as DataFrame
        y (pandas dataframe): Output set as DataFrame
    
    Returns:
        x (ndarray): Training set as Numpy array
        y (ndarray): Output set as Numpy array
    """
        
    x = x.to_numpy()
    y = y.to_numpy()
    
    x = x.astype('float64')
    y = y.astype('float64')
    
    return x, y

In [10]:
def normalize(x):
    """
    
    Performs feature scaling in the range [0,1] by division of each feature by its maximum value.
    
    Parameters:
        x (ndarray): Training set (features of students)
    
    Returns:
        x (ndarray): Training set exposed to feature scaling (input to the model)
    """
    
    x = x.astype('float64')
    for column in range(x.shape[1]):
        x[:,column] = x[:,column]/x[:,column].max()
        
    return x

In [11]:
x_train, y_train = load_data("student-mat.csv")
x_train

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  non_num[column] = non_num[column].apply(lambda x: column[0].lower() + "_" + x)


Unnamed: 0,school,sex,age,higher,activities,m_at_home,m_health,m_other,m_services,f_at_home,f_health,f_other,f_services
0,0,0,18,1,0,1,0,0,0,0,0,0,0
1,0,0,17,1,0,1,0,0,0,0,0,1,0
2,0,0,15,1,0,1,0,0,0,0,0,1,0
3,0,0,15,1,1,0,1,0,0,0,0,0,1
4,0,0,16,1,0,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,1,1,20,1,0,0,0,0,1,0,0,0,1
391,1,1,17,1,0,0,0,0,1,0,0,0,1
392,1,1,21,1,0,0,0,1,0,0,0,1,0
393,1,1,18,1,0,0,0,0,1,0,0,1,0


In [12]:
x, y = pd_to_np(x_train, y_train)
x

array([[ 0.,  0., 18., ...,  0.,  0.,  0.],
       [ 0.,  0., 17., ...,  0.,  1.,  0.],
       [ 0.,  0., 15., ...,  0.,  1.,  0.],
       ...,
       [ 1.,  1., 21., ...,  0.,  1.,  0.],
       [ 1.,  1., 18., ...,  0.,  1.,  0.],
       [ 1.,  1., 19., ...,  0.,  0.,  0.]])

In [13]:
x.shape

(395, 13)

In [14]:
x.shape

(395, 13)

In [16]:
normalize(x).shape
normalize(x)

array([[0.        , 0.        , 0.81818182, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.77272727, ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.68181818, ..., 0.        , 1.        ,
        0.        ],
       ...,
       [1.        , 1.        , 0.95454545, ..., 0.        , 1.        ,
        0.        ],
       [1.        , 1.        , 0.81818182, ..., 0.        , 1.        ,
        0.        ],
       [1.        , 1.        , 0.86363636, ..., 0.        , 0.        ,
        0.        ]])

In [23]:
example = np.array([[1,2.234,3],
                   [4.35645,5,6],
                   [7.4535,8,9],
                   [10,10,10]])
normalize(example)


array([[0.1     , 0.2234  , 0.3     ],
       [0.435645, 0.5     , 0.6     ],
       [0.74535 , 0.8     , 0.9     ],
       [1.      , 1.      , 1.      ]])

In [73]:
technologies = {
    'Courses':[1,2,3,4],
    'Fee' :[20000,25000,22000,30000],
    'Duration':[1,2,3,4],
    'Discount':[1000,2300,1200,2000]
              }
index_labels=['r1','r2','r3','r4']
df = pd.DataFrame(technologies,index=index_labels)
type(df)

pandas.core.frame.DataFrame

In [87]:
df

Unnamed: 0,Courses,Fee,Duration,Discount
r1,1,20000,1,1000
r2,2,25000,2,2300
r3,3,22000,3,1200
r4,4,30000,4,2000


In [88]:
df.to_numpy()

array([[    1, 20000,     1,  1000],
       [    2, 25000,     2,  2300],
       [    3, 22000,     3,  1200],
       [    4, 30000,     4,  2000]], dtype=int64)

In [89]:
pd_to_np(df)

array([[1.0e+00, 2.0e+04, 1.0e+00, 1.0e+03],
       [2.0e+00, 2.5e+04, 2.0e+00, 2.3e+03],
       [3.0e+00, 2.2e+04, 3.0e+00, 1.2e+03],
       [4.0e+00, 3.0e+04, 4.0e+00, 2.0e+03]])

In [90]:
pd_to_np(df).shape

(4, 4)

In [91]:
df.astype('float64')

Unnamed: 0,Courses,Fee,Duration,Discount
r1,1.0,20000.0,1.0,1000.0
r2,2.0,25000.0,2.0,2300.0
r3,3.0,22000.0,3.0,1200.0
r4,4.0,30000.0,4.0,2000.0
