# 15.0 Pandas

In [44]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

In [45]:
# Parse the measurements CSV (path is relative to the current working directory).
df = pd.read_csv(filepath_or_buffer='15.0-pandas-data.csv')
# Preview the first five rows to sanity-check the parsed columns.
df.head(n=5)

Unnamed: 0,t,timedate,Ph,YTi,VAHU,TAHU,Ta,Ps,Vw,YTi_FF
0,1,2023-07-01 00:00:00,0.017944,19.495,0.0,24.059999,17.7,1,0.0,19.5525
1,2,2023-07-01 00:10:00,0.012085,19.415,0.0,24.059999,17.7,1,0.0,19.6425
2,3,2023-07-01 00:20:00,0.017944,19.375,0.0,24.059999,17.8,1,0.0,19.6325
3,4,2023-07-01 00:30:00,0.012085,19.395,0.0,24.059999,17.8,1,0.0,19.325
4,5,2023-07-01 00:40:00,0.02417,19.505,0.0,24.12,17.9,1,0.0,19.44


In [46]:
# Print a structural summary of the frame: index range, per-column non-null
# counts, dtypes and memory usage. Useful here to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13248 entries, 0 to 13247
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   t         13248 non-null  int64  
 1   timedate  13248 non-null  object 
 2   Ph        13247 non-null  float64
 3   YTi       13247 non-null  float64
 4   VAHU      13247 non-null  float64
 5   TAHU      13247 non-null  float64
 6   Ta        13248 non-null  float64
 7   Ps        13248 non-null  int64  
 8   Vw        13248 non-null  float64
 9   YTi_FF    13247 non-null  float64
dtypes: float64(7), int64(2), object(1)
memory usage: 1.0+ MB


## Drop Unused Columns

In [47]:
# Remove the columns that the rest of this notebook does not use.
# `errors='ignore'` keeps the cell idempotent: once the columns are gone, running
# the cell again is a no-op instead of raising KeyError. Note that it also
# silences misspelled column names, so double-check the list when editing it.
df = df.drop(columns=['t', 'timedate', 'Ph', 'Vw'], errors='ignore')

# Confirm which columns remain, together with their non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13248 entries, 0 to 13247
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   YTi     13247 non-null  float64
 1   VAHU    13247 non-null  float64
 2   TAHU    13247 non-null  float64
 3   Ta      13248 non-null  float64
 4   Ps      13248 non-null  int64  
 5   YTi_FF  13247 non-null  float64
dtypes: float64(5), int64(1)
memory usage: 621.1 KB


## Check for missing values

In [48]:
# Count the missing (NaN) entries in each column.
df.isna().sum()

YTi       1
VAHU      1
TAHU      1
Ta        0
Ps        0
YTi_FF    1
dtype: int64

## Drop the missing values

In [49]:
# Discard every row that contains at least one missing value.
# The index is not reset, so the surviving rows keep their original labels.
df = df.dropna(axis=0, how='any')

## Check for missing values (again)

In [50]:
# Re-count missing entries per column; every count should now be zero.
df.isna().sum()

YTi       0
VAHU      0
TAHU      0
Ta        0
Ps        0
YTi_FF    0
dtype: int64

## Check for duplicated rows

In [51]:
# Count rows that are exact repeats of an earlier row (the first occurrence
# of each row is not counted as a duplicate).
df.duplicated(keep='first').sum()

np.int64(0)

## Get X Columns (pandas) Dataset

In [52]:
# Build the feature table: all rows, restricted to the five input columns.
X = df.loc[:, ['VAHU', 'TAHU', 'Ta', 'Ps', 'YTi_FF']]
X

Unnamed: 0,VAHU,TAHU,Ta,Ps,YTi_FF
0,0.0,24.059999,17.7,1,19.552500
1,0.0,24.059999,17.7,1,19.642500
2,0.0,24.059999,17.8,1,19.632500
3,0.0,24.059999,17.8,1,19.325000
4,0.0,24.120000,17.9,1,19.440000
...,...,...,...,...,...
13243,0.0,22.809999,15.9,1,21.175000
13244,0.0,22.809999,15.9,1,21.252500
13245,0.0,22.809999,16.0,2,21.260000
13246,0.0,22.809999,16.1,1,21.145000


## Get X Columns (pandas) Dataset as a (numpy) 2d Matrix

In [53]:
# Convert the feature table into a 2-D NumPy array (rows = samples, columns = features).
# np.asarray is used instead of `.values` so the cell can be re-run safely:
# if X is already an ndarray it is returned unchanged, whereas `X.values`
# would raise AttributeError on the second execution.
X = np.asarray(X)
X

array([[ 0.        , 24.05999947, 17.7       ,  1.        , 19.55249977],
       [ 0.        , 24.05999947, 17.7       ,  1.        , 19.64249992],
       [ 0.        , 24.05999947, 17.8       ,  1.        , 19.63249969],
       ...,
       [ 0.        , 22.80999947, 16.        ,  2.        , 21.26000023],
       [ 0.        , 22.80999947, 16.1       ,  1.        , 21.1449995 ],
       [ 0.        , 22.80999947, 16.2       ,  1.        , 21.17999935]])

## Get y Column (pandas) Dataset

In [54]:
# Pull out the target column as a pandas Series (all rows of 'YTi').
y = df.loc[:, 'YTi']
y

0        19.4950
1        19.4150
2        19.3750
3        19.3950
4        19.5050
          ...   
13243    20.7875
13244    20.7200
13245    20.6975
13246    20.7175
13247    20.7100
Name: YTi, Length: 13247, dtype: float64

## Get y Column (pandas) Dataset as a (numpy) 1d Array

In [55]:
# Convert the target into a NumPy array.
# np.asarray is used instead of `.values` so the cell can be re-run safely:
# an ndarray is passed through unchanged, whereas `y.values` would raise
# AttributeError once y is no longer a pandas object.
y = np.asarray(y)
y

array([19.49499989, 19.41499996, 19.375     , ..., 20.69750023,
       20.71749973, 20.71000004])

## Scale y (first attempt — fails because MinMaxScaler expects a 2d array)

In [56]:
# Demonstration: scaling a 1-D array fails, because MinMaxScaler expects a
# 2-D array. The error is caught and printed so that the demonstration is
# still visible but "Restart Kernel -> Run All" is not aborted by this cell.
scY = MinMaxScaler()
try:
    # If fit_transform raises, the assignment never happens and y stays unchanged.
    y = scY.fit_transform(y)
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: Expected 2D array, got 1D array instead:
array=[19.49499989 19.41499996 19.375      ... 20.69750023 20.71749973
 20.71000004].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

## Reshape y (numpy) 1d Array into a (numpy) 2d Matrix of 1 Column

In [57]:
# Turn y into a single-column 2-D array; -1 lets NumPy infer the number of rows.
y = np.reshape(y, (-1, 1))
y

array([[19.49499989],
       [19.41499996],
       [19.375     ],
       ...,
       [20.69750023],
       [20.71749973],
       [20.71000004]])

## Scale y (again)

In [58]:
# Fit a fresh min-max scaler on the target column and replace y with its scaled version.
# scY is kept so the fitted scaler remains available after this cell.
scY = MinMaxScaler()
y = scY.fit(y).transform(y)
y

array([[0.41056421],
       [0.40288115],
       [0.39903961],
       ...,
       [0.52605045],
       [0.52797117],
       [0.52725091]])

## Reshape examples

### 1d array

In [59]:
# Flatten y into a 1-D array; -1 lets NumPy infer the length.
y = y.reshape(-1)
y

array([0.41056421, 0.40288115, 0.39903961, ..., 0.52605045, 0.52797117,
       0.52725091])

### 2d array (1 row, many columns)

In [60]:
# Reshape y into a 2-D array with exactly one row; the column count is inferred.
y = np.reshape(y, (1, -1))
y

array([[0.41056421, 0.40288115, 0.39903961, ..., 0.52605045, 0.52797117,
        0.52725091]])

### 2d array (many rows, 1 column)

In [61]:
# Reshape y into a 2-D array with exactly one column; the row count is inferred.
y = np.reshape(y, (-1, 1))
y

array([[0.41056421],
       [0.40288115],
       [0.39903961],
       ...,
       [0.52605045],
       [0.52797117],
       [0.52725091]])