# Data Preprocessing

In [1]:
# import libraries
import torch
import os
import time
import timeit
import numpy as np
from os import path
import pandas as pd

In [2]:
# Creaing a folder
os.makedirs(os.path.join('data'), exist_ok=True)
# Creating (Writing inside) a CSV file 
data_file = os.path.join('data', 'houses.csv')
with open(data_file, 'w') as f:
     f.write('''Country,Rooms,Price,Size
USA,NA,12500,200
NA,3,16000,150
France,4,17800,170
NA,NA,14000,90''')

In [3]:
# checking  if the CSV file exists 
data_file = os.path.join('data', 'houses.csv')
if path.exists(data_file):
    print("True")

True


In [4]:
# Reading a CSV file 
data_file = os.path.join('data', 'houses.csv')
if path.exists(data_file):
    data = pd.read_csv(data_file)
    print(data)

  Country  Rooms  Price  Size
0     USA    NaN  12500   200
1     NaN    3.0  16000   150
2  France    4.0  17800   170
3     NaN    NaN  14000    90


In [5]:
# Data Frame indexing and slicing
# Indexing specific rows and columns
print(data.loc[[1,3],['Country', 'Price']])

  Country  Price
1     NaN  16000
3     NaN  14000


In [6]:
# Column indexing
data.loc[:,['Price']]

Unnamed: 0,Price
0,12500
1,16000
2,17800
3,14000


In [7]:
# Row indexing
data.loc[0]

Country      USA
Rooms        NaN
Price      12500
Size         200
Name: 0, dtype: object

In [8]:
# More about indexing
data.Country[0], data.Price[0]

('USA', 12500)

In [9]:
# Slicing rows and columns
print(data.loc[0:2,['Country', 'Price']])

  Country  Price
0     USA  12500
1     NaN  16000
2  France  17800


In [10]:
# Data Frame Type
data.dtypes

Country     object
Rooms      float64
Price        int64
Size         int64
dtype: object

In [11]:
# Get column type
data['Price'].dtype

dtype('int64')

# Data Analysis

In [12]:
# Describing Data Frame 
data.describe()

Unnamed: 0,Rooms,Price,Size
count,2.0,4.0,4.0
mean,3.5,15075.0,152.5
std,0.707107,2314.267343,46.457866
min,3.0,12500.0,90.0
25%,3.25,13625.0,135.0
50%,3.5,15000.0,160.0
75%,3.75,16450.0,177.5
max,4.0,17800.0,200.0


In [13]:
# Get max value of each column
data.describe().max()

Rooms        4.0
Price    17800.0
Size       200.0
dtype: float64

##### Memory saving
As we can see, the maximum value is 17800.0, which is less than the maximum value of uint16 (65535), so all we need in terms of memory is of type uint16

In [14]:
# Lets compute the amounts of used memory
data.info(memory_usage='deep'), data.memory_usage(deep=True).sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Country  2 non-null      object 
 1   Rooms    2 non-null      float64
 2   Price    4 non-null      int64  
 3   Size     4 non-null      int64  
dtypes: float64(1), int64(2), object(1)
memory usage: 411.0 bytes


(None, 411)

In [15]:
# Check if Price column is numeric
np.issubdtype(data['Price'].dtype, np.number)

True

In [16]:
# Get the name of each column
data.columns

Index(['Country', 'Rooms', 'Price', 'Size'], dtype='object')

In [17]:
# Convert pandas indexes to python list
data.columns.tolist()

['Country', 'Rooms', 'Price', 'Size']

In [18]:
# Before converting Data Frame Type to unit16
data.dtypes

Country     object
Rooms      float64
Price        int64
Size         int64
dtype: object

In [19]:
# Building a dictionary comprehension
start_time = time.time()
temp_cols_uint16 = {col: np.uint16 if np.issubdtype(data[col].dtype, np.number) else data[col].dtype for col in data.columns}
print(temp_cols_uint16)
print("Processing time: ", time.time()-start_time)

{'Country': dtype('O'), 'Rooms': <class 'numpy.uint16'>, 'Price': <class 'numpy.uint16'>, 'Size': <class 'numpy.uint16'>}
Processing time:  0.0009999275207519531


In [20]:
# Vectorize the check of Data Frame columns
start_time = time.time()
is_number = np.vectorize(lambda x: np.issubdtype(x, np.number) )
print(is_number(data.dtypes))
print("Processing time: ", time.time()-start_time)

[False  True  True  True]
Processing time:  0.002000093460083008


In [21]:
# Convert country column into string
# In Pandas string is represented as a Python Object
data['Country'] = data['Country'] .astype(str)
data.dtypes

Country     object
Rooms      float64
Price        int64
Size         int64
dtype: object

In [22]:
#start_time = time.time()
#is_number = np.vectorize(lambda x, y: {np.uint16, x} if np.issubdtype(y, np.number) else {})
#print(is_number(data.columns, data.dtypes))
#print("Processing time: ", time.time()-start_time)

In [23]:
# Replace Nan with 0
tmp_data = data.fillna(0)
tmp_data

Unnamed: 0,Country,Rooms,Price,Size
0,USA,0.0,12500,200
1,,3.0,16000,150
2,France,4.0,17800,170
3,,0.0,14000,90


In [24]:
# After converting numerical columns into uint16
data = tmp_data.astype(temp_cols_uint16)
data.dtypes  

Country    object
Rooms      uint16
Price      uint16
Size       uint16
dtype: object

In [25]:
# To get the mostly-accurate memory usage after convertion to uint16
data.info(memory_usage='deep'), data.memory_usage(deep=True).sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Country  4 non-null      object
 1   Rooms    4 non-null      uint16
 2   Price    4 non-null      uint16
 3   Size     4 non-null      uint16
dtypes: object(1), uint16(3)
memory usage: 395.0 bytes


(None, 395)

#### By converting only Data Frame columns to uint16, we have optimized the memory by 3.89 %

In [26]:
# Changing the position of a column (method 1)
temp_cols = data.columns.tolist()
temp_cols =  temp_cols[:-2] + temp_cols[-1:]  + temp_cols[-2:-1] 
temp_cols

['Country', 'Rooms', 'Size', 'Price']

In [27]:
# Changing the position of a column (method 2)
#col = data.pop('Size')
#data = data.insert(3, 'Size', col)
#data

In [28]:
# The new Data Frame
data = data[temp_cols]
data

Unnamed: 0,Country,Rooms,Size,Price
0,USA,0,200,12500
1,,3,150,16000
2,France,4,170,17800
3,,0,90,14000


# Data Preparation

In [29]:
# we use iloc to select columns
inputs, targets = data.iloc[:, 0:3], data.iloc[:, 3]
print(inputs,"\n")
print(targets)

  Country  Rooms  Size
0     USA      0   200
1     nan      3   150
2  France      4   170
3     nan      0    90 

0    12500
1    16000
2    17800
3    14000
Name: Price, dtype: uint16


In [30]:
# transform pandas tables into numpy matrices
inputs.values, targets.values

(array([['USA', 0, 200],
        ['nan', 3, 150],
        ['France', 4, 170],
        ['nan', 0, 90]], dtype=object),
 array([12500, 16000, 17800, 14000], dtype=uint16))

In [31]:
# dealing with missing data
# in this example, we will apply imputation heuristics. For categorical input fields, we can treat NaN as a category.
# dummy_na=True shows the column of the Nan
inputs = pd.get_dummies(inputs, dummy_na=True) 
inputs

Unnamed: 0,Rooms,Size,Country_France,Country_USA,Country_nan,Country_nan.1
0,0,200,0,1,0,0
1,3,150,0,0,1,0
2,4,170,1,0,0,0
3,0,90,0,0,1,0


In [32]:
# dealing with missing data
# in this example, we replace the NaN entries with the mean value of the corresponding column.
# we use fillna to fill the Nan entry
inputs =  inputs.fillna(inputs.mean())
inputs

Unnamed: 0,Rooms,Size,Country_France,Country_USA,Country_nan,Country_nan.1
0,0,200,0,1,0,0
1,3,150,0,0,1,0
2,4,170,1,0,0,0
3,0,90,0,0,1,0


# Conversion to the Tensor Format

In [36]:
# Data frame type
data.dtypes

Country    object
Rooms      uint16
Size       uint16
Price      uint16
dtype: object

Pytorch, for now, does not support unsigned 16-bit integer.
The only supported types are: float64, float32, float16, complex64, complex128, int64, int32, int16, int8, uint8, and bool.

In [34]:
# Converting data frame to signed 32-bit integer (int32)
inputs, targets = inputs.astype(np.int32), targets.astype(np.int32)
inputs.dtypes, targets.dtypes

(Rooms             int32
 Size              int32
 Country_France    int32
 Country_USA       int32
 Country_nan       int32
 Country_nan       int32
 dtype: object,
 dtype('int32'))

In [35]:
# Convertng data frame to tensor
X, y = torch.tensor(inputs.values), torch.tensor(targets.values)
X, y

(tensor([[  0, 200,   0,   1,   0,   0],
         [  3, 150,   0,   0,   1,   0],
         [  4, 170,   1,   0,   0,   0],
         [  0,  90,   0,   0,   1,   0]], dtype=torch.int32),
 tensor([12500, 16000, 17800, 14000], dtype=torch.int32))