In [73]:
import imageio
import torch
import os
import pandas as pd
import numpy as np

In [8]:
%%HTML
<h3> Importing Image or video data </h3>

In [2]:
#imageio imports image into numpy arrays
img_arr = imageio.imread('./data/bobby.jpg')
img_arr.shape #height, width, RBG chanel


(720, 1280, 3)

In [3]:
#convert image to proper layout for Pytorch of C, H, W
#where 2 = chanel index, 0 = height, ...

img = torch.from_numpy(img_arr)
output = img.permute(2,0,1)

In [4]:
#we want a batch of 100 RGB images of 256 pixel height and width 
#Here we init the batch and specfiy that each color be represented as an 8 bit integer
batch_size = 100 
batch = torch.zeros(100,3,256,256, dtype=torch.uint8)

In [5]:
#Import all png images and store as a tensors
data_dir = './data/cats/'
filenames = [name for name in os.listdir(data_dir) if os.path.splitext(name)[-1] =='.png']
for i , filename in enumerate(filenames):
    img_arr = imageio.imread(os.path.join(data_dir, filename))
    img_t= torch.from_numpy(img_arr)
    img_t = img_t.permute(2,0,1)
    img_t = img_t[:3]
    batch[i] = img_t
    

In [6]:
batch = batch.float()
batch /=255.0

In [7]:
#Here we find the mean nad stf of input to scale it so output has zero mean and
# std of 1 accross each channel
n_channels = batch.shape[1]
for c in range(n_channels):
    mean = torch.mean(batch[:, c])
    std = torch.std(batch[:,c])
    batch[:, c] = (batch[:, c] - mean)/std
    

In [9]:
%%HTML
<h3> Volumetric Data </h3>

In [10]:
#CT Data comes in series of images of slices of the body from head to toe
#they only have one channel because things are in greyscale
#We can stack the indicidual 2D slcies into a 3D tensor to build volumentric data
#represents the 3D anatomy of subhject 
#These tehnsors have a 5 D shape N x C x D x H x W


In [12]:
#Us volread functiono of image io to load ct scans
dir_path = "./data/p1ch4/volumetric-dicom/2-LUNG 3.0  B70f-04083"
vol_arr = imageio.volread(dir_path, 'DICOM')
vol_arr.shape

Reading DICOM (examining files): 1/99 files (1.0%99/99 files (100.0%)
  Found 1 correct series.
Reading DICOM (loading data): 56/99  (56.699/99  (100.0%)


(99, 512, 512)

In [13]:
#Use Unsqueeze to make room fo the channel dimension
vol = torch.from_numpy(vol_arr).float()
vol = torch.transpose(vol, 0, 2)
vol = torch.unsqueeze(vol, 0)
vol.shape

torch.Size([1, 512, 512, 99])

In [None]:
#Now we can create the 5D data set by stacking the volumes along batch directoin

In [14]:
%%HTML 
<h3> Tabular Data </h3>

In [None]:
#Tabular data is spreadsheet data
#We need to encode the heterogenous data into tensor floats

In [21]:
wineData = pd.read_csv(r'./data/p1ch4/tabular-wine/winequality-white.csv', sep=";")

In [22]:
wineData.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [23]:
wine_numpy = wineData.to_numpy()

In [24]:
wine_numpy

array([[ 7.  ,  0.27,  0.36, ...,  0.45,  8.8 ,  6.  ],
       [ 6.3 ,  0.3 ,  0.34, ...,  0.49,  9.5 ,  6.  ],
       [ 8.1 ,  0.28,  0.4 , ...,  0.44, 10.1 ,  6.  ],
       ...,
       [ 6.5 ,  0.24,  0.19, ...,  0.46,  9.4 ,  6.  ],
       [ 5.5 ,  0.29,  0.3 , ...,  0.38, 12.8 ,  7.  ],
       [ 6.  ,  0.21,  0.38, ...,  0.32, 11.8 ,  6.  ]])

In [26]:
wine_numpy.shape, wineData.columns

((4898, 12),
 Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
        'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
        'pH', 'sulphates', 'alcohol', 'quality'],
       dtype='object'))

In [27]:
wineTensor = torch.from_numpy(wine_numpy)
wineTensor.shape, wineTensor.dtype

(torch.Size([4898, 12]), torch.float64)

In [28]:
#We 3 types of numerical values her
#continuous, ordinal, and categorical values
#continuous values are like values of mass. Its ok to say that A is mass 10 and is twice as massive as B wwith mass 5
#Ordinal values are like continuous but their fixed relationship is not true. Small = 1 medium = 2 large = 3
#We know that large is bigger than medium but we don't know by how much
#we cannot take an average of these values or anything
#categorical have neither order nor numerical meaning

In [30]:
#Take out wine score coulmn ans save as yTr

xTr = wineTensor[:,:-1]
xTr.shape

torch.Size([4898, 11])

In [40]:
yTr = wineTensor[:,-1]
yTr.shape

torch.Size([4898])

In [33]:
yTr = yTr.long()

In [34]:
yTr

tensor([6, 6, 6,  ..., 6, 7, 6])

In [35]:
#We can decide to keep yTr as is and do regression or we can use OneHotEncoder on yTr
#if scores are purely discrete oneHotEncoder is apprpriate


In [36]:
#Trye OneHotEncoder using Scatter_ method
#Remember trailing _ methods act inplace
yTr_onehot = torch.zeros(yTr.shape[0],10)
yTr_onehot.scatter_(1, yTr.unsqueeze(1), 1.0)

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 1., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [37]:
yTr_onehot.shape

torch.Size([4898, 10])

In [38]:
yTr_unsq= yTr.unsqueeze(1)
yTr_unsq

tensor([[6],
        [6],
        [6],
        ...,
        [6],
        [7],
        [6]])

In [50]:
xTr_mean = torch.mean(xTr, 0)
xTr_mean

tensor([6.8548e+00, 2.7824e-01, 3.3419e-01, 6.3914e+00, 4.5772e-02, 3.5308e+01,
        1.3836e+02, 9.9403e-01, 3.1883e+00, 4.8985e-01, 1.0514e+01],
       dtype=torch.float64)

In [51]:
#find variance
xTr_var = torch.var(xTr, 0)
xTr_var

tensor([7.1211e-01, 1.0160e-02, 1.4646e-02, 2.5726e+01, 4.7733e-04, 2.8924e+02,
        1.8061e+03, 8.9455e-06, 2.2801e-02, 1.3025e-02, 1.5144e+00],
       dtype=torch.float64)

In [52]:
#normalze data 
xTr_normalized = (xTr - xTr_mean) / torch.sqrt(xTr_var)
xTr_normalized

tensor([[ 1.7208e-01, -8.1762e-02,  2.1326e-01,  ..., -1.2468e+00,
         -3.4915e-01, -1.3930e+00],
        [-6.5743e-01,  2.1587e-01,  4.7996e-02,  ...,  7.3995e-01,
          1.3417e-03, -8.2419e-01],
        [ 1.4756e+00,  1.7450e-02,  5.4378e-01,  ...,  4.7505e-01,
         -4.3677e-01, -3.3663e-01],
        ...,
        [-4.2043e-01, -3.7940e-01, -1.1915e+00,  ..., -1.3130e+00,
         -2.6153e-01, -9.0545e-01],
        [-1.6054e+00,  1.1666e-01, -2.8253e-01,  ...,  1.0049e+00,
         -9.6251e-01,  1.8574e+00],
        [-1.0129e+00, -6.7703e-01,  3.7852e-01,  ...,  4.7505e-01,
         -1.4882e+00,  1.0448e+00]], dtype=torch.float64)

In [55]:
#Find  rows in yTr that have a score <=3 
bad_i = yTr <=3
bad_i.shape, bad_i.dtype, bad_i.sum()
#Here all values are True or false. Only 20 values are True

(torch.Size([4898]), torch.bool, tensor(20))

In [59]:
#diplay True values
badWines = xTr[bad_i]
badWines.shape

torch.Size([20, 11])

In [61]:
#find bad mid and good wines
midWines = xTr[(yTr > 3) & (yTr < 7)]
goodWines = xTr[yTr >=7]

bad_mean = torch.mean(badWines, 0)
mid_mean = torch.mean(midWines, 0)
good_mean = torch.mean(goodWines, 0)

for i, args in enumerate(zip(wineData.columns, bad_mean, mid_mean, good_mean)):
    print('{:2} {:20} {:6.2f} {:6.2f} {:6.2f}'.format(i, *args))

 0 fixed acidity          7.60   6.89   6.73
 1 volatile acidity       0.33   0.28   0.27
 2 citric acid            0.34   0.34   0.33
 3 residual sugar         6.39   6.71   5.26
 4 chlorides              0.05   0.05   0.04
 5 free sulfur dioxide   53.33  35.42  34.55
 6 total sulfur dioxide 170.60 141.83 125.25
 7 density                0.99   0.99   0.99
 8 pH                     3.19   3.18   3.22
 9 sulphates              0.47   0.49   0.50
10 alcohol               10.35  10.26  11.42


In [63]:
#Here we find inices of wines with sulfure less than threshold
#note Torch.lt is less than. computes x < y
total_sulfur_threshold = 141.83
total_sulfur_data = xTr[:,6]
predicted_i = torch.lt(total_sulfur_data, total_sulfur_threshold)
predicted_i.shape, predicted_i.dtype, predicted_i.sum()

(torch.Size([4898]), torch.bool, tensor(2727))

In [64]:
actual_i = yTr > 5
actual_i.shape, actual_i.dtype, actual_i.sum()

(torch.Size([4898]), torch.bool, tensor(3258))

In [69]:
#find accuracy
n_matches = torch.sum(actual_i & predicted_i).item()
n_predicted = torch.sum(predicted_i).item()
n_actual = torch.sum(actual_i).item()
n_matches, n_matches/n_predicted, n_matches/ n_actual

(2018, 0.74000733406674, 0.6193984039287906)

In [70]:
%%HTML
<h3> Time Series Data </h3>

In [71]:
#Now we will take DC Bike share data and process it
#we will take two flad 2D data sets and make into 1 3D set
#We want one axis to increase at one dat per index incremenat and another acis that represents hours of the da
#third acis will be column data 

In [76]:
bikes_pandas = pd.read_csv(r"./data/p1ch4/bike-sharing-dataset/hour-fixed.csv")

In [79]:
bikes_pandas.head()

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
1,2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
2,3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
3,4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
4,5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


In [75]:
#Here we conver the date string to numbers corresponding to day of the month in column 1
bikes_numpy = np.loadtxt("./data/p1ch4/bike-sharing-dataset/hour-fixed.csv",
                         dtype=np.float32,
                         delimiter=",",
                         skiprows=1,
                         converters={1: lambda x: float(x[8:10])})
bikes = torch.from_numpy(bikes_numpy)
bikes

tensor([[1.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 3.0000e+00, 1.3000e+01,
         1.6000e+01],
        [2.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 8.0000e+00, 3.2000e+01,
         4.0000e+01],
        [3.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 5.0000e+00, 2.7000e+01,
         3.2000e+01],
        ...,
        [1.7377e+04, 3.1000e+01, 1.0000e+00,  ..., 7.0000e+00, 8.3000e+01,
         9.0000e+01],
        [1.7378e+04, 3.1000e+01, 1.0000e+00,  ..., 1.3000e+01, 4.8000e+01,
         6.1000e+01],
        [1.7379e+04, 3.1000e+01, 1.0000e+00,  ..., 1.2000e+01, 3.7000e+01,
         4.9000e+01]])

In [None]:
#Here rows are successive time points in hours

In [80]:
#17520 hiurs, 17 columns 
#we will reshape to have 3 axis: day, hour, and 17 feature columns
#stride tells us hor to how to navigate the flattened stored data
#to move one hour move 17 places in storage 
bikes.shape, bikes.stride()

(torch.Size([17520, 17]), (17, 1))

In [81]:
#the view method changes the way the tensor looks at the given data in storage 
#calling view returns a new tensor that changes the number of dimensions and striding info without changing storage 
#Allows you to rearrange data at no cost
#-1 is a place hoder for "left over indecies" after the other dimensiots are assigned
daily_bikes = bikes.view(-1, 24, bikes.shape[1])
daily_bikes.shape, daily_bikes.stride()

(torch.Size([730, 24, 17]), (408, 17, 1))

In [83]:
#We want N x C x L ordering where N is num samples C is num of sequences L is length
daily_bikes = daily_bikes.transpose(1,2)
daily_bikes.shape, daily_bikes.stride()

(torch.Size([730, 17, 24]), (408, 1, 17))

In [86]:
#encode Weather using Onehot
first_day = bikes[:24].long()
weather_onehot = torch.zeros(first_day.shape[0], 4)
first_day[:,9]

tensor([1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 2, 2, 2, 2])

In [87]:
weather_onehot.scatter_(
    dim=1,
    index=first_day[:,9].unsqueeze(1).long() -1,
    value = 1.0)

tensor([[1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [0., 1., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 0., 1., 0.],
        [0., 0., 1., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.]])

In [88]:
#concatenate to orignal data set
torch.cat((bikes[:24], weather_onehot), 1)[:1]

tensor([[ 1.0000,  1.0000,  1.0000,  0.0000,  1.0000,  0.0000,  0.0000,  6.0000,
          0.0000,  1.0000,  0.2400,  0.2879,  0.8100,  0.0000,  3.0000, 13.0000,
         16.0000,  1.0000,  0.0000,  0.0000,  0.0000]])

In [89]:
#Now create daily weather onehot
daily_weather_onehot = torch.zeros(daily_bikes.shape[0], 4, daily_bikes.shape[2])
daily_weather_onehot.shape

torch.Size([730, 4, 24])

In [90]:
daily_weather_onehot.scatter_(1, daily_bikes[:,9,:].long().unsqueeze(1) - 1, 1.0)
daily_weather_onehot.shape

torch.Size([730, 4, 24])

In [91]:
#concat along C 
daily_bikes = torch.cat((daily_bikes, daily_weather_onehot), 1)

In [92]:
daily_bikes[:, 9, :] = (daily_bikes[:, 9, :] - 1.0) / 3.0

In [93]:
# In[13]:
temp = daily_bikes[:, 10, :]
temp_min = torch.min(temp)
temp_max = torch.max(temp)
daily_bikes[:, 10, :] = (daily_bikes[:, 10, :] - temp_min) / (temp_max - temp_min)

In [94]:
# In[14]:
temp = daily_bikes[:, 10, :]
daily_bikes[:, 10, :] = (daily_bikes[:, 10, :] - torch.mean(temp)) / torch.std(temp)

In [95]:
%%HTML
<h3> Text Data </h3>

In [96]:
#Text analysis is NLP natural language processing
#we will start off with character based convert 2 tensor , then word based
#Load Pride and Prejudice 


with open('./data/p1ch4/jane-austen/1342-0.txt', encoding='utf8') as f:
    text = f.read()

In [97]:
#Now we can OneHotEncode on a character level
#we will make all characters lowecrcase and ignore punctuation and numbers
#each character will be represented as a vector

lines = text.split('\n')
line = lines[200]
line

'“Impossible, Mr. Bennet, impossible, when I am not acquainted with him'

In [98]:
#create tensor for line
letter_t = torch.zeros(len(line),128)
letter_t.shape

torch.Size([70, 128])

In [99]:
#Now we set 1 in the right position to represent a given character on each row
# In[5]:
for i, letter in enumerate(line.lower().strip()):
    letter_index = ord(letter) if ord(letter) < 128 else 0
    letter_t[i][letter_index] = 1
#This sentence has now been onehotencoded
#We can also use word or character embeddings to do thi 

In [100]:
letter_t

tensor([[1., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [107]:
#define clean_words 
def clean_words(input_str):
    '''
    Input:
        input_str: input string text to be cleaned. Text will
        be made lowercase, and punctuation will be removed.
    Output:
        word_list: list of lowercase words from input_str
    
    '''
    punctuation = '.,;:"!?_-`"'
    word_list = input_str.lower().replace('\n', ' ').split()
    word_list = [word.strip(punctuation) for word in word_list]
    return word_list

In [109]:
words_in_line= clean_words(line)
line, words_in_line

('“Impossible, Mr. Bennet, impossible, when I am not acquainted with him',
 ['“impossible',
  'mr',
  'bennet',
  'impossible',
  'when',
  'i',
  'am',
  'not',
  'acquainted',
  'with',
  'him'])

In [111]:
#next build a mapping of words to indecies in your encoding
word_list= sorted(set(clean_words(text)))
word2index_dict= {word: i for (i, word) in enumerate(word_list)}

len(word2index_dict), word2index_dict['impossible']

(8484, 3828)

In [112]:
#this is now the mapped word 2 index 
word2index_dict

{'': 0,
 '#1342]': 1,
 '$5,000)': 2,
 "'_she": 3,
 "'ah": 4,
 "'as-is'": 5,
 "'bingley": 6,
 "'had": 7,
 "'having": 8,
 "'i": 9,
 "'keep": 10,
 "'lady": 11,
 "'lately": 12,
 "'lydia": 13,
 "'mr": 14,
 "'oh": 15,
 "'s": 16,
 "'this": 17,
 "'tis": 18,
 "'violently": 19,
 "'yes,'": 20,
 "'you": 21,
 '($1': 22,
 '(801)': 23,
 '(a)': 24,
 '(an': 25,
 '(and': 26,
 '(any': 27,
 '(available': 28,
 '(b)': 29,
 '(by': 30,
 '(c)': 31,
 '(comparatively': 32,
 '(does': 33,
 '(for': 34,
 '(glancing': 35,
 '(if': 36,
 '(lady': 37,
 '(like': 38,
 '(most': 39,
 '(my': 40,
 '(or': 41,
 '(trademark/copyright)': 42,
 '(unasked': 43,
 '(what': 44,
 '(who': 45,
 '(www.gutenberg.org)': 46,
 '(“the': 47,
 '*': 48,
 '***': 49,
 '*****': 50,
 '1': 51,
 '1.a': 52,
 '1.b': 53,
 '1.c': 54,
 '1.d': 55,
 '1.e': 56,
 '1.e.1': 57,
 '1.e.2': 58,
 '1.e.3': 59,
 '1.e.4': 60,
 '1.e.5': 61,
 '1.e.6': 62,
 '1.e.7': 63,
 '1.e.8': 64,
 '1.e.9': 65,
 '1.f': 66,
 '1.f.1': 67,
 '1.f.2': 68,
 '1.f.3': 69,
 '1.f.4': 70,
 '1.f.5': 

In [113]:
word_t = torch.zeros(len(words_in_line), len(word2index_dict))
for i, word in enumerate(words_in_line):
    word_index = word2index_dict[word]
    word_t[i][word_index] = 1
    print('{:2} {:4} {}'.format(i, word_index, word))

print(word_t.shape)

 0 8324 “impossible
 1 4905 mr
 2  891 bennet
 3 3828 impossible
 4 8017 when
 5 3740 i
 6  445 am
 7 5054 not
 8  247 acquainted
 9 8094 with
10 3619 him
torch.Size([11, 8484])


In [114]:
%%HTML
<h3> Text Embeddings </h3>

In [None]:
#As text can be represented as binary digits it can also be represented 
#as floating point numbers
#Embeddings are useful when onehotencodings become too comebrsome