In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file below the Kaggle input directory (walk order is
# preserved) and print one full path per line.
kaggle_input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in kaggle_input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**One-hot representation**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import seaborn as sns

In [None]:
# Corpus: three passages from the César Vallejo poem
# "Black Stone On Top Of A White Stone"; each string is treated as one document.

corpus = [
"I shall die in Paris, in a rainstorm, On a day I already remember. I shall die in Paris- it does not bother me- Doubtless on a Thursday, like today, in autumn.",
"It shall be a Thursday, because today, Thursday As I put down these lines, I have set my shoulders To the evil. Never like today have I turned,And headed my whole journey to the ways where I am alone.",
"César Vallejo is dead. They struck him,All of them, though he did nothing to them,They hit him hard with a stick and hard also With the end of a rope. Witnesses are: the Thursdays, The shoulder bones, the loneliness, the rain, and the roads..."]

In [None]:
# Show the raw documents (a bare last expression is rendered as the cell output).
corpus

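**Term-Frequency**

*Note:* the cell below builds `CountVectorizer(binary = True)`, so every non-zero count is clipped to 1 and the heatmap shows the binary (collapsed one-hot) representation introduced above. Passing `binary = False` (the default) would show the raw term-frequency counts instead.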

In [None]:
# binary=True makes CountVectorizer record presence/absence (0 or 1) of each
# vocabulary token instead of how many times it occurs.
one_hot_vectorizer = CountVectorizer(binary=True)

# fit_transform returns a sparse document-term matrix; densify it for plotting.
document_term_matrix = one_hot_vectorizer.fit_transform(corpus)
one_hot = document_term_matrix.toarray()

# One heatmap column per vocabulary token, one row per document.
xticklabels = one_hot_vectorizer.get_feature_names_out()
sentence_labels = ['Sentence 1', 'Sentence 2', 'Sentence 3']

sns.heatmap(
    one_hot,
    annot=True,
    cbar=False,
    xticklabels=xticklabels,
    yticklabels=sentence_labels,
)

**Term-Frequency-Inverse-Document-Frequency (TF-IDF)** 

The IDF representation penalizes common tokens and rewards rare tokens in the vector representation.

The IDF of a token $w$ is defined with respect to a corpus as:

$$\mathrm{IDF}(w) = \log\frac{N}{n_w}$$

where

- $N$: total number of documents
- $n_w$: number of documents containing the word $w$

The TF-IDF score is the product $\mathrm{TF}(w) \cdot \mathrm{IDF}(w)$.


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TF-IDF model on the corpus and densify the sparse result for plotting.
# Note: with scikit-learn's defaults (smooth_idf=True, norm='l2') the weights
# use idf(w) = ln((1 + N) / (1 + n_w)) + 1 and every row is L2-normalised, so
# the plotted numbers differ from the plain log(N / n_w) formula.
tfidf_vectorizer = TfidfVectorizer()
tfidf = tfidf_vectorizer.fit_transform(corpus).toarray()

# Take the column labels from the vectorizer that actually produced `tfidf`.
# Reading them from another vectorizer only works while both happen to learn
# the same vocabulary, and it makes this cell fail on a fresh kernel unless
# that other cell was executed first.
xticklabels = tfidf_vectorizer.get_feature_names_out()

# One heatmap column per vocabulary token, one row per document.
sns.heatmap(tfidf, annot = True, 
            cbar = False, xticklabels = xticklabels,
           yticklabels = ['Sentence 1','Sentence 2', 'Sentence 3'])

**Note**: In deep learning, it is rare to see inputs encoded using heuristic representations like TF-IDF because the goal is to learn a representation. Often, we start with a one-hot encoding using integer indices and a special "embedding lookup" layer to construct inputs to the neural network.

# **Pytorch Basics**

In [None]:
import torch
# Report the installed PyTorch build together with the CUDA / cuDNN versions
# it exposes: each heading is printed on its own line, followed by its value.
version_report = (
    ("Pytorch version：", torch.__version__),
    ("CUDA Version: ", torch.version.cuda),
    ("cuDNN version is :", torch.backends.cudnn.version()),
)
for heading, value in version_report:
    print(heading)
    print(value)

In [None]:
def describe(x):
    """Print a three-part summary of a tensor.

    Writes the tensor's type string (``x.type()``), its shape (``x.shape``)
    and its values to standard output, one labelled section each.

    Args:
        x: a tensor-like object exposing ``type()`` and ``shape``.

    Returns:
        None. The summary is only printed.
    """
    tensor_type = x.type()
    print(f"Type: {tensor_type}")
    print(f"Shape/size: {x.shape}")
    print(f"Values: \n{x}")

### Creating a tensor in PyTorch

In [None]:
# torch.Tensor(rows, cols) allocates a 2x3 tensor WITHOUT initialising it:
# the printed values are whatever was in memory and can change between runs.
describe(torch.Tensor(2,3))

In [None]:
# Randomly initialised tensors; the values change on every run unless a seed
# is fixed first (e.g. with torch.manual_seed).
describe(torch.rand(2,3)) # uniform samples from the interval [0, 1)
describe(torch.randn(2,3)) # samples from the standard normal distribution (mean 0, variance 1)

In [None]:
# Tensors filled with a constant value.
describe(torch.zeros(2,3))

x = torch.ones(2,3)
describe(x)
x.fill_(5) # trailing underscore = in-place operation: overwrites every element of x with 5
describe(x)


In [None]:
# Build a tensor from a nested Python list (one inner list per row);
# the torch.Tensor constructor stores the integers as 32-bit floats.
x = torch.Tensor([[1,2,3],[4,5,6]])
describe(x)

In [None]:
# From a NumPy array to a PyTorch tensor.
# np.random.rand returns float64 values and torch.from_numpy keeps that dtype,
# so the result is a DoubleTensor rather than a FloatTensor; the tensor also
# shares memory with the array instead of copying it.
import numpy as np

npy = np.random.rand(2,3)
describe(torch.from_numpy(npy))

### Tensor Types and Size

The default tensor type when using the torch.Tensor constructor is torch.FloatTensor

In [None]:
# Explicitly request 32-bit floats with the typed FloatTensor constructor.
x = torch.FloatTensor([[1,2,3],[4,5,6]])

describe(x)

In [None]:
x = x.long() # cast to 64-bit integers (LongTensor) and rebind x to the result
describe(x)

In [None]:
# torch.tensor accepts an explicit dtype; here the values are stored as 64-bit integers.
x = torch.tensor([[1,2,3],[4,5,6]], dtype = torch.int64)

describe(x)


In [None]:
# Cast x to 32-bit floats (FloatTensor) and rebind x to the result.
x = x.float()

describe(x)

### Tensor operations

In [None]:
# A 2x3 tensor of standard-normal samples, used as the operand for the
# arithmetic examples.
x = torch.randn(2,3)

describe(x)

In [None]:
# Element-wise addition, method form; returns a new tensor and leaves x unchanged.
describe(x.add(x))

In [None]:
# Element-wise addition, operator form.
describe(x+x)

In [None]:
# Dimension-based tensor operations
# torch.arange(6) yields the 1-D integer tensor [0, 1, 2, 3, 4, 5].

x = torch.arange(6)

describe(x)

In [None]:
# Reinterpret the elements as 2 rows x 3 columns; view returns a tensor that
# shares its data with the original instead of copying it.
x = x.view(2,3)

describe(x)

In [None]:
# Sum over dim 0 (down the rows): one total per column.
describe(torch.sum(x, dim = 0))

In [None]:
# Sum over dim 1 (across the columns): one total per row.
describe(torch.sum(x, dim = 1))

In [None]:
# Swap dims 0 and 1, so rows become columns.
describe(torch.transpose(x,0,1))

### Indexing, Slicing and Joining 

In [None]:
# 2x3 integer tensor holding 0..5 ([[0, 1, 2], [3, 4, 5]]).
x = torch.arange(6).view(2,3)
describe(x)

In [None]:
# Slicing: rows before index 1 and columns before index 2; the result keeps
# both dimensions (shape 1x2 for a 2x3 input).
describe(x[:1,:2]) 

In [None]:
# Integer indices on both dims select a single element (row 0, column 1),
# returned as a 0-dimensional tensor.
describe(x[0,1])

In [None]:
# Complex indexing
indices = torch.LongTensor([0,2]) # index tensor of 64-bit integers
describe(indices)

describe(torch.index_select(x, dim = 1, index = indices)) # keep only columns 0 and 2

In [None]:
indices = torch.LongTensor([0, 0]) # an index may be repeated
describe(torch.index_select(x, dim = 0, index = indices)) # selects row 0 twice

In [None]:
# Paired index tensors pick individual elements: the i-th row index is
# combined with the i-th column index.
row_indices = torch.arange(2).long() #0,1
col_indices = torch.LongTensor([0,1])  

describe(x[row_indices, col_indices]) # elements (0,0) and (1,1)

In [None]:
# Concatenating tensors
# x is (re)built here as the 2x3 tensor 0..5 so this cell does not rely on
# whatever x held before.
x = torch.arange(6).view(2,3)
describe(x)

In [None]:
# Concatenate along dim 0: the rows of the second tensor are appended below
# the first (2x3 and 2x3 -> 4x3).
describe(torch.cat([x,x], dim = 0)) 

In [None]:
# Concatenate along dim 1: the columns of the second tensor are appended to
# the right of the first (2x3 and 2x3 -> 2x6).
describe(torch.cat([x,x], dim = 1))

In [None]:
# stack joins the tensors along a NEW leading dimension instead of an
# existing one (two 2x3 tensors -> shape 2x2x3).
describe(torch.stack([x,x]))

In [None]:
# Linear algebra
# torch.arange produces integers; cast to float so x1 has the same dtype as
# the float matrix it is multiplied with.
x1 = torch.arange(6).view(2,3).float()
describe(x1)

In [None]:
x2 = torch.ones(3,2)
x2[:,1] +=1 # in-place: add 1 to every entry of the second column, so each row becomes [1, 2]
describe(x2)

In [None]:
# Matrix product: (2x3) times (3x2) gives a 2x2 result.
describe(torch.mm(x1,x2))