## Loading text

In [332]:
from langchain.document_loaders import PyPDFLoader

loader = PyPDFLoader("../Essay.pdf")
pdf = loader.load()
pdf

[Document(page_content="Why one should write?  \n \nWhen we look into the lives of some of the most successful leaders, ancient philosophers, innovators, \nphilanthropists from ancient times to modern times, there is one quality that distinguishes them from \nthe rest: Clarity of thought!  It is this clarity of thought that helps them build great organizations, \nlead people, bring new innova tions and inspire generations.  \nSo the ultimate question revolves around how they became so clear with their thoughts? Steve Jobs \nwas very clear about what he wanted his produ cts to be and what kind of people he wanted in his \norganization. Chanakya was very clear with his vision of a united India and its leadership. Lee Kuan \nYew, a brilliant statesman, was clear with his vision of a modern and economic powerhouse island \nstate, Singapore. Modi is clear with his vision of India -2047 in 2022. Elon Musk was clear with his \nvision of reusable rockets.  \nAchieving clarity is a compounding 

In [333]:
text = pdf[0].page_content
print(text)

Why one should write?  
 
When we look into the lives of some of the most successful leaders, ancient philosophers, innovators, 
philanthropists from ancient times to modern times, there is one quality that distinguishes them from 
the rest: Clarity of thought!  It is this clarity of thought that helps them build great organizations, 
lead people, bring new innova tions and inspire generations.  
So the ultimate question revolves around how they became so clear with their thoughts? Steve Jobs 
was very clear about what he wanted his produ cts to be and what kind of people he wanted in his 
organization. Chanakya was very clear with his vision of a united India and its leadership. Lee Kuan 
Yew, a brilliant statesman, was clear with his vision of a modern and economic powerhouse island 
state, Singapore. Modi is clear with his vision of India -2047 in 2022. Elon Musk was clear with his 
vision of reusable rockets.  
Achieving clarity is a compounding process that comes from experience a

# Text preprocessing

### Steps:
#### Data Cleaning:
- Converting uppercases to lowercase
- Removing punctuation marks
- Tokenization
- Removing stopwords
- Stemming
- Lemmatization
- **Text embeddings**

In [334]:
# Converting uppercases to lowercase

text = text.lower()
print(text[:100], "\n", text[-100:])

why one should write?  
 
when we look into the lives of some of the most successful leaders, ancien 
 .com/pritika_mehta/status/1790799256783393204?t=qosur91exny43vlf0qzodg& s=08  
 
@silver_cule on x  


In [335]:
# Removing punctuation marks

import re

text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)  # Remove URLs
text = re.sub(r'\n', ' ', text)  # Remove newline characters
text = re.sub(r'[^\w\s]', '', text)

text

'why one should write     when we look into the lives of some of the most successful leaders ancient philosophers innovators  philanthropists from ancient times to modern times there is one quality that distinguishes them from  the rest clarity of thought  it is this clarity of thought that helps them build great organizations  lead people bring new innova tions and inspire generations   so the ultimate question revolves around how they became so clear with their thoughts steve jobs  was very clear about what he wanted his produ cts to be and what kind of people he wanted in his  organization chanakya was very clear with his vision of a united india and its leadership lee kuan  yew a brilliant statesman was clear with his vision of a modern and economic powerhouse island  state singapore modi is clear with his vision of india 2047 in 2022 elon musk was clear with his  vision of reusable rockets   achieving clarity is a compounding process that comes from experience and learning all of 

In [336]:
# Tokenization - breaking sentences into smaller units
import nltk
# nltk.download('punkt')

from nltk.tokenize import word_tokenize

text = word_tokenize(text)
text, len(text)

(['why',
  'one',
  'should',
  'write',
  'when',
  'we',
  'look',
  'into',
  'the',
  'lives',
  'of',
  'some',
  'of',
  'the',
  'most',
  'successful',
  'leaders',
  'ancient',
  'philosophers',
  'innovators',
  'philanthropists',
  'from',
  'ancient',
  'times',
  'to',
  'modern',
  'times',
  'there',
  'is',
  'one',
  'quality',
  'that',
  'distinguishes',
  'them',
  'from',
  'the',
  'rest',
  'clarity',
  'of',
  'thought',
  'it',
  'is',
  'this',
  'clarity',
  'of',
  'thought',
  'that',
  'helps',
  'them',
  'build',
  'great',
  'organizations',
  'lead',
  'people',
  'bring',
  'new',
  'innova',
  'tions',
  'and',
  'inspire',
  'generations',
  'so',
  'the',
  'ultimate',
  'question',
  'revolves',
  'around',
  'how',
  'they',
  'became',
  'so',
  'clear',
  'with',
  'their',
  'thoughts',
  'steve',
  'jobs',
  'was',
  'very',
  'clear',
  'about',
  'what',
  'he',
  'wanted',
  'his',
  'produ',
  'cts',
  'to',
  'be',
  'and',
  'what',
  '

In [337]:
# nltk.download('stopwords')

from nltk.corpus import stopwords
stopwrds = set(stopwords.words('english'))
text = [word for word in text if word not in stopwrds]

text, len(text)

(['one',
  'write',
  'look',
  'lives',
  'successful',
  'leaders',
  'ancient',
  'philosophers',
  'innovators',
  'philanthropists',
  'ancient',
  'times',
  'modern',
  'times',
  'one',
  'quality',
  'distinguishes',
  'rest',
  'clarity',
  'thought',
  'clarity',
  'thought',
  'helps',
  'build',
  'great',
  'organizations',
  'lead',
  'people',
  'bring',
  'new',
  'innova',
  'tions',
  'inspire',
  'generations',
  'ultimate',
  'question',
  'revolves',
  'around',
  'became',
  'clear',
  'thoughts',
  'steve',
  'jobs',
  'clear',
  'wanted',
  'produ',
  'cts',
  'kind',
  'people',
  'wanted',
  'organization',
  'chanakya',
  'clear',
  'vision',
  'united',
  'india',
  'leadership',
  'lee',
  'kuan',
  'yew',
  'brilliant',
  'statesman',
  'clear',
  'vision',
  'modern',
  'economic',
  'powerhouse',
  'island',
  'state',
  'singapore',
  'modi',
  'clear',
  'vision',
  'india',
  '2047',
  '2022',
  'elon',
  'musk',
  'clear',
  'vision',
  'reusable',


In [338]:
# Stemming - converting words to their root form by removing suffix

from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
text = [stemmer.stem(word) for word in text]
text, len(text)

(['one',
  'write',
  'look',
  'live',
  'success',
  'leader',
  'ancient',
  'philosoph',
  'innov',
  'philanthropist',
  'ancient',
  'time',
  'modern',
  'time',
  'one',
  'qualiti',
  'distinguish',
  'rest',
  'clariti',
  'thought',
  'clariti',
  'thought',
  'help',
  'build',
  'great',
  'organ',
  'lead',
  'peopl',
  'bring',
  'new',
  'innova',
  'tion',
  'inspir',
  'gener',
  'ultim',
  'question',
  'revolv',
  'around',
  'becam',
  'clear',
  'thought',
  'steve',
  'job',
  'clear',
  'want',
  'produ',
  'ct',
  'kind',
  'peopl',
  'want',
  'organ',
  'chanakya',
  'clear',
  'vision',
  'unit',
  'india',
  'leadership',
  'lee',
  'kuan',
  'yew',
  'brilliant',
  'statesman',
  'clear',
  'vision',
  'modern',
  'econom',
  'powerhous',
  'island',
  'state',
  'singapor',
  'modi',
  'clear',
  'vision',
  'india',
  '2047',
  '2022',
  'elon',
  'musk',
  'clear',
  'vision',
  'reusabl',
  'rocket',
  'achiev',
  'clariti',
  'compound',
  'process',


In [339]:
# Lemmatization - reducing words to their base form (removes grammar too)

# nltk.download('wordnet')

from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
text = [lemmatizer.lemmatize(word) for word in text]
text, len(text)

(['one',
  'write',
  'look',
  'live',
  'success',
  'leader',
  'ancient',
  'philosoph',
  'innov',
  'philanthropist',
  'ancient',
  'time',
  'modern',
  'time',
  'one',
  'qualiti',
  'distinguish',
  'rest',
  'clariti',
  'thought',
  'clariti',
  'thought',
  'help',
  'build',
  'great',
  'organ',
  'lead',
  'peopl',
  'bring',
  'new',
  'innova',
  'tion',
  'inspir',
  'gener',
  'ultim',
  'question',
  'revolv',
  'around',
  'becam',
  'clear',
  'thought',
  'steve',
  'job',
  'clear',
  'want',
  'produ',
  'ct',
  'kind',
  'peopl',
  'want',
  'organ',
  'chanakya',
  'clear',
  'vision',
  'unit',
  'india',
  'leadership',
  'lee',
  'kuan',
  'yew',
  'brilliant',
  'statesman',
  'clear',
  'vision',
  'modern',
  'econom',
  'powerhous',
  'island',
  'state',
  'singapor',
  'modi',
  'clear',
  'vision',
  'india',
  '2047',
  '2022',
  'elon',
  'musk',
  'clear',
  'vision',
  'reusabl',
  'rocket',
  'achiev',
  'clariti',
  'compound',
  'process',


## Embeddings

In [340]:
# Bag-of-words
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
vec = vectorizer.fit_transform(text)
vec

<303x191 sparse matrix of type '<class 'numpy.int64'>'
	with 296 stored elements in Compressed Sparse Row format>

In [341]:
vec.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [342]:
vec.toarray().shape, len(vec.toarray())

((303, 191), 303)

In [343]:
# TF-IDF

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
vec = tfidf.fit_transform([' '.join(text)])
vec, vec.toarray()

(<1x191 sparse matrix of type '<class 'numpy.float64'>'
 	with 191 stored elements in Compressed Sparse Row format>,
 array([[0.03438071, 0.03438071, 0.03438071, 0.03438071, 0.03438071,
         0.03438071, 0.03438071, 0.03438071, 0.03438071, 0.03438071,
         0.06876142, 0.03438071, 0.03438071, 0.03438071, 0.03438071,
         0.03438071, 0.03438071, 0.03438071, 0.03438071, 0.03438071,
         0.03438071, 0.03438071, 0.03438071, 0.03438071, 0.03438071,
         0.06876142, 0.17190354, 0.24066496, 0.06876142, 0.03438071,
         0.03438071, 0.03438071, 0.03438071, 0.03438071, 0.03438071,
         0.03438071, 0.06876142, 0.06876142, 0.03438071, 0.03438071,
         0.03438071, 0.03438071, 0.03438071, 0.03438071, 0.06876142,
         0.06876142, 0.03438071, 0.03438071, 0.03438071, 0.03438071,
         0.06876142, 0.03438071, 0.13752283, 0.03438071, 0.06876142,
         0.06876142, 0.03438071, 0.03438071, 0.03438071, 0.03438071,
         0.03438071, 0.13752283, 0.10314212, 0.03438071

In [344]:
vec.shape

(1, 191)

TF-IDF output vector shape is 191 because 191 is the unique words in the dictionary

In [345]:
len(set(text))

197

### Word2Vec

In [346]:
from gensim.models import Word2Vec

word2vec_model = Word2Vec(sentences=[text], vector_size=256, window=5, min_count=1, workers=4)
word2vec_model.wv["time"], word2vec_model.wv["time"].shape

(array([ 2.0998209e-03, -3.1989540e-03,  2.2462173e-03, -1.3577599e-03,
         6.4680651e-05, -2.9108978e-03, -2.4161167e-03, -1.2798700e-03,
         3.3333083e-03,  2.1597974e-03, -3.7919593e-03, -2.5669027e-03,
         1.3369778e-03, -3.3658948e-03,  2.6003518e-03,  2.4308192e-03,
        -3.5906693e-03,  3.3997025e-03,  3.0508628e-03, -6.5531267e-04,
         6.6926930e-04, -2.2380208e-03,  2.4070891e-03,  2.9569750e-03,
         1.5279471e-03, -3.4499706e-03,  1.9530968e-03, -1.1883777e-03,
        -8.9018833e-04,  2.3857395e-03,  7.1540527e-04,  1.8206530e-03,
        -2.4912816e-03,  2.0601500e-03, -1.4123489e-03, -8.7181579e-05,
         2.4644413e-03,  1.5707405e-03,  2.9188904e-03, -1.5493409e-03,
         1.3417165e-03,  2.9000121e-03, -1.2841771e-03,  2.2443491e-03,
        -3.6424717e-03,  3.4360469e-03,  1.4801110e-03,  2.0498100e-03,
         1.8382872e-03,  1.8225689e-03, -1.2432886e-03, -2.4717154e-03,
        -9.0665178e-04,  5.7024776e-04, -1.1877742e-03, -3.35555

### Custom Neural Networks

**Tensorflow**

- Tokenize and pad the text data: Convert the text into a sequence of integers and ensure all sequences are of the same length.
- Create an Embedding Layer: This layer will learn the word embeddings during the training process.
- Build and Train a Neural Network: A simple neural network is built to demonstrate how embeddings can be used in a model.

In [347]:
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Flatten, Dense, Embedding, Input
from keras.preprocessing.sequence import pad_sequences

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

In [348]:
loader = PyPDFLoader("../Essay.pdf")
pdf = loader.load()
texts = pdf[0].page_content

tokenizer = Tokenizer()

tokenizer.fit_on_texts([texts])
sequences = tokenizer.sequences_to_texts([texts])
word_index = tokenizer.word_index
data = pad_sequences(sequences, maxlen=100)

data, word_index, sequences

(array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]),
 {'and': 1,
  'of': 2,
  'you': 3,
  'the': 4,
  'to': 5,
  'it': 6,
  'a': 7,
  'with': 8,
  'clear': 9,
  'your': 10,
  'will': 11,
  'thought': 12,
  'thoughts': 13,
  'his': 14,
  'vision': 15,
  'have': 16,
  'is': 17,
  'that': 18,
  'clarity': 19,
  'this': 20,
  'their': 21,
  'all': 22,
  'on': 23,
  'if': 24,
  'more': 25,
  'own': 26,
  'them': 27,
  'great': 28,
  'was': 29,
  'still': 30,
  'writes': 31,
  'writing': 32,
  'read': 33,
  'world': 34,
  'thinking': 35,
  'one': 36,
  'write': 37,
  'when': 38,
  'we': 39,
  'successful': 40,
  'from': 41,
  'so': 42,
  'be': 43,
  'stanford': 44,
  'rational': 45,
  'reference': 46,


In [349]:
model = Sequential()
model.add(Input(shape=(100,)))
model.add(Embedding(input_dim=len(word_index) + 1, output_dim=50))
model.add(Flatten())
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.summary()

In [350]:
embeddings = model.layers[0].get_weights()
embeddings

[array([[-0.02512523, -0.01105338,  0.02746291, ...,  0.01068907,
          0.01513441,  0.04634647],
        [ 0.04623956, -0.03883426, -0.04147968, ...,  0.02223964,
          0.04551185, -0.02689468],
        [ 0.03663354,  0.03946867, -0.02766166, ..., -0.00412615,
         -0.01824318, -0.01580949],
        ...,
        [-0.02833327,  0.04230395,  0.00673345, ...,  0.0171258 ,
          0.00715173, -0.04241968],
        [ 0.03718172, -0.04353098,  0.03903783, ...,  0.03379052,
         -0.03228267,  0.00209916],
        [-0.02020103,  0.00921692,  0.02337616, ..., -0.04237109,
          0.02445373, -0.00690494]], dtype=float32)]

In [351]:
embeddings[0].shape, np.array(embeddings).shape, embeddings[0][0]

((296, 50),
 (1, 296, 50),
 array([-0.02512523, -0.01105338,  0.02746291,  0.0160997 , -0.00400461,
         0.01870452, -0.01462947, -0.03163086,  0.04223472,  0.00501963,
        -0.0069471 ,  0.00254001, -0.00270056,  0.02006621, -0.01246317,
         0.00831616,  0.03870913,  0.0433082 ,  0.02488854, -0.00732232,
         0.00588877, -0.00497587,  0.02897041,  0.02597919, -0.00982196,
        -0.02861018,  0.02116032,  0.00772822,  0.03275988, -0.00300257,
        -0.03427359,  0.01998648,  0.04185734,  0.04500622, -0.02020888,
        -0.00604839, -0.03648573, -0.03633721,  0.02859508, -0.00323703,
         0.00022928, -0.04902254,  0.00938939, -0.02703464, -0.03712098,
        -0.00291177,  0.00627325,  0.01068907,  0.01513441,  0.04634647],
       dtype=float32))

### Embedding with PyTorch

In [352]:
import torch
from torch import nn

class CustomEmbedding(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(embedding_dim*max_len, 32)
        self.fc2 = nn.Linear(32, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.embedding(x)
        x = self.flatten(x)
        x = torch.relu(self.fc1(x))
        x = self.sigmoid(self.fc2(x))
        return x
    
vocab_size = 1000
embedding_dim = 50

model = CustomEmbedding(vocab_size, embedding_dim)
model

CustomEmbedding(
  (embedding): Embedding(1000, 50)
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (fc1): Linear(in_features=5000, out_features=32, bias=True)
  (fc2): Linear(in_features=32, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [353]:
vocab = {word: idx for idx, word in enumerate(set(text), 1)}
vocab['<PAD>'] = 0
vocab

{'build': 1,
 'job': 2,
 'leadership': 3,
 'becam': 4,
 'compani': 5,
 'memori': 6,
 'smarter': 7,
 'destin': 8,
 'editor': 9,
 'listen': 10,
 'view': 11,
 'owneven': 12,
 'understa': 13,
 'perspect': 14,
 'yew': 15,
 'x': 16,
 'achiev': 17,
 'contribut': 18,
 'leader': 19,
 'explor': 20,
 'deepli': 21,
 'act': 22,
 'thiel': 23,
 'tech': 24,
 'action': 25,
 'read': 26,
 'polit': 27,
 'innova': 28,
 'ultim': 29,
 '2': 30,
 'ancient': 31,
 'particular': 32,
 'imposs': 33,
 'almost': 34,
 'unit': 35,
 'modern': 36,
 'forc': 37,
 'concentr': 38,
 'anyth': 39,
 'founder': 40,
 'someth': 41,
 'individu': 42,
 'ration': 43,
 'world': 44,
 'one': 45,
 '3': 46,
 'modi': 47,
 'ocean': 48,
 'vision': 49,
 'critic': 50,
 'experi': 51,
 'technolog': 52,
 'year': 53,
 'econom': 54,
 'innov': 55,
 'critica': 56,
 'knowledg': 57,
 'produ': 58,
 'publish': 59,
 'think': 60,
 'compound': 61,
 'distinguish': 62,
 'magazin': 63,
 'gener': 64,
 'inspir': 65,
 'accumul': 66,
 'implement': 67,
 'qualiti': 68

In [354]:
sequences = [vocab[word] for word in text]
sequences

[45,
 142,
 169,
 134,
 177,
 19,
 31,
 117,
 55,
 170,
 31,
 74,
 36,
 74,
 45,
 68,
 62,
 84,
 95,
 155,
 95,
 155,
 83,
 1,
 87,
 159,
 196,
 123,
 149,
 158,
 28,
 184,
 65,
 64,
 29,
 138,
 128,
 152,
 4,
 85,
 155,
 115,
 2,
 85,
 69,
 58,
 197,
 151,
 123,
 69,
 159,
 173,
 85,
 49,
 35,
 191,
 3,
 141,
 70,
 15,
 183,
 122,
 85,
 49,
 36,
 54,
 111,
 133,
 192,
 129,
 47,
 85,
 49,
 191,
 161,
 86,
 120,
 97,
 85,
 49,
 109,
 98,
 17,
 95,
 61,
 101,
 185,
 51,
 108,
 42,
 75,
 100,
 71,
 79,
 20,
 66,
 155,
 53,
 175,
 49,
 25,
 162,
 150,
 8,
 90,
 23,
 103,
 93,
 89,
 93,
 59,
 43,
 155,
 99,
 49,
 52,
 121,
 126,
 34,
 9,
 63,
 87,
 40,
 91,
 171,
 177,
 5,
 74,
 186,
 182,
 177,
 112,
 42,
 45,
 63,
 139,
 9,
 143,
 24,
 78,
 178,
 54,
 189,
 124,
 186,
 50,
 168,
 179,
 92,
 130,
 160,
 131,
 142,
 76,
 181,
 131,
 142,
 174,
 113,
 131,
 142,
 175,
 126,
 131,
 142,
 142,
 147,
 107,
 155,
 144,
 164,
 110,
 93,
 89,
 142,
 26,
 26,
 104,
 57,
 104,
 57,
 135,
 44,
 13,


In [355]:
max_len = 100
padded_sequences = pad_sequences([sequences], maxlen=max_len, padding='post', value=vocab['<PAD>'])
padded_sequences

array([[ 83,  38, 142, 165,  50,  60,  69,  18, 136,  44, 148, 194,  87,
        168, 118,  10,  87,  72,  33,  77,  95, 155,  56, 154,  60, 125,
        142, 142,  32, 176, 137,  21, 167, 155, 143, 176,  96, 180, 102,
         80,  14, 157, 185,  11,  43,  88, 176,  27, 117,  52,  73,  39,
         41,  12, 106, 155, 116,  37,  60,  26,  20, 125, 155, 196,  43,
        146,  60,  32, 155, 101, 172, 187, 163,  81, 195, 156, 153,  56,
        154,  60,  85, 155, 142,  83, 190,  44, 127, 140, 186,   7, 119,
         48,  57, 114,   6, 144,  46, 132, 145,  16]])

In [356]:
input_data = torch.tensor(padded_sequences)

output = model(input_data)
output

tensor([[0.4941]], grad_fn=<SigmoidBackward0>)

In [357]:
embeddings = model.embedding.weight.data
embeddings

tensor([[-0.9829, -0.9709, -2.1096,  ..., -1.2366,  0.3206, -1.6520],
        [-0.1264,  2.1732,  0.3023,  ..., -0.8546,  0.4334,  0.4450],
        [-1.6231, -0.9528, -2.6533,  ...,  1.9212,  0.0248, -0.5950],
        ...,
        [-0.5661,  0.1613, -1.0765,  ...,  0.8664,  0.5197,  0.2507],
        [-0.3159,  0.1289, -2.0218,  ..., -0.1215, -1.3478, -0.1950],
        [ 0.5462, -1.4281,  0.2241,  ..., -1.2926, -0.5213,  0.2129]])

In [359]:
embeddings[0], embeddings[0].shape

(tensor([-9.8295e-01, -9.7092e-01, -2.1096e+00, -9.7167e-01,  4.9749e-02,
         -1.2873e+00, -4.3187e-01,  8.4556e-02, -7.0579e-01,  1.3990e+00,
          7.9793e-01,  1.2456e+00,  8.9982e-01,  3.6533e-01, -7.1166e-01,
          2.7252e+00,  1.3123e+00,  6.8170e-01,  6.0724e-01,  4.2044e-02,
         -5.2090e-01, -4.2473e-01, -4.1889e-01,  1.2420e+00,  2.6352e+00,
         -1.0248e+00, -6.0962e-02, -1.6443e+00,  1.5773e+00,  4.1810e-01,
          1.0654e+00, -1.2692e-01, -3.9233e-01,  9.1162e-01, -5.2665e-02,
         -1.0496e+00, -7.9073e-04, -4.7253e-02,  3.2728e-01,  6.6188e-01,
         -1.5781e+00, -5.4398e-01,  9.0601e-01, -1.8019e-01, -4.0885e-01,
          4.1832e-01,  1.6633e+00, -1.2366e+00,  3.2058e-01, -1.6520e+00]),
 torch.Size([50]))

These weights from Tensorflow and PyTorch are the random initialized weights. We get the actual trained weights once the model is trained.

### Using transformers

In [360]:
from transformers import DistilBertTokenizer, DistilBertModel
import torch

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

inputs = tokenizer(texts, return_tensors='pt', max_length=512, padding='max_length', truncation=True)
input_ids = inputs['input_ids']
attention_mask = inputs['attention_mask']

inputs

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

{'input_ids': tensor([[  101,  2339,  2028,  2323,  4339,  1029,  2043,  2057,  2298,  2046,
          1996,  3268,  1997,  2070,  1997,  1996,  2087,  3144,  4177,  1010,
          3418, 17586,  1010,  7601,  7103,  6591,  1010, 15246,  2015,  2013,
          3418,  2335,  2000,  2715,  2335,  1010,  2045,  2003,  2028,  3737,
          2008, 27343,  2068,  2013,  1996,  2717,  1024, 15563,  1997,  2245,
           999,  2009,  2003,  2023, 15563,  1997,  2245,  2008,  7126,  2068,
          3857,  2307,  4411,  1010,  2599,  2111,  1010,  3288,  2047,  7601,
          7103, 14841,  5644,  1998, 18708,  8213,  1012,  2061,  1996,  7209,
          3160, 19223,  2105,  2129,  2027,  2150,  2061,  3154,  2007,  2037,
          4301,  1029,  3889,  5841,  2001,  2200,  3154,  2055,  2054,  2002,
          2359,  2010,  4013,  8566, 14931,  2015,  2000,  2022,  1998,  2054,
          2785,  1997,  2111,  2002,  2359,  1999,  2010,  3029,  1012,  9212,
          4817,  3148,  2001,  2200,  

In [363]:
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)
    embeddings = outputs.last_hidden_state

# Print embeddings for the first token
print(embeddings[0][0])


tensor([-1.3446e-01, -6.4056e-03, -3.0854e-01, -2.2085e-02, -1.0983e-01,
        -1.7662e-01,  2.7308e-02,  6.9829e-01, -1.0097e-01, -1.9704e-01,
         6.1672e-02, -1.3034e-01, -1.2726e-01,  2.9505e-01,  6.8639e-02,
         2.2685e-01, -1.3505e-01,  2.9142e-01, -1.9803e-01,  1.2749e-01,
        -1.4753e-01, -2.5106e-01,  3.4505e-01,  1.8927e-01,  3.3258e-01,
        -2.8398e-01, -9.8964e-02, -2.9080e-01, -3.2336e-06,  1.8176e-01,
        -1.0859e-02,  3.5984e-01, -1.0792e-01, -1.6037e-01,  2.8406e-02,
        -3.8096e-01,  6.0867e-02, -7.6361e-02,  3.7197e-02,  4.0410e-01,
        -8.8508e-02,  1.5684e-01,  9.2603e-02,  6.6359e-02, -1.7370e-01,
        -1.5983e-01, -3.5477e+00,  2.0832e-01, -1.1465e-01, -1.0011e-01,
         1.2687e-01, -9.3858e-02, -2.3133e-01,  1.9782e-01,  3.8516e-01,
         5.9919e-01, -2.9942e-01,  2.3713e-01, -2.4138e-01, -1.5494e-01,
         4.0187e-01,  2.5696e-02, -2.5842e-01, -1.7345e-01,  1.6330e-01,
        -5.6136e-02, -1.0160e-01,  2.3750e-01, -5.8

In [366]:
embeddings.shape, embeddings[0].shape, embeddings[0][0].shape

(torch.Size([1, 512, 768]), torch.Size([512, 768]), torch.Size([768]))