In [5]:
# Sample input text used by every later cell. It still contains citation
# markers such as [a], [5] and [6][7][8] plus an IPA pronunciation.
paragraph = 'Nvidia Corporation[a] (/ɛnˈvɪdiə/, en-VID-ee-ə) is an American multinational corporation and technology company headquartered in Santa Clara, California, and incorporated in Delaware.[5] It is a software and fabless company which designs and supplies graphics processing units (GPUs), application programming interfaces (APIs) for data science and high-performance computing, as well as system on a chip units (SoCs) for the mobile computing and automotive market. Nvidia is also a dominant supplier of artificial intelligence (AI) hardware and software.[6][7][8]'
paragraph

'Nvidia Corporation[a] (/ɛnˈvɪdiə/, en-VID-ee-ə) is an American multinational corporation and technology company headquartered in Santa Clara, California, and incorporated in Delaware.[5] It is a software and fabless company which designs and supplies graphics processing units (GPUs), application programming interfaces (APIs) for data science and high-performance computing, as well as system on a chip units (SoCs) for the mobile computing and automotive market. Nvidia is also a dominant supplier of artificial intelligence (AI) hardware and software.[6][7][8]'

In [6]:
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

### Tokenization --> Splits a paragraph into sentences (and sentences into words)

In [7]:
# Fetch every NLTK resource this notebook relies on, not only 'punkt':
#   - 'stopwords' backs stopwords.words('english')
#   - 'wordnet'   backs WordNetLemmatizer
# Without them those calls raise LookupError on a machine lacking the data.
# NOTE(review): some NLTK releases additionally need 'punkt_tab' and/or
# 'omw-1.4' -- confirm against the installed version.
nltk.download('stopwords')
nltk.download('wordnet')
# Kept last so the cell still displays the result of the 'punkt' download.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\TESS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [9]:
# Split the paragraph into a list of sentences.
# Note: the printed result shows the citation markers disturbing the split --
# '[5]' is attached to the start of the second sentence and '[6][7][8]'
# becomes a separate, content-free item.
sentences = nltk.sent_tokenize(paragraph)
print(sentences)

['Nvidia Corporation[a] (/ɛnˈvɪdiə/, en-VID-ee-ə) is an American multinational corporation and technology company headquartered in Santa Clara, California, and incorporated in Delaware.', '[5] It is a software and fabless company which designs and supplies graphics processing units (GPUs), application programming interfaces (APIs) for data science and high-performance computing, as well as system on a chip units (SoCs) for the mobile computing and automotive market.', 'Nvidia is also a dominant supplier of artificial intelligence (AI) hardware and software.', '[6][7][8]']


### Stemming --> Reduces words to their root form

In [10]:
# Single Porter stemmer instance, reused by the stemming cells below.
stemmer = PorterStemmer()

In [12]:
# A stem is not necessarily a dictionary word: 'history' becomes 'histori'.
stemmer.stem('history')

'histori'

### Lemmatization --> Reduces words to their meaningful base form (lemma)

In [13]:
from nltk.stem import WordNetLemmatizer

In [14]:
# Single WordNet lemmatizer instance, reused by the lemmatization cell below.
lemmatizer = WordNetLemmatizer()

In [17]:
# The lemma is a real dictionary word: 'goes' becomes 'go'.
lemmatizer.lemmatize('goes')

'go'

In [20]:
# Number of items returned by the sentence tokenizer.
len(sentences)

4

### Cleaning the corpus

In [18]:
import re

# Normalize each sentence: every character that is not an ASCII letter
# (digits, punctuation, brackets, non-ASCII letters such as the IPA
# pronunciation) is replaced by a single space, then the text is lowercased.
# Runs of spaces are therefore expected, and a sentence made only of
# citation markers becomes a whitespace-only string.
corpus = [re.sub('[^a-zA-Z]', ' ', sentence).lower() for sentence in sentences]

In [19]:
# Inspect the cleaned sentences (lowercase letters and spaces only).
print(corpus)

['nvidia corporation a     n v di    en vid ee    is an american multinational corporation and technology company headquartered in santa clara  california  and incorporated in delaware ', '    it is a software and fabless company which designs and supplies graphics processing units  gpus   application programming interfaces  apis  for data science and high performance computing  as well as system on a chip units  socs  for the mobile computing and automotive market ', 'nvidia is also a dominant supplier of artificial intelligence  ai  hardware and software ', '         ']


In [None]:
# Stemming
# Print the Porter stem of every non-stopword token in the corpus.
# The stopword set is built once up front; rebuilding it for every word
# re-reads and re-hashes the whole list on each membership test.
english_stopwords = set(stopwords.words('english'))
for sentence in corpus:
    for word in nltk.word_tokenize(sentence):
        if word not in english_stopwords:
            print(stemmer.stem(word))

nvidia
corpor
n
v
di
en
vid
ee
american
multin
corpor
technolog
compani
headquart
santa
clara
california
incorpor
delawar
softwar
fabless
compani
design
suppli
graphic
process
unit
gpu
applic
program
interfac
api
data
scienc
high
perform
comput
well
system
chip
unit
soc
mobil
comput
automot
market
nvidia
also
domin
supplier
artifici
intellig
ai
hardwar
softwar


In [49]:
#Lemmatization
# For every sentence in `corpus`, drop English stopwords, lemmatize the
# remaining tokens, print each lemma, and collect the rebuilt sentence in
# `final_data` (one string per input sentence).
# NOTE(review): the recorded output shows 'fabless' lemmatized to 'fable';
# domain terms like this may need an exception list.

# Built once up front instead of once per word.
english_stopwords = set(stopwords.words('english'))

final_data = []
for sentence in corpus:
    lemmas = []
    for word in nltk.word_tokenize(sentence):
        if word not in english_stopwords:
            lem_word = lemmatizer.lemmatize(word)
            lemmas.append(lem_word)
            print(lem_word)
    # The empty first/last items keep the established layout: one leading
    # space and one trailing space (a lone ' ' when no lemma survives).
    transform_sentence = ' '.join(['', *lemmas, ''])
    final_data.append(transform_sentence)

nvidia
corporation
n
v
di
en
vid
ee
american
multinational
corporation
technology
company
headquartered
santa
clara
california
incorporated
delaware
software
fable
company
design
supply
graphic
processing
unit
gpus
application
programming
interface
apis
data
science
high
performance
computing
well
system
chip
unit
socs
mobile
computing
automotive
market
nvidia
also
dominant
supplier
artificial
intelligence
ai
hardware
software


In [52]:
# From here on `corpus` refers to the lemmatized, stopword-free sentences
# (the same list object as `final_data`, not a copy). The cleaning cell must
# be re-run to get the un-lemmatized corpus back.
corpus = final_data

In [58]:
from sklearn.feature_extraction.text import CountVectorizer
# binary=True records presence/absence rather than counts: 'corporation'
# occurs twice in corpus[0] yet its entry in X[0] is 1.
cv = CountVectorizer(binary=True)

In [59]:
# Learn the vocabulary from `corpus` and encode it as a document-term matrix
# (one row per sentence, one column per vocabulary term).
X = cv.fit_transform(corpus)

In [60]:
# Single-letter tokens from corpus[0] ('n', 'v') are absent from this mapping;
# CountVectorizer's default token pattern only keeps tokens of 2+ characters.
cv.vocabulary_  # maps each term to its column index in X, not to a count

{'nvidia': 32,
 'corporation': 12,
 'di': 16,
 'en': 19,
 'vid': 45,
 'ee': 18,
 'american': 2,
 'multinational': 31,
 'technology': 43,
 'company': 10,
 'headquartered': 24,
 'santa': 36,
 'clara': 9,
 'california': 7,
 'incorporated': 26,
 'delaware': 14,
 'software': 39,
 'fable': 20,
 'design': 15,
 'supply': 41,
 'graphic': 22,
 'processing': 34,
 'unit': 44,
 'gpus': 21,
 'application': 4,
 'programming': 35,
 'interface': 28,
 'apis': 3,
 'data': 13,
 'science': 37,
 'high': 25,
 'performance': 33,
 'computing': 11,
 'well': 46,
 'system': 42,
 'chip': 8,
 'socs': 38,
 'mobile': 30,
 'automotive': 6,
 'market': 29,
 'also': 1,
 'dominant': 17,
 'supplier': 40,
 'artificial': 5,
 'intelligence': 27,
 'ai': 0,
 'hardware': 23}

In [61]:
# First lemmatized sentence, for comparison with its vector below.
corpus[0]

' nvidia corporation n v di en vid ee american multinational corporation technology company headquartered santa clara california incorporated delaware '

In [62]:
# Dense view of the first row: a 1 marks each vocabulary term present in
# corpus[0], at the column index given by cv.vocabulary_.
X[0].toarray()

array([[0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0,
        0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
        0, 1, 0]], dtype=int64)