In [29]:
import numpy as np
import nltk
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer


In [30]:
text=['''Data science is an interdisciplinary academic field[1] that uses statistics, scientific computing, scientific methods, processing, scientific visualization, algorithms and systems to extract or extrapolate knowledge from potentially noisy, structured, or unstructured data.[2]

Data science also integrates domain knowledge from the underlying application domain (e.g., natural sciences, information technology, and medicine).[3] Data science is multifaceted and can be described as a science, a research paradigm, a research method, a discipline, a workflow, and a profession.[4]

Data science is "a concept to unify statistics, data analysis, informatics, and their related methods" to "understand and analyze actual phenomena" with data.[5] It uses techniques and theories drawn from many fields within the context of mathematics, statistics, computer science, information science, and domain knowledge.[6] However, data science is different from computer science and information science. Turing Award winner Jim Gray imagined data science as a "fourth paradigm" of science (empirical, theoretical, computational, and now data-driven) and asserted that "everything about science is changing because of the impact of information technology" and the data deluge.[7][8]

A data scientist is a professional who creates programming code and combines it with statistical knowledge to summarize data.[9]''']

## Tokenization

In [31]:
tokens=[word_tokenize(t.lower()) for t in text]

print(f'Tokens: {tokens}')

Tokens: [['data', 'science', 'is', 'an', 'interdisciplinary', 'academic', 'field', '[', '1', ']', 'that', 'uses', 'statistics', ',', 'scientific', 'computing', ',', 'scientific', 'methods', ',', 'processing', ',', 'scientific', 'visualization', ',', 'algorithms', 'and', 'systems', 'to', 'extract', 'or', 'extrapolate', 'knowledge', 'from', 'potentially', 'noisy', ',', 'structured', ',', 'or', 'unstructured', 'data', '.', '[', '2', ']', 'data', 'science', 'also', 'integrates', 'domain', 'knowledge', 'from', 'the', 'underlying', 'application', 'domain', '(', 'e.g.', ',', 'natural', 'sciences', ',', 'information', 'technology', ',', 'and', 'medicine', ')', '.', '[', '3', ']', 'data', 'science', 'is', 'multifaceted', 'and', 'can', 'be', 'described', 'as', 'a', 'science', ',', 'a', 'research', 'paradigm', ',', 'a', 'research', 'method', ',', 'a', 'discipline', ',', 'a', 'workflow', ',', 'and', 'a', 'profession', '.', '[', '4', ']', 'data', 'science', 'is', '``', 'a', 'concept', 'to', 'unify'

## POS Tagging

In [32]:
pos_tags=[nltk.pos_tag(token) for token in tokens]

print(f'Pos-Tagging: {pos_tags}')

Pos-Tagging: [[('data', 'NNS'), ('science', 'NN'), ('is', 'VBZ'), ('an', 'DT'), ('interdisciplinary', 'JJ'), ('academic', 'JJ'), ('field', 'NN'), ('[', 'VBD'), ('1', 'CD'), (']', 'NN'), ('that', 'WDT'), ('uses', 'VBZ'), ('statistics', 'NNS'), (',', ','), ('scientific', 'JJ'), ('computing', 'NN'), (',', ','), ('scientific', 'JJ'), ('methods', 'NNS'), (',', ','), ('processing', 'NN'), (',', ','), ('scientific', 'JJ'), ('visualization', 'NN'), (',', ','), ('algorithms', 'NN'), ('and', 'CC'), ('systems', 'NNS'), ('to', 'TO'), ('extract', 'VB'), ('or', 'CC'), ('extrapolate', 'VB'), ('knowledge', 'NN'), ('from', 'IN'), ('potentially', 'RB'), ('noisy', 'JJ'), (',', ','), ('structured', 'JJ'), (',', ','), ('or', 'CC'), ('unstructured', 'JJ'), ('data', 'NNS'), ('.', '.'), ('[', '$'), ('2', 'CD'), (']', 'NNP'), ('data', 'NNS'), ('science', 'NN'), ('also', 'RB'), ('integrates', 'VBZ'), ('domain', 'VBP'), ('knowledge', 'NN'), ('from', 'IN'), ('the', 'DT'), ('underlying', 'VBG'), ('application', 'N

## Stop Word Removal

In [33]:
import string

stop_words = set(stopwords.words("english"))

no_stop_word = [[word for word in token_list if word not in stop_words and word not in string.punctuation] for token_list in tokens]
print("Tokens after stop words removal:", no_stop_word)


Tokens after stop words removal: [['data', 'science', 'interdisciplinary', 'academic', 'field', '1', 'uses', 'statistics', 'scientific', 'computing', 'scientific', 'methods', 'processing', 'scientific', 'visualization', 'algorithms', 'systems', 'extract', 'extrapolate', 'knowledge', 'potentially', 'noisy', 'structured', 'unstructured', 'data', '2', 'data', 'science', 'also', 'integrates', 'domain', 'knowledge', 'underlying', 'application', 'domain', 'e.g.', 'natural', 'sciences', 'information', 'technology', 'medicine', '3', 'data', 'science', 'multifaceted', 'described', 'science', 'research', 'paradigm', 'research', 'method', 'discipline', 'workflow', 'profession', '4', 'data', 'science', '``', 'concept', 'unify', 'statistics', 'data', 'analysis', 'informatics', 'related', 'methods', "''", '``', 'understand', 'analyze', 'actual', 'phenomena', "''", 'data', '5', 'uses', 'techniques', 'theories', 'drawn', 'many', 'fields', 'within', 'context', 'mathematics', 'statistics', 'computer', '

## Stemming

In [34]:
stemmer=PorterStemmer()
stemmed_tokens=[[stemmer.stem(words) for words in token] for token in tokens]
print("Stemming Token: ",stemmed_tokens)

Stemming Token:  [['data', 'scienc', 'is', 'an', 'interdisciplinari', 'academ', 'field', '[', '1', ']', 'that', 'use', 'statist', ',', 'scientif', 'comput', ',', 'scientif', 'method', ',', 'process', ',', 'scientif', 'visual', ',', 'algorithm', 'and', 'system', 'to', 'extract', 'or', 'extrapol', 'knowledg', 'from', 'potenti', 'noisi', ',', 'structur', ',', 'or', 'unstructur', 'data', '.', '[', '2', ']', 'data', 'scienc', 'also', 'integr', 'domain', 'knowledg', 'from', 'the', 'underli', 'applic', 'domain', '(', 'e.g.', ',', 'natur', 'scienc', ',', 'inform', 'technolog', ',', 'and', 'medicin', ')', '.', '[', '3', ']', 'data', 'scienc', 'is', 'multifacet', 'and', 'can', 'be', 'describ', 'as', 'a', 'scienc', ',', 'a', 'research', 'paradigm', ',', 'a', 'research', 'method', ',', 'a', 'disciplin', ',', 'a', 'workflow', ',', 'and', 'a', 'profess', '.', '[', '4', ']', 'data', 'scienc', 'is', '``', 'a', 'concept', 'to', 'unifi', 'statist', ',', 'data', 'analysi', ',', 'informat', ',', 'and', 't

## Lemmatization

In [35]:
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [[lemmatizer.lemmatize(word) for word in token] for token in tokens]
print("Lemmatized Tokens:", lemmatized_tokens)

Lemmatized Tokens: [['data', 'science', 'is', 'an', 'interdisciplinary', 'academic', 'field', '[', '1', ']', 'that', 'us', 'statistic', ',', 'scientific', 'computing', ',', 'scientific', 'method', ',', 'processing', ',', 'scientific', 'visualization', ',', 'algorithm', 'and', 'system', 'to', 'extract', 'or', 'extrapolate', 'knowledge', 'from', 'potentially', 'noisy', ',', 'structured', ',', 'or', 'unstructured', 'data', '.', '[', '2', ']', 'data', 'science', 'also', 'integrates', 'domain', 'knowledge', 'from', 'the', 'underlying', 'application', 'domain', '(', 'e.g.', ',', 'natural', 'science', ',', 'information', 'technology', ',', 'and', 'medicine', ')', '.', '[', '3', ']', 'data', 'science', 'is', 'multifaceted', 'and', 'can', 'be', 'described', 'a', 'a', 'science', ',', 'a', 'research', 'paradigm', ',', 'a', 'research', 'method', ',', 'a', 'discipline', ',', 'a', 'workflow', ',', 'and', 'a', 'profession', '.', '[', '4', ']', 'data', 'science', 'is', '``', 'a', 'concept', 'to', 'uni

## Term frequence and Inverse Document frequency

In [36]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X=vectorizer.fit_transform(text)
vectorizer.get_feature_names_out()

array(['about', 'academic', 'actual', 'algorithms', 'also', 'an',
       'analysis', 'analyze', 'and', 'application', 'as', 'asserted',
       'award', 'be', 'because', 'can', 'changing', 'code', 'combines',
       'computational', 'computer', 'computing', 'concept', 'context',
       'creates', 'data', 'deluge', 'described', 'different',
       'discipline', 'domain', 'drawn', 'driven', 'empirical',
       'everything', 'extract', 'extrapolate', 'field', 'fields',
       'fourth', 'from', 'gray', 'however', 'imagined', 'impact',
       'informatics', 'information', 'integrates', 'interdisciplinary',
       'is', 'it', 'jim', 'knowledge', 'many', 'mathematics', 'medicine',
       'method', 'methods', 'multifaceted', 'natural', 'noisy', 'now',
       'of', 'or', 'paradigm', 'phenomena', 'potentially', 'processing',
       'profession', 'professional', 'programming', 'related', 'research',
       'science', 'sciences', 'scientific', 'scientist', 'statistical',
       'statistics', 'struc

In [37]:


tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform([" ".join(doc) for doc in lemmatized_tokens])
feature_names = tfidf_vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

print("TF-IDF Representation:")
print(tfidf_df)


TF-IDF Representation:
      about  academic    actual  algorithm      also        an  analysis  \
0  0.035007  0.035007  0.035007   0.035007  0.035007  0.035007  0.035007   

    analyze       and  application  ...  understand     unify  unstructured  \
0  0.035007  0.455091     0.035007  ...    0.035007  0.035007      0.035007   

         us  visualization       who    winner      with    within  workflow  
0  0.070014       0.035007  0.035007  0.035007  0.070014  0.035007  0.035007  

[1 rows x 98 columns]
