In [143]:
# import libraries used
import re
from collections import Counter

import nltk
import numpy as np
import pandas as pd
from nltk import sent_tokenize, word_tokenize, RegexpTokenizer, pos_tag
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Tasks 

### a. To implement label encoding and one hot encoding on textual data

#### Performing preprocessing operations such as tokenization, punctuation removal and stop word removal before operating on the data


In [43]:
sentence = "This is a demo text used for testing the various built in methods of nltk library"

# Case uniformity
sentence = sentence.lower()

# Tokenization
word_tokens = word_tokenize(sentence)

# Stop word list (kept as a list under its original name) plus a set copy
# so that each membership test below is a constant-time lookup.
stopword = stopwords.words('english')
stopword_lookup = set(stopword)

# Punctuation removal and stop word removal in one pass:
#  - a token is kept only if it contains at least one letter or digit, which
#    drops pure punctuation tokens such as "," or "!" while keeping
#    hyphenated words;
#  - a token is dropped if it is an English stop word.
removing_stopwords = [
    word
    for word in word_tokens
    if any(ch.isalnum() for ch in word) and word not in stopword_lookup
]
print(removing_stopwords)

['demo', 'text', 'used', 'testing', 'various', 'built', 'methods', 'nltk', 'library']


#### Label Encoding:

In [85]:
# Breed names used as the categorical feature for the encoding demos.
# Capitalisation is mixed on purpose-or-by-accident in the source data; note
# that string sorting is case-sensitive, so capitalised names order before
# lowercase ones (visible in the label codes shown further down).
dog_types = ("affenpinscher", 
"Afghan hound", 
"Airedale terrier", 
"Akita", 
"Alaskan Malamute", 
"American Staffordshire terrier", 
"American water spaniel", 
"Australian cattle dog", 
"Australian shepherd", 
"Australian terrier", 
"basenji", 
"basset hound", 
"beagle", 
"bearded collie", 
"Bedlington terrier", 
"Bernese mountain dog", 
"bichon frise", 
"black and tan coonhound", 
"bloodhound", 
"border collie", 
"border terrier", 
"borzoi", 
"Boston terrier", 
"bouvier des Flandres", 
"boxer", 
"briard", 
"Brittany", 
"Brussels griffon", 
"bull terrier", 
"bulldog", 
"bullmastiff", 
"cairn terrier", 
"Canaan dog", 
"Chesapeake Bay retriever", 
"Chihuahua", 
"Chinese crested", 
"Chinese shar-pei", 
"chow chow", 
"Clumber spaniel", 
"cocker spaniel", 
"collie", 
"curly-coated retriever", 
"dachshund", 
"Dalmatian", 
"Doberman pinscher", 
"English cocker spaniel", 
"English setter", 
"English springer spaniel", 
"English toy spaniel", 
"Eskimo dog", 
"Finnish spitz", 
"flat-coated retriever", 
"fox terrier", 
"foxhound", 
"French bulldog", 
"German shepherd", 
"German shorthaired pointer", 
"German wirehaired pointer", 
"golden retriever", 
"Gordon setter", 
"Great Dane", 
"greyhound", 
"Irish setter", 
"Irish water spaniel", 
"Irish wolfhound", 
"Jack Russell terrier", 
"Japanese spaniel", 
"keeshond", 
"Kerry blue terrier", 
"komondor", 
"kuvasz", 
"Labrador retriever", 
"Lakeland terrier", 
"Lhasa apso", 
"Maltese", 
"Manchester terrier", 
"mastiff", 
"Mexican hairless", 
"Newfoundland", 
"Norwegian elkhound", 
"Norwich terrier", 
"otterhound", 
"papillon", 
"Pekingese", 
"pointer", 
"Pomeranian", 
"poodle", 
"pug", 
"puli", 
"Rhodesian ridgeback", 
"Rottweiler", 
"Saint Bernard", 
"saluki", 
"Samoyed", 
"schipperke", 
"schnauzer", 
"Scottish deerhound", 
"Scottish terrier", 
"Sealyham terrier", 
"Shetland sheepdog", 
"shih tzu", 
"Siberian husky", 
"silky terrier", 
"Skye terrier", 
"Staffordshire bull terrier", 
"soft-coated wheaten terrier", 
"Sussex spaniel", 
"spitz", 
"Tibetan terrier", 
"vizsla", 
"Weimaraner", 
"Welsh terrier", 
"West Highland white terrier", 
"whippet", 
"Yorkshire terrier")

# Single-column frame: one row per breed name.
dogs_df = pd.DataFrame({'Dog_Types': list(dog_types)})

# Label encoding: learn the distinct breed names and map each one to an
# integer code, then attach the codes as a second column.
labelencoder = LabelEncoder()
breed_codes = labelencoder.fit_transform(dogs_df['Dog_Types'])
dogs_df = dogs_df.assign(Dog_Types_Categories=breed_codes)

In [86]:
# Display the breed names next to their integer label codes.
dogs_df

Unnamed: 0,Dog_Types,Dog_Types_Categories
0,affenpinscher,68
1,Afghan hound,0
2,Airedale terrier,1
3,Akita,2
4,Alaskan Malamute,3
...,...,...
110,Weimaraner,64
111,Welsh terrier,65
112,West Highland white terrier,66
113,whippet,114


In [87]:
# Display only the integer label codes produced by the LabelEncoder.
dogs_df['Dog_Types_Categories']

0       68
1        0
2        1
3        2
4        3
      ... 
110     64
111     65
112     66
113    114
114     67
Name: Dog_Types_Categories, Length: 115, dtype: int32

#### One Hot Encoding:

#### Using sci-kit learn library approach:


In [88]:
# Creating instance of one-hot-encoder.
# handle_unknown='ignore': when an unknown category is encountered during
# transform, the resulting one-hot encoded columns for this feature are all
# zeros; in the inverse transform an unknown category is denoted as None.
enc = OneHotEncoder(handle_unknown='ignore')

# One-hot encode the Dog_Types_Categories column (the label-encoded values of
# Dog_Types). The encoder returns a sparse matrix, hence the .toarray() call.
enc_df = pd.DataFrame(enc.fit_transform(dogs_df[['Dog_Types_Categories']]).toarray())

# Join the one-hot columns onto the two base columns, aligned on the row index.
# Selecting the base columns first keeps this cell idempotent: joining onto a
# frame that already carries columns 0..N would raise a column-overlap error
# the second time the cell is run.
dogs_df = dogs_df[['Dog_Types', 'Dog_Types_Categories']].join(enc_df)


In [89]:
# Display the frame after the one-hot columns have been joined on.
dogs_df

Unnamed: 0,Dog_Types,Dog_Types_Categories,0,1,2,3,4,5,6,7,...,105,106,107,108,109,110,111,112,113,114
0,affenpinscher,68,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Afghan hound,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Airedale terrier,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Akita,2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Alaskan Malamute,3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110,Weimaraner,64,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
111,Welsh terrier,65,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
112,West Highland white terrier,66,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
113,whippet,114,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


#### Get Dummies method:

In [32]:
# Creating an initial dataframe.
# The breed names come from the `dog_types` tuple defined in the Label Encoding
# section; it is reused here rather than pasting the same literal a second time.
dogs_df = pd.DataFrame(dog_types, columns=['Dog_Types'])

# One indicator column per distinct breed, named "Type_is_<breed>".
# get_dummies drops the encoded source column, so dum_df holds only indicators.
dum_df = pd.get_dummies(dogs_df, columns=["Dog_Types"], prefix=["Type_is"])

# Join the indicator columns back onto the breed names (aligned on row index).
dogs_df = dogs_df.join(dum_df)

In [33]:
# Display the breed names together with their get_dummies indicator columns.
dogs_df

Unnamed: 0,Dog_Types,Type_is_Afghan hound,Type_is_Airedale terrier,Type_is_Akita,Type_is_Alaskan Malamute,Type_is_American Staffordshire terrier,Type_is_American water spaniel,Type_is_Australian cattle dog,Type_is_Australian shepherd,Type_is_Australian terrier,...,Type_is_puli,Type_is_saluki,Type_is_schipperke,Type_is_schnauzer,Type_is_shih tzu,Type_is_silky terrier,Type_is_soft-coated wheaten terrier,Type_is_spitz,Type_is_vizsla,Type_is_whippet
0,affenpinscher,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,Afghan hound,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,Airedale terrier,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,Akita,False,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,Alaskan Malamute,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110,Weimaraner,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
111,Welsh terrier,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
112,West Highland white terrier,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
113,whippet,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True


### b.	To implement Bag of Words (BoW) feature engineering technique on textual data

#### Using user defined function after preprocessing:

In [4]:
doc1 = 'Game of Thrones is an amazing tv series!'
doc2 = 'Game of Thrones is the best tv series!'
doc3 = 'Game of Thrones is so great'

# Preprocess each document the same way: lowercase it, replace every
# non-alphanumeric character with a space, then split on whitespace.
# (Requires the `re` module from the imports cell.)
l_doc1, l_doc2, l_doc3 = (
    re.sub(r"[^a-zA-Z0-9]", " ", doc.lower()).split()
    for doc in (doc1, doc2, doc3)
)

# Vocabulary = sorted union of the distinct words across all three documents.
wordset12 = np.union1d(l_doc1, l_doc2)
wordset = np.union1d(wordset12, l_doc3)
print(wordset)


['amazing' 'an' 'best' 'game' 'great' 'is' 'of' 'series' 'so' 'the'
 'thrones' 'tv']


In [5]:
def calculateBOW(wordset,l_doc):
    """Build the Bag-of-Words count vector of one document.

    Parameters
    ----------
    wordset : iterable of str
        The vocabulary. The result has exactly one entry per distinct
        vocabulary word, in vocabulary order.
    l_doc : iterable of str
        The tokens of the document.

    Returns
    -------
    dict
        Maps each vocabulary word to the number of times it occurs in
        ``l_doc`` (0 when absent). Tokens that are not in ``wordset`` are
        ignored, so every document yields a vector of the same width.
    """
    # Single pass over the document instead of one list.count() per token.
    occurrences = Counter(l_doc)
    # Counter returns 0 for words the document does not contain.
    return {word: occurrences[word] for word in wordset}

# One count vector per document, all measured against the shared vocabulary.
bow1, bow2, bow3 = (
    calculateBOW(wordset, tokens) for tokens in (l_doc1, l_doc2, l_doc3)
)

# Stack the three vectors into a document-term table (rows = documents).
df_bow = pd.DataFrame([bow1, bow2, bow3])
df_bow.head()

Unnamed: 0,amazing,an,best,game,great,is,of,series,so,the,thrones,tv
0,1,1,0,1,0,1,1,1,0,0,1,1
1,0,0,1,1,0,1,1,1,0,1,1,1
2,0,0,0,1,1,1,1,0,1,0,1,0


#### Using sci-kit learn library:

In [29]:
doc1 = 'Game of Thrones is an amazing tv series!'
doc2 = 'Game of Thrones is the best tv series!'
doc3 = 'Game of Thrones is so great'

# Unigram counts only; pass ngram_range=(2, 2) instead to count bigrams.
# stop_words='english' applies the vectorizer's built-in English stop-word list.
CountVec = CountVectorizer(stop_words='english', ngram_range=(1, 1))

# Learn the vocabulary and build the document-term count matrix in one step.
corpus = [doc1, doc2, doc3]
Count_data = CountVec.fit_transform(corpus)

# Dense table with one column per vocabulary term and one row per document.
feature_names = CountVec.get_feature_names_out()
cv_dataframe = pd.DataFrame(Count_data.toarray(), columns=feature_names)
cv_dataframe.head()

Unnamed: 0,amazing,best,game,great,series,thrones,tv
0,1,0,1,0,1,1,1
1,0,1,1,0,1,1,1
2,0,0,1,1,0,1,0


### c.	To implement TF-IDF feature engineering technique

In [48]:
documents = ["Inflation has increased unemployment", 
             "The company has increased its sales", 
              "Fear increased his pulse"]

# Preprocessing
def return_preprocessed_document(document):
    """Lowercase a document and strip English stop words from it.

    Parameters
    ----------
    document : str
        The raw text of one document.

    Returns
    -------
    str
        The remaining tokens, in their original order, joined by single spaces.
    """
    # Load the stop-word list once per call (not once per token) and keep it
    # in a set so each membership test is a constant-time lookup.
    english_stopwords = set(stopwords.words("english"))

    # Case uniformity + word tokenization
    words = word_tokenize(document.lower())

    # Stop word removal
    kept_words = [word for word in words if word not in english_stopwords]

    # Forming the complete sentence using string join
    return " ".join(kept_words)

documents = [return_preprocessed_document(document) for document in documents]

In [49]:
# Display the documents after lowercasing and stop-word removal.
documents

['inflation increased unemployment',
 'company increased sales',
 'fear increased pulse']

In [51]:
# Creation of a TF-IDF model using the TfidfVectorizer class.
# fit_transform learns the vocabulary and IDF weights from the preprocessed
# documents and returns their TF-IDF matrix in a single step.

vectorizer = TfidfVectorizer()
tfidf_model = vectorizer.fit_transform(documents)
# Printed as one "(row, column)  weight" entry per non-zero cell; row is the
# document index and column is the index of the term in the vocabulary.
print(tfidf_model)  

  (0, 6)	0.652490884512534
  (0, 2)	0.3853716274664007
  (0, 3)	0.652490884512534
  (1, 5)	0.652490884512534
  (1, 0)	0.652490884512534
  (1, 2)	0.3853716274664007
  (2, 4)	0.652490884512534
  (2, 1)	0.652490884512534
  (2, 2)	0.3853716274664007


In [53]:
# Dense view of the TF-IDF matrix: one row per document, one named column per term.
tfidf_feature_names = vectorizer.get_feature_names_out()
pd.DataFrame(tfidf_model.toarray(), columns=tfidf_feature_names)

Unnamed: 0,company,fear,increased,inflation,pulse,sales,unemployment
0,0.0,0.0,0.385372,0.652491,0.0,0.0,0.652491
1,0.652491,0.0,0.385372,0.0,0.0,0.652491,0.0
2,0.0,0.652491,0.385372,0.0,0.652491,0.0,0.0


#### From the above output, we can infer the following:
##### We created a TF-IDF model in which each sample sentence is converted into a row of a matrix. Words that are distinctive to a single document receive higher weights (0.652491): "inflation" and "unemployment" in sentence 1, "company" and "sales" in sentence 2, and "fear" and "pulse" in sentence 3.
#### The word "increased", which occurs in all three documents, is assigned a lower weight (0.385372).

#### Additional task of classifying documents:

In [109]:
# Peek at the raw whitespace-separated tokens of the input file.
# The file is decoded as UTF-8: the "mbcs" codec exists only on Windows and
# maps to the local ANSI code page, which garbles non-ASCII characters such as
# typographic apostrophes. The context manager closes the file even if
# reading fails.
with open("textfile.txt", 'r', encoding="utf-8") as file:
    for i in file.read().split():
        print(i)

Medical:
Hospital
Emergency
Room
(ER)
Intensive
Care
Unit
(ICU)
Operating
Room
(OR)
Exam
Diagnosis
Prescription
Urine
sample
Blood
sample
Hypertension
Cast
Vein
Syringe
Painkiller/pain
reliever
Numb
Dosage
Biopsy
(of
abnormal
cells)
Finanace:
1.
Amortization:
Amortization
is
a
method
of
spreading
an
intangible
asset's
cost
over
the
course
of
its
useful
life.
Intangible
assets
are
non-physical
assets
that
are
essential
to
a
company,
such
as
a
trademark,
patent,
copyright,
or
franchise
agreement.
2.
Assets:
Assets
are
items
you
own
that
can
provide
future
benefit
to
your
business,
such
as
cash,
inventory,
real
estate,
office
equipment,
or
accounts
receivable,
which
are
payments
due
to
a
company
by
its
customers.
There
are
different
types
of
assets,
including:
Current
Assets:
Which
can
be
converted
to
cash
within
a
year
Fixed
Assets:
Which
canâ€™t
immediately
be
turned
into
cash,
but
are
tangible
items
that
a
company
owns
and
uses
to
generate
long-term
income
3.
Asset
Allocation:
Asset
al

In [99]:
# We take a text file containing medical and finance related words.
# We need to read the text file and classify which field the document belongs to.
# This can be done by counting the words relating to finance and medical and
# seeing which count is larger (simplest method).
#
# First step: count how often every whitespace-separated token occurs.
# The file is decoded as UTF-8 ("mbcs" is a Windows-only codec that garbles
# non-ASCII characters), and the context manager guarantees it is closed.
with open("textfile.txt", 'r', encoding="utf-8") as file:
    # dict(Counter(...)) keeps first-seen order and prints as a plain dict.
    d = dict(Counter(file.read().split()))
print(d)

{'Medical:': 1, 'Hospital': 1, 'Emergency': 1, 'Room': 2, '(ER)': 1, 'Intensive': 1, 'Care': 1, 'Unit': 1, '(ICU)': 1, 'Operating': 2, '(OR)': 1, 'Exam': 1, 'Diagnosis': 1, 'Prescription': 1, 'Urine': 1, 'sample': 2, 'Blood': 1, 'Hypertension': 1, 'Cast': 1, 'Vein': 1, 'Syringe': 1, 'Painkiller/pain': 1, 'reliever': 1, 'Numb': 1, 'Dosage': 1, 'Biopsy': 1, '(of': 1, 'abnormal': 1, 'cells)': 1, 'Finanace:': 1, '1.': 1, 'Amortization:': 1, 'Amortization': 1, 'is': 11, 'a': 29, 'method': 1, 'of': 23, 'spreading': 1, 'an': 9, 'intangible': 1, "asset's": 1, 'cost': 1, 'over': 4, 'the': 26, 'course': 1, 'its': 3, 'useful': 1, 'life.': 1, 'Intangible': 1, 'assets': 3, 'are': 8, 'non-physical': 1, 'that': 7, 'essential': 1, 'to': 17, 'company,': 2, 'such': 3, 'as': 6, 'trademark,': 1, 'patent,': 1, 'copyright,': 1, 'or': 9, 'franchise': 1, 'agreement.': 1, '2.': 1, 'Assets:': 3, 'Assets': 2, 'items': 2, 'you': 9, 'own': 1, 'can': 7, 'provide': 2, 'future': 1, 'benefit': 1, 'your': 5, 'business,

In [140]:
# Keyword lists for each field. Matching below is case-insensitive, so the
# mixed capitalisation of the entries does not matter.
medical_words = ["Medical", "Prescription", "hospital", "health", "exam", "Blood"]
finance_words = ["Invest", "market", "payment", "Withdraw", "Cash", "Depreciation", "Equity"]


def count_keywords(tokens, keywords):
    """Count how often each keyword occurs among ``tokens``.

    Parameters
    ----------
    tokens : iterable of str
        Lowercased words of the document.
    keywords : iterable of str
        Keywords to look for; compared case-insensitively, whole words only.

    Returns
    -------
    dict
        Maps every lowercased keyword to its number of occurrences
        (0 when the keyword never appears).
    """
    counts = {keyword.lower(): 0 for keyword in keywords}
    for token in tokens:
        if token in counts:
            counts[token] += 1
    return counts


# Read the file once. UTF-8 is used because "mbcs" is a Windows-only codec;
# the context manager closes the file even if reading fails.
with open("textfile.txt", 'r', encoding="utf-8") as file:
    text = file.read()

# Lowercase the text and turn every non-alphanumeric character into a space,
# so that tokens such as "cash," or "Medical:" match their keyword.
tokens = "".join(ch if ch.isalnum() else " " for ch in text.lower()).split()

med_d = count_keywords(tokens, medical_words)
finan_d = count_keywords(tokens, finance_words)

print(med_d)
print()
print()
print(finan_d)

# Classify by the total number of keyword hits per field.
# A tie falls through to "medical", as in the original comparison.
if sum(finan_d.values()) > sum(med_d.values()):
    print("The document is related to finance")
else:
    print("The document is related to medical")


The document is related to finance


#### Hence from content stored in the textfile, we can see that the document is related to finance.