In [1]:
import pandas as pd
import sklearn as sk
import math 

In [2]:
block_1 = "Our aim is to develop a good work culture among students, a culture where students from various technical backgrounds come together to teach, guide and collaborate with each other on various projects and grow together."

block_2 = "Keeping in mind the interest of the IT professionals and computer enthusiasts, CSI works towards making the profession an area of choice amongst all sections of the society. The promotion of Information Technology as a profession is the top priority of CSI today. To fulfill this objective, the CSI regularly organizes conferences, conventions, lectures, projects, and awards. And at the same time, it also ensures that regular training and skill updating are organized for the future IT professionals."

#split so each word have their own string
first_block = block_1.split(" ")
second_block = block_2.split(" ")

#join them to remove common duplicate words
total= set(first_block).union(set(second_block))

print(total)

{'IT', 'organizes', 'conventions,', 'together.', 'professionals', 'aim', 'various', 'as', 'conferences,', 'The', 'Information', 'training', 'enthusiasts,', 'towards', 'an', 'mind', 'profession', 'objective,', 'skill', 'works', 'making', 'guide', 'from', 'students,', 'technical', 'lectures,', 'that', 'on', 'computer', 'of', 'good', 'in', 'collaborate', 'top', 'today.', 'to', 'fulfill', 'priority', 'time,', 'professionals.', 'a', 'sections', 'Our', 'it', 'this', 'regularly', 'and', 'amongst', 'other', 'updating', 'with', 'area', 'And', 'for', 'develop', 'come', 'teach,', 'society.', 'also', 'projects', 'all', 'each', 'at', 'future', 'students', 'backgrounds', 'among', 'CSI', 'are', 'awards.', 'is', 'Keeping', 'together', 'projects,', 'same', 'Technology', 'culture', 'regular', 'promotion', 'grow', 'choice', 'work', 'the', 'organized', 'where', 'ensures', 'To', 'interest'}


In [3]:
wordDictA = dict.fromkeys(total, 0) 
wordDictB = dict.fromkeys(total, 0)

for word in first_block:
    wordDictA[word]+=1
    
for word in second_block:
    wordDictB[word]+=1

In [4]:
pd.DataFrame([wordDictA, wordDictB])

Unnamed: 0,IT,organizes,"conventions,",together.,professionals,aim,various,as,"conferences,",The,...,promotion,grow,choice,work,the,organized,where,ensures,To,interest
0,0,0,0,1,0,1,2,0,0,0,...,0,1,0,1,0,0,1,0,0,0
1,2,1,1,0,1,0,0,1,1,1,...,1,0,1,0,8,1,0,1,1,1


In [5]:
def computeTF(wordDict, doc):
    tfDict = {}
    corpusCount = len(doc)
    for word, count in wordDict.items():tfDict[word] = count/float(corpusCount)
    return(tfDict)

#running our sentences through the tf function:

tfFirst = computeTF(wordDictA, first_block)

tfSecond = computeTF(wordDictB, second_block)

#Converting to dataframe for visualization

tf = pd.DataFrame([tfFirst, tfSecond])

In [6]:
# Now we'll remove stopwords from the list

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))
filtered_sentence = [w for w in wordDictA if not w in stop_words]

print(filtered_sentence)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
['IT', 'organizes', 'conventions,', 'together.', 'professionals', 'aim', 'various', 'conferences,', 'The', 'Information', 'training', 'enthusiasts,', 'towards', 'mind', 'profession', 'objective,', 'skill', 'works', 'making', 'guide', 'students,', 'technical', 'lectures,', 'computer', 'good', 'collaborate', 'top', 'today.', 'fulfill', 'priority', 'time,', 'professionals.', 'sections', 'Our', 'regularly', 'amongst', 'updating', 'area', 'And', 'develop', 'come', 'teach,', 'society.', 'also', 'projects', 'future', 'students', 'backgrounds', 'among', 'CSI', 'awards.', 'Keeping', 'together', 'projects,', 'Technology', 'culture', 'regular', 'promotion', 'grow', 'choice', 'work', 'organized', 'ensures', 'To', 'interest']


In [7]:
# Now moving onto the IDF Part

def computeIDF(docList):
    idfDict = {}
    N = len(docList)   
    idfDict = dict.fromkeys(docList[0].keys(), 0)
    for word, val in idfDict.items(): idfDict[word] = math.log10(N / (float(val) + 1))     
    return(idfDict)

#inputing our sentences in the log file
idfs = computeIDF([wordDictA, wordDictB])

In [8]:
# Now we'll implement the IDF formula

def computeTFIDF(tfBow, idfs):
    tfidf = {}
    for word, val in tfBow.items(): tfidf[word] = val*idfs[word]
    return(tfidf)

#running our two sentences through the IDF:
idfFirst = computeTFIDF(tfFirst, idfs)
idfSecond = computeTFIDF(tfSecond, idfs)

#putting it in a dataframe
idf= pd.DataFrame([idfFirst, idfSecond])

print(idf)

         IT  organizes  conventions,  together.  professionals       aim  \
0  0.000000   0.000000      0.000000   0.008601       0.000000  0.008601   
1  0.007719   0.003859      0.003859   0.000000       0.003859  0.000000   

    various        as  conferences,       The  ...  promotion      grow  \
0  0.017202  0.000000      0.000000  0.000000  ...   0.000000  0.008601   
1  0.000000  0.003859      0.003859  0.003859  ...   0.003859  0.000000   

     choice      work       the  organized     where   ensures        To  \
0  0.000000  0.008601  0.000000   0.000000  0.008601  0.000000  0.000000   
1  0.003859  0.000000  0.030875   0.003859  0.000000  0.003859  0.003859   

   interest  
0  0.000000  
1  0.003859  

[2 rows x 88 columns]


In [9]:
# Above way was the generic/formulatic way of implementing TFIDF, This process can be made way more simpler by using
# sklearn library, example given below

from sklearn.feature_extraction.text import TfidfVectorizer

#Make sure all words are in lowercase

version_1 = "Developing a competitive culture where the students polish technical and professional attributes, gain experience and learn new skills while upgrading the already present skillset. For those fledglings who have a zeal to build a strong profile and are hunting for their Ikigai, CSI provides ample opportunities for those individuals too."
version_2 = "Personalized career guidance, Regular Logic and aptitude building activities, Industrial level project collaboration, Building a network with active collaborations across the globe, Periodic member exclusive conferences and seminars, Created a community for sharing skills and knowledge"

#calling the TfidfVectorizer
vectorize= TfidfVectorizer()

#fitting the model and passing our sentences right away:
response= vectorize.fit_transform([version_1.lower(), version_2.lower()])

In [10]:
print(response)

  (0, 61)	0.13915271943780658
  (0, 31)	0.13915271943780658
  (0, 40)	0.13915271943780658
  (0, 4)	0.13915271943780658
  (0, 48)	0.13915271943780658
  (0, 18)	0.13915271943780658
  (0, 30)	0.13915271943780658
  (0, 58)	0.13915271943780658
  (0, 29)	0.13915271943780658
  (0, 7)	0.13915271943780658
  (0, 46)	0.13915271943780658
  (0, 54)	0.13915271943780658
  (0, 9)	0.13915271943780658
  (0, 60)	0.13915271943780658
  (0, 67)	0.13915271943780658
  (0, 28)	0.13915271943780658
  (0, 65)	0.13915271943780658
  (0, 23)	0.13915271943780658
  (0, 59)	0.27830543887561315
  (0, 24)	0.2970249178760062
  (0, 53)	0.13915271943780658
  (0, 44)	0.13915271943780658
  (0, 3)	0.13915271943780658
  (0, 62)	0.13915271943780658
  (0, 64)	0.13915271943780658
  :	:
  (1, 21)	0.16649349332910351
  (1, 37)	0.16649349332910351
  (1, 41)	0.16649349332910351
  (1, 26)	0.16649349332910351
  (1, 0)	0.16649349332910351
  (1, 13)	0.16649349332910351
  (1, 1)	0.16649349332910351
  (1, 66)	0.16649349332910351
  (1, 38)	0