In [57]:
import pandas as pd

In [58]:
documentA = 'the man went out for a walk'
documentB = 'the children sat around the fire'

In [59]:
bagOfWordsA = documentA.split(' ')
bagOfWordsB = documentB.split(' ')

In [60]:
bagOfWordsA

['the', 'man', 'went', 'out', 'for', 'a', 'walk']

In [61]:
uniqueWords = set(bagOfWordsA).union(set(bagOfWordsB))
uniqueWords

{'a',
 'around',
 'children',
 'fire',
 'for',
 'man',
 'out',
 'sat',
 'the',
 'walk',
 'went'}

In [62]:
numOfWordsA = dict.fromkeys(uniqueWords, 0)
numOfWordsA

{'the': 0,
 'man': 0,
 'went': 0,
 'around': 0,
 'walk': 0,
 'fire': 0,
 'a': 0,
 'for': 0,
 'out': 0,
 'children': 0,
 'sat': 0}

In [63]:
for word in bagOfWordsA:
    numOfWordsA[word] += 1
numOfWordsA

{'the': 1,
 'man': 1,
 'went': 1,
 'around': 0,
 'walk': 1,
 'fire': 0,
 'a': 1,
 'for': 1,
 'out': 1,
 'children': 0,
 'sat': 0}

In [64]:
numOfWordsB = dict.fromkeys(uniqueWords, 0)

for word in bagOfWordsB:
    numOfWordsB[word] += 1

numOfWordsB

{'the': 2,
 'man': 0,
 'went': 0,
 'around': 1,
 'walk': 0,
 'fire': 1,
 'a': 0,
 'for': 0,
 'out': 0,
 'children': 1,
 'sat': 1}

In [65]:
def computeTF(wordDict, bagOfWords):
    tfDict = {}
    bagOfWordsCount = len(bagOfWords)
    for word, count in wordDict.items():
        tfDict[word] = count / float(bagOfWordsCount)
    return tfDict

In [66]:
tfA = computeTF(numOfWordsA,bagOfWordsA)
tfA

{'the': 0.14285714285714285,
 'man': 0.14285714285714285,
 'went': 0.14285714285714285,
 'around': 0.0,
 'walk': 0.14285714285714285,
 'fire': 0.0,
 'a': 0.14285714285714285,
 'for': 0.14285714285714285,
 'out': 0.14285714285714285,
 'children': 0.0,
 'sat': 0.0}

In [67]:
tfB = computeTF(numOfWordsB, bagOfWordsB)
tfB

{'the': 0.3333333333333333,
 'man': 0.0,
 'went': 0.0,
 'around': 0.16666666666666666,
 'walk': 0.0,
 'fire': 0.16666666666666666,
 'a': 0.0,
 'for': 0.0,
 'out': 0.0,
 'children': 0.16666666666666666,
 'sat': 0.16666666666666666}

In [68]:
def computeIDF(documents):
    import math
    N = len(documents)
    
    idfDict = dict.fromkeys(documents[0].keys(), 0)
    for document in documents:
        for word, val in document.items():
            if val > 0:
                idfDict[word] += 1
    
    for word, val in idfDict.items():
        idfDict[word] = math.log(N / float(val))
    return idfDict

In [69]:
idfs = computeIDF([numOfWordsA, numOfWordsB])
idfs

{'the': 0.0,
 'man': 0.6931471805599453,
 'went': 0.6931471805599453,
 'around': 0.6931471805599453,
 'walk': 0.6931471805599453,
 'fire': 0.6931471805599453,
 'a': 0.6931471805599453,
 'for': 0.6931471805599453,
 'out': 0.6931471805599453,
 'children': 0.6931471805599453,
 'sat': 0.6931471805599453}

In [70]:
def computeTFIDF(tfBagOfWords, idfs):
    tfidf = {}
    for word, val in tfBagOfWords.items():
        tfidf[word] = val * idfs[word]
    return tfidf

In [71]:
tfidfA = computeTFIDF(tfA, idfs)
tfidfA

{'the': 0.0,
 'man': 0.09902102579427789,
 'went': 0.09902102579427789,
 'around': 0.0,
 'walk': 0.09902102579427789,
 'fire': 0.0,
 'a': 0.09902102579427789,
 'for': 0.09902102579427789,
 'out': 0.09902102579427789,
 'children': 0.0,
 'sat': 0.0}

In [72]:
tfidfB = computeTFIDF(tfB, idfs)
tfidfB

{'the': 0.0,
 'man': 0.0,
 'went': 0.0,
 'around': 0.11552453009332421,
 'walk': 0.0,
 'fire': 0.11552453009332421,
 'a': 0.0,
 'for': 0.0,
 'out': 0.0,
 'children': 0.11552453009332421,
 'sat': 0.11552453009332421}

In [73]:
df = pd.DataFrame([tfidfA, tfidfB])
df.head()

Unnamed: 0,the,man,went,around,walk,fire,a,for,out,children,sat
0,0.0,0.099021,0.099021,0.0,0.099021,0.0,0.099021,0.099021,0.099021,0.0,0.0
1,0.0,0.0,0.0,0.115525,0.0,0.115525,0.0,0.0,0.0,0.115525,0.115525


In [74]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform([documentA, documentB])
print(vectors)

  (0, 8)	0.42615959880289433
  (0, 3)	0.42615959880289433
  (0, 5)	0.42615959880289433
  (0, 9)	0.42615959880289433
  (0, 4)	0.42615959880289433
  (0, 7)	0.3032160644503863
  (1, 2)	0.40740123733358447
  (1, 0)	0.40740123733358447
  (1, 6)	0.40740123733358447
  (1, 1)	0.40740123733358447
  (1, 7)	0.5797386715376657


In [50]:
feature_names = vectorizer.get_feature_names()
feature_names

AttributeError: 'TfidfVectorizer' object has no attribute 'get_feature_names'

In [51]:
dense = vectors.todense()
dense

matrix([[0.        , 0.        , 0.        , 0.4261596 , 0.4261596 ,
         0.4261596 , 0.        , 0.30321606, 0.4261596 , 0.4261596 ],
        [0.40740124, 0.40740124, 0.40740124, 0.        , 0.        ,
         0.        , 0.40740124, 0.57973867, 0.        , 0.        ]])

In [52]:
denselist = dense.tolist()
df = pd.DataFrame(denselist, columns=['children','walk','a','man','around','sat','out','the','went','for'])
df.head()

Unnamed: 0,children,walk,a,man,around,sat,out,the,went,for
0,0.0,0.0,0.0,0.42616,0.42616,0.42616,0.0,0.303216,0.42616,0.42616
1,0.407401,0.407401,0.407401,0.0,0.0,0.0,0.407401,0.579739,0.0,0.0


In [53]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer



data = pd.read_csv('Questions.csv', sep=',')
data.head()

Unnamed: 0.1,Unnamed: 0,Id,OwnerUserId,CreationDate,Score,Title,Body
0,0,469,147.0,2008-08-02T15:11:16Z,21,How can I find the full path to a font from it...,<p>I am using the Photoshop's javascript API t...
1,1,502,147.0,2008-08-02T17:01:58Z,27,Get a preview JPEG of a PDF on Windows?,<p>I have a cross-platform (Python) applicatio...
2,2,535,154.0,2008-08-02T18:43:54Z,40,Continuous Integration System for a Python Cod...,<p>I'm starting work on a hobby project with a...
3,3,594,116.0,2008-08-03T01:15:08Z,25,cx_Oracle: How do I iterate over a result set?,<p>There are several ways to iterate over a re...
4,4,683,199.0,2008-08-03T13:19:16Z,28,Using 'in' to match an attribute of Python obj...,<p>I don't remember whether I was dreaming or ...


In [54]:
len(data)

10000

In [55]:
X = data['Title']
y = data['Score']

VecModel = TfidfVectorizer()
X_Vec = VecModel.fit_transform(X)
X_Vec = pd.DataFrame.sparse.from_spmatrix(X_Vec)

print(f'The new shape for X is {X_Vec.shape}')
X_Vec.head()

The new shape for X is (10000, 6870)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,6860,6861,6862,6863,6864,6865,6866,6867,6868,6869
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [56]:
for i in range(5) : 
    print(sorted(list(X_Vec.iloc[i,:]),reverse = True)[:20])
    print('--------------------------------------------------')

[0.422178222176151, 0.3716395905037442, 0.32955184359276674, 0.32666367121717127, 0.3035936514537762, 0.28427933869442007, 0.27713713250250954, 0.2615059045427987, 0.18645448981827298, 0.18597326273423462, 0.17584122839792515, 0.14999446597627894, 0.12187283198225851, 0.11197776573544506, 0, 0, 0, 0, 0, 0]
--------------------------------------------------
[0.5426412233100997, 0.5289190014000659, 0.4187346787909365, 0.2942839268145828, 0.27016796115062247, 0.2329935255908615, 0.19122427073824194, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
--------------------------------------------------
[0.5645125193070204, 0.5215694064886434, 0.4786262936702665, 0.3573365447207418, 0.20580239279118884, 0.10081572147562867, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
--------------------------------------------------
[0.5090625627208644, 0.45228021334724877, 0.45228021334724877, 0.3752194639691171, 0.33991401883074684, 0.22478750070301515, 0.15773369219653524, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
-------