TFIDF = [Term Frequency-Inverse Document Frequency](https://en.wikipedia.org/wiki/Tf%E2%80%93idf)

In [45]:
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import os
from string import punctuation
from datetime import datetime

# Directory that holds the blog XML files (trailing slash is relied on when
# paths are built by string concatenation).
DIRNAME = 'blogs/'

# Tokens removed during cleaning: NLTK's English stopword list, every single
# punctuation character, and a handful of multi-character tokens.
english_stopwords = {
    *stopwords.words('english'),
    *punctuation,
    '..', '...', '....', '``', "''", '/n',
}
    

In [46]:
def clean_wordlist(filelist):
    '''
    Build one lemmatized, stopword-free token list from a set of blog files.

    Each file is parsed as XML, the text of every <post> element is
    lower-cased, and the posts of all files are tokenized together.

    Parameters
    ----------
    filelist : iterable of str
        Paths of the XML files to read.

    Returns
    -------
    list of str
        Lemmatized tokens in reading order. Tokens found in the module-level
        ``english_stopwords`` set are dropped *before* lemmatization. The
        list is empty when ``filelist`` is empty or no <post> is found.
    '''
    lemmatizer = WordNetLemmatizer()
    posts = []
    for eachfile in filelist:
        # The context manager closes the handle as soon as the file is read;
        # errors='replace' substitutes undecodable bytes instead of raising.
        with open(eachfile, encoding='utf8', errors='replace') as handle:
            soup = BeautifulSoup(handle.read(), features='xml')
        posts.extend(post.text.lower() for post in soup.find_all('post'))
    # Join with a real newline (not the two characters '/n') so the last word
    # of one post can never fuse with the first word of the next, including
    # across file boundaries.
    corpus = '\n'.join(posts)
    return [lemmatizer.lemmatize(word) for word in word_tokenize(corpus)
            if word not in english_stopwords]
        

In [47]:
# os.listdir() returns entries in arbitrary order; sorting makes the file
# list, and any later slicing or indexing of it, reproducible across runs
# and machines. Hidden entries (names starting with '.') are skipped.
blogfiles = [DIRNAME + fn for fn in sorted(os.listdir(DIRNAME))
             if not fn.startswith('.')]

In [48]:
# Print the wall-clock time before and after the cleaning step so its
# duration can be read off the two output lines.
print(f'{datetime.now():%H:%M:%S}')
# One space-joined string of cleaned tokens per blog file (first 1000 files).
allblogs = [
    ' '.join(clean_wordlist([blogfile]))
    for blogfile in blogfiles[:1000]
]
print(f'{datetime.now():%H:%M:%S}')

11:08:15
11:08:37


In [51]:
# Spot-check one cleaned document (index 575) to eyeball the preprocessing.
allblogs[575]

"introverted 62.5 extroverted e 37.5 imaginative n 64.71 realistic 35.29 emotional f 63.64 intellectual 36.36 easygoing p 73.33 organized j 26.67 type infp idealist possible profession include information-graphics designer college professor researcher legal mediator social worker holistic health practitioner occupational therapist diversity manager human resource development specialist employment development specialist minister/priest/rabbi missionary psychologist writer urllink get ca n't wish used 's way time n't take seriously 've fallen house fallen 's one else around stand two foot fight stay incomplete thought heard sound water drop fell ground turned around pick pace tear mixed rain ran face gathered left strength held arm length set ran away come back fight day write remember hate month december must paste look face act way n't feel world decide 's better hide one know real 's hard see would took mask began heal try relax let go die 's thing know sometimes stop ca n't take anym

In [56]:
# Learn the vocabulary and IDF weights from the cleaned blogs and compute
# their TF-IDF scores in a single step.
tfidf_vectorizer = TfidfVectorizer(use_idf=True)
tfidf_vectors = tfidf_vectorizer.fit_transform(allblogs)
# Creates a very large matrix: one row per document, one column per term.
# It is presumably a sparse matrix, since it is densified with .toarray()
# before being wrapped in a DataFrame.

In [57]:
# Convert the TF-IDF result into a dense array. Every document/term cell is
# materialized, so memory use grows with documents x vocabulary size.
as_array = tfidf_vectors.toarray()

In [61]:
# Vocabulary terms; used as the column labels for the dense TF-IDF array.
feature_names = tfidf_vectorizer.get_feature_names_out()
# Newer versions of scikit-learn name this method get_feature_names_out();
# the older spelling lacked the "_out" suffix.

In [63]:
# Term-by-document table: one row per vocabulary term (row label = term),
# one column per document (column label = document position).
df = pd.DataFrame(as_array.T, index=feature_names)

In [64]:
# Display the term-by-document TF-IDF table.
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
00,0.0,0.057854,0.0,0.0,0.002331,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.006462,0.0,0.0,0.006001
000,0.0,0.000000,0.0,0.0,0.007195,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.040186,0.0,0.000000,0.0,0.0,0.003087
0000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000
000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000
000000000001,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
黄河大合唱第一乐章,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000
黄河船夫曲,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000
ａｖｏｉｄ,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000
ｇｏｏｄ,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000


In [68]:
# Show a positional slice of term rows starting at row 10000.
# NOTE(review): the upper bound 100010 looks like a typo for 10010 (ten
# rows); as written it selects roughly 90,000 rows - confirm the intent.
df[10000:100010]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
bahrua,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,...,0.000000,0.0,0.00000,0.00000,0.000000,0.0,0.000000,0.000000,0.0,0.000000
bahstun,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,...,0.000000,0.0,0.00000,0.00000,0.000000,0.0,0.000000,0.000000,0.0,0.000000
baht,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,...,0.000000,0.0,0.00000,0.00000,0.000000,0.0,0.000000,0.000000,0.0,0.000000
bahumbug,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,...,0.000000,0.0,0.00000,0.00000,0.000000,0.0,0.000000,0.000000,0.0,0.000000
bahumut,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,...,0.000000,0.0,0.00000,0.00000,0.000000,0.0,0.000000,0.000000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
weeel,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,...,0.000000,0.0,0.00000,0.00000,0.000000,0.0,0.000000,0.000000,0.0,0.000000
weehee,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,...,0.000000,0.0,0.00000,0.00000,0.000000,0.0,0.000000,0.000000,0.0,0.000000
weehooo,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,...,0.000000,0.0,0.00000,0.00000,0.000000,0.0,0.000000,0.000000,0.0,0.000000
week,0.0,0.004671,0.0,0.0,0.058725,0.008726,0.005694,0.0,0.007398,0.0,...,0.045014,0.0,0.01019,0.06085,0.020501,0.0,0.064432,0.016502,0.0,0.015747
