## Term weighting using TF.IDF

### A way to represent 'aboutness'  
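Identify high-frequency terms, but also account for scarcity: a term that occurs in only a few documents is weighted more heavily than one that occurs almost everywhere.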

In [2]:
# usual imports
import nltk
import string
import csv
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.snowball import SnowballStemmer
# download the Punkt sentence tokenizer data, then load the English model into a
# module-level `tokenizer` that divide_into_sentences_nltk() relies on
# NOTE(review): newer NLTK releases may no longer ship the pickled Punkt model —
# confirm this resource path against the installed NLTK version
nltk.download('punkt')
tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [3]:
# helper functions
def read_in_csv(csv_file):
    """Return the first column of a comma-separated file as a list of strings.

    Args:
        csv_file: Path to a UTF-8 encoded CSV file; fields may be quoted with '"'.

    Returns:
        A list holding the first field of every non-empty row, in file order.
    """
    # newline='' leaves newline handling to the csv module, as its documentation recommends
    with open(csv_file, 'r', encoding='utf-8', newline='') as fp:
        reader = csv.reader(fp, delimiter=',', quotechar='"')
        # csv.reader yields [] for blank lines; row[0] on those would raise IndexError
        data_read = [row[0] for row in reader if row]
    return data_read

def read_text_file(filename):
    """Read a whole UTF-8 text file and return its contents as a single string.

    Args:
        filename: Path of the file to read.

    Returns:
        The full text of the file.
    """
    # the context manager closes the handle even if read() raises
    with open(filename, "r", encoding="utf-8") as file:
        return file.read()

def preprocess_text(text):
    """Flatten `text` onto one line by turning every newline into a single space."""
    # split on "\n" and re-join with spaces: one space per original newline
    return " ".join(text.split("\n"))

def divide_into_sentences_nltk(text):
    """Split `text` into sentences using the module-level `tokenizer`.

    Returns whatever `tokenizer.tokenize(text)` produces.
    """
    return tokenizer.tokenize(text)

def get_sentences(filename):
    """Load a text file and return it split into sentences.

    The file is read in full, its newlines are replaced by spaces, and the
    flattened text is handed to the sentence tokenizer.
    """
    raw_text = read_text_file(filename)
    return divide_into_sentences_nltk(preprocess_text(raw_text))

In [9]:
# upload stopwords.csv and NLP_syllabus_blog_post.txt to colab
# (Colab-specific: outside Colab the google.colab package is presumably unavailable,
# so place both files in the working directory instead and skip this cell)
from google.colab import files
# opens the upload prompt; uploaded files are saved under their own names
uploaded = files.upload()

Saving NLP_syllabus_blog_post.txt to NLP_syllabus_blog_post.txt


In [7]:
# use a custom stemmer and stop words list
# NOTE(review): `stemmer` is not referenced by any cell visible here — confirm it is used later
stemmer = SnowballStemmer('english')
stopwords_file_path = "stopwords.csv"
# read_in_csv keeps only the first column of each row in the file
stopword_list = read_in_csv(stopwords_file_path)
# display how many stop words were loaded
len(stopword_list)

476

In [11]:
# split the blog post into sentences; each sentence is treated as one "document"
# when the vectorizer is fitted further down
sentences = get_sentences("NLP_syllabus_blog_post.txt")
sentences

['I am currently starting work on developing an undergraduate module in Natural Language Processing (level 6, 3rd year).',
 'Although I have been involved in the field of NLP for many years, recent times have witnessed a transformation of the field, not just in terms of its academic foundations, but also its practical application in industry and its attractiveness as a fulfilling and rewarding career choice.',
 'My sense is that some of the topics which I originally studied for my doctorate retain their appeal since the key ideas remain relevant despite radical changes in the implementation.',
 'However, others are more hostage to the technological fortunes of deep learning and other neural/distributional approaches.',
 'My view is that field benefits by being informed by more than one perspective: computer/data science may be a given, but cognitive science, information science and linguistics all have their contributions to make.',
 'Clearly, it is a tricky task to pack all this into 

In [12]:
# Build a TF.IDF vectorizer over single words (unigrams) and fit it to the sentences.
# max_df / min_df are document-frequency cutoffs given as proportions, and the
# custom stop word list is removed before weighting.
tfidf_vectorizer = TfidfVectorizer(
    stop_words=stopword_list,
    ngram_range=(1, 1),
    min_df=0.05,
    max_df=0.90,
    max_features=200000,
    use_idf=True,
).fit(sentences)  # fit() returns the fitted vectorizer itself



In [13]:
# as before, but freq count is TF * IDF
# printing the sparse result lists each non-zero entry as
# (sentence index, term index) followed by its TF.IDF weight
tfidf_matrix = tfidf_vectorizer.transform(sentences)
print(tfidf_matrix)

  (0, 72)	0.28867513459481287
  (0, 71)	0.28867513459481287
  (0, 68)	0.28867513459481287
  (0, 57)	0.28867513459481287
  (0, 48)	0.28867513459481287
  (0, 41)	0.28867513459481287
  (0, 40)	0.28867513459481287
  (0, 37)	0.28867513459481287
  (0, 35)	0.28867513459481287
  (0, 19)	0.28867513459481287
  (0, 16)	0.28867513459481287
  (0, 1)	0.28867513459481287
  (1, 73)	0.22115877821995306
  (1, 70)	0.22115877821995306
  (1, 66)	0.22115877821995306
  (1, 64)	0.22115877821995306
  (1, 61)	0.22115877821995306
  (1, 54)	0.22115877821995306
  (1, 50)	0.22115877821995306
  (1, 47)	0.18358077888296445
  (1, 43)	0.22115877821995306
  (1, 33)	0.22115877821995306
  (1, 30)	0.22115877821995306
  (1, 25)	0.22115877821995306
  (1, 24)	0.22115877821995306
  :	:
  (4, 69)	0.21627962051261088
  (4, 55)	0.6488388615378327
  (4, 46)	0.1795306589672365
  (4, 39)	0.21627962051261088
  (4, 38)	0.21627962051261088
  (4, 32)	0.21627962051261088
  (4, 31)	0.21627962051261088
  (4, 26)	0.21627962051261088
  (4, 2

In [14]:
# it's bigger, but why?
# shape is (number of sentences, number of vocabulary terms kept by the vectorizer)
tfidf_matrix.shape

(7, 74)

In [15]:
# expand the sparse matrix into a dense one so every cell, including the zeros, is shown
dense_matrix = tfidf_matrix.todense()
print(dense_matrix)

[[0.         0.28867513 0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.28867513 0.
  0.         0.28867513 0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.28867513
  0.         0.28867513 0.         0.         0.28867513 0.28867513
  0.         0.         0.         0.         0.         0.
  0.28867513 0.         0.         0.         0.         0.
  0.         0.         0.         0.28867513 0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.28867513 0.         0.         0.28867513
  0.28867513 0.        ]
 [0.         0.         0.22115878 0.         0.22115878 0.
  0.22115878 0.         0.22115878 0.         0.22115878 0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.367

In [16]:
# densifying changes how the values are stored, not the dimensions
dense_matrix.shape

(7, 74)

In [17]:
# list the vocabulary learned by the vectorizer; position i in this list is
# column i of the TF.IDF matrix
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so
# prefer get_feature_names_out() and fall back only on older releases
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    # .tolist() turns the returned ndarray into plain Python strings,
    # so the printed output matches the old list form
    feature_names = tfidf_vectorizer.get_feature_names_out().tolist()
else:
    feature_names = tfidf_vectorizer.get_feature_names()
print(feature_names)

['10', '3rd', 'academic', 'appeal', 'application', 'approaches', 'attractiveness', 'benefits', 'career', 'changes', 'choice', 'clearly', 'cognitive', 'computer', 'contributions', 'current', 'currently', 'data', 'deep', 'developing', 'distributional', 'doctorate', 'field', 'fortunes', 'foundations', 'fulfilling', 'given', 'hostage', 'ideas', 'implementation', 'industry', 'information', 'informed', 'involved', 'key', 'language', 'learning', 'level', 'linguistics', 'make', 'module', 'natural', 'neural', 'nlp', 'originally', 'pack', 'perspective', 'practical', 'processing', 'radical', 'recent', 'relevant', 'remain', 'retain', 'rewarding', 'science', 'sense', 'starting', 'studied', 'task', 'technological', 'terms', 'theoretical', 'thinking', 'times', 'topics', 'transformation', 'tricky', 'undergraduate', 'view', 'witnessed', 'work', 'year', 'years']


