# TF-IDF From Scratch

In [1]:
import pandas as pd 
import numpy as np

# Load only the two columns needed for TF-IDF.
# `usecols` ignores the order of the names it is given (columns come back in
# file order), so the stemmed-token column is renamed by NAME rather than by
# position; a positional rename would silently swap "label" and "tweet" if the
# CSV stored them the other way round.
TWEET_DATA = pd.read_csv(
    "Text_Preprocessing.csv", usecols=["label", "tweet_tokens_stemmed"]
).rename(columns={"tweet_tokens_stemmed": "tweet"})

TWEET_DATA.head()

Unnamed: 0,label,tweet
0,0,"['maspiyuuu', 'bantu', 'bekas', 'anies', 'basw..."
1,1,"['lisaamartatara', 'fadlizon', 'bangga', 'bang..."
2,1,"['rustamibrahim', 'polling', 'gubernur', 'jawa..."
3,1,"['ojok', 'ampe', 'anies', 'baswedan', 'nikmat'..."
4,0,"['ghanieierfan', 'temu', 'haru', 'aniesbasweda..."


In [2]:
# convert list-formatted string to list
import ast

def convert_text_list(texts):
    """Parse the text of a Python list literal into a real list.

    ``texts`` is a string such as ``"['a', 'b']"``. It is evaluated with
    ``ast.literal_eval`` (literals only, no arbitrary code) and the parsed
    items are returned in a new list, in their original order.
    """
    parsed = ast.literal_eval(texts)
    return list(parsed)

# Parse every stringified token list in "tweet" and store the result in a new
# "tweet_list" column (the original "tweet" column is left untouched).
TWEET_DATA["tweet_list"] = TWEET_DATA["tweet"].apply(convert_text_list)


# Spot-check row 90: print the parsed tokens ...
print(TWEET_DATA["tweet_list"][90])

# ... and the Python type of the parsed value.
print("\ntype : ", type(TWEET_DATA["tweet_list"][90]))

['geiszchalifah', 'bangun', 'era', 'anies', 'penting', 'masyarakat', 'bangun', 'gratis', 'utk', 'rakyat', 'uang', 'nya', 'dr', 'nbeda', 'ono', 'kpd', 'amp', 'untung', 'cukong', 'nmakanya', 'downgrade']

type :  <class 'list'>


In [3]:
def calc_TF(document):
    """Return the relative term frequency of every term in one document.

    ``document`` is a sequence of tokens. The result maps each distinct token
    to (occurrences of the token) / (total number of tokens), with keys in
    order of first appearance. An empty document yields an empty dict.
    """
    # First pass: raw occurrence counts.
    occurrences = {}
    for token in document:
        occurrences[token] = occurrences.get(token, 0) + 1

    # Second pass: scale each count by the document length.
    return {
        token: count / len(document)
        for token, count in occurrences.items()
    }

# Compute one term-frequency dict per tweet and keep it in a "TF_dict" column.
TWEET_DATA["TF_dict"] = TWEET_DATA['tweet_list'].apply(calc_TF)

# Preview the first rows of the new column.
TWEET_DATA["TF_dict"].head()

0    {'maspiyuuu': 0.07142857142857142, 'bantu': 0....
1    {'lisaamartatara': 0.07142857142857142, 'fadli...
2    {'rustamibrahim': 0.07142857142857142, 'pollin...
3    {'ojok': 0.08333333333333333, 'ampe': 0.083333...
4    {'ghanieierfan': 0.07692307692307693, 'temu': ...
Name: TF_dict, dtype: object

In [4]:
# Check TF result: print every term of one tweet next to its term frequency.
index = 90  # row of TWEET_DATA to inspect

# Header row; '%20s' right-aligns the text in a 20-character column.
print('%20s' % "term", "\t", "TF\n")
# Iterating the TF dict yields its terms (keys).
for key in TWEET_DATA["TF_dict"][index]:
    print('%20s' % key, "\t", TWEET_DATA["TF_dict"][index][key])

                term 	 TF

       geiszchalifah 	 0.047619047619047616
              bangun 	 0.09523809523809523
                 era 	 0.047619047619047616
               anies 	 0.047619047619047616
             penting 	 0.047619047619047616
          masyarakat 	 0.047619047619047616
              gratis 	 0.047619047619047616
                 utk 	 0.047619047619047616
              rakyat 	 0.047619047619047616
                uang 	 0.047619047619047616
                 nya 	 0.047619047619047616
                  dr 	 0.047619047619047616
               nbeda 	 0.047619047619047616
                 ono 	 0.047619047619047616
                 kpd 	 0.047619047619047616
                 amp 	 0.047619047619047616
              untung 	 0.047619047619047616
              cukong 	 0.047619047619047616
            nmakanya 	 0.047619047619047616
           downgrade 	 0.047619047619047616


In [5]:
def calc_DF(tfDict):
    """Count, for every term, the number of documents that contain it.

    ``tfDict`` is an iterable of per-document dicts keyed by term (each key
    appears once per document, so every document adds at most 1 per term).
    Returns a dict mapping term -> document frequency, with keys in order of
    first appearance.
    """
    doc_frequency = {}
    for per_doc_terms in tfDict:
        for token in per_doc_terms:
            doc_frequency[token] = doc_frequency.get(token, 0) + 1
    return doc_frequency

# Document frequency of every term, built from the per-tweet TF dicts.
DF = calc_DF(TWEET_DATA["TF_dict"])

In [6]:
# Number of rows in TWEET_DATA; passed to calc_IDF as the document count.
n_document = len(TWEET_DATA)

def calc_IDF(__n_document, __DF):
    """Compute the inverse document frequency of every term.

    ``__n_document`` is the total number of documents and ``__DF`` maps each
    term to its document frequency. Each term is mapped to
    ``log(__n_document / (df + 1))`` using the natural logarithm; the ``+ 1``
    in the denominator means the value is negative when ``df + 1`` exceeds
    ``__n_document``.
    """
    return {
        token: np.log(__n_document / (doc_count + 1))
        for token, doc_count in __DF.items()
    }
  
# Store the IDF dictionary (term -> IDF weight) at notebook level;
# calc_TF_IDF reads this name when it is called.
IDF = calc_IDF(n_document, DF)

In [7]:
#calc TF-IDF
def calc_TF_IDF(TF, idf=None):
    """Weight one document's term frequencies by inverse document frequency.

    Parameters
    ----------
    TF : dict
        Maps each term of a single document to its term frequency.
    idf : dict, optional
        Maps each term to its IDF weight. When omitted, the notebook-level
        ``IDF`` dictionary is used, so ``Series.apply(calc_TF_IDF)`` keeps
        working unchanged. Passing it explicitly removes the dependency on
        hidden notebook state.

    Returns
    -------
    dict
        Maps each term in ``TF`` to ``TF[term] * idf[term]``.

    Raises
    ------
    KeyError
        If a term in ``TF`` has no entry in the IDF mapping.
    """
    if idf is None:
        # Looked up at call time so a re-run of the IDF cell is picked up.
        idf = IDF
    return {term: tf_value * idf[term] for term, tf_value in TF.items()}

# Store the per-tweet TF-IDF dicts (term -> TF * IDF) in a "TF-IDF_dict" column.
TWEET_DATA["TF-IDF_dict"] = TWEET_DATA["TF_dict"].apply(calc_TF_IDF)

In [8]:
# Check TF-IDF result: print TF and TF-IDF side by side for one tweet.
index = 90  # row of TWEET_DATA to inspect

# Header row; the %-widths right-align each heading in a fixed-width column.
print('%20s' % "term", "\t", '%10s' % "TF", "\t", '%20s' % "TF-IDF\n")
# Both dicts are read with the same key, taken from the TF-IDF dict.
for key in TWEET_DATA["TF-IDF_dict"][index]:
    print('%20s' % key, "\t", TWEET_DATA["TF_dict"][index][key] ,"\t" , TWEET_DATA["TF-IDF_dict"][index][key])


                term 	         TF 	              TF-IDF

       geiszchalifah 	 0.047619047619047616 	 0.3565731050581192
              bangun 	 0.09523809523809523 	 0.4547495147096688
                 era 	 0.047619047619047616 	 0.28295765235755843
               anies 	 0.047619047619047616 	 0.15689952663115753
             penting 	 0.047619047619047616 	 0.2649689616166619
          masyarakat 	 0.047619047619047616 	 0.20988274600184767
              gratis 	 0.047619047619047616 	 0.3183661301759793
                 utk 	 0.047619047619047616 	 0.19737626788157656
              rakyat 	 0.047619047619047616 	 0.17481754982266384
                uang 	 0.047619047619047616 	 0.2322302300527719
                 nya 	 0.047619047619047616 	 0.1754665417776927
                  dr 	 0.047619047619047616 	 0.21247607538910393
               nbeda 	 0.047619047619047616 	 0.426398679762568
                 ono 	 0.047619047619047616 	 0.3860511626012727
                 kpd 	 0.0476

In [9]:
# Sort the (term, document frequency) pairs of DF by frequency, highest first,
# and keep only the first 50. The result is a list of tuples, not a dict.
sorted_DF = sorted(DF.items(), key=lambda kv: kv[1], reverse=True)[:50]

# Keep just the terms of `sorted_DF`, in the same order; this list defines the
# column order of the TF-IDF vectors.
unique_term = [item[0] for item in sorted_DF]

def calc_TF_IDF_Vec(__TF_IDF_Dict, terms=None):
    """Turn one document's TF-IDF dict into a fixed-length vector.

    Parameters
    ----------
    __TF_IDF_Dict : dict
        Maps each term of a single document to its TF-IDF weight.
    terms : sequence of str, optional
        Vocabulary that defines the vector's positions. When omitted, the
        notebook-level ``unique_term`` list is used, so
        ``Series.apply(calc_TF_IDF_Vec)`` keeps working unchanged. Passing it
        explicitly removes the dependency on hidden notebook state.

    Returns
    -------
    list of float
        One entry per vocabulary term, in vocabulary order: the document's
        TF-IDF weight for that term, or ``0.0`` if the document lacks it.
    """
    if terms is None:
        # Looked up at call time so a re-run of the vocabulary cell is picked up.
        terms = unique_term
    return [__TF_IDF_Dict.get(term, 0.0) for term in terms]

# Build one TF-IDF vector per tweet, with one position per term of unique_term.
TWEET_DATA["TF_IDF_Vec"] = TWEET_DATA["TF-IDF_dict"].apply(calc_TF_IDF_Vec)

# Show the vector of the first tweet.
print("print first row matrix TF_IDF_Vec Series\n")
print(TWEET_DATA["TF_IDF_Vec"][0])

# Its length equals the number of terms kept in unique_term.
print("\nmatrix size : ", len(TWEET_DATA["TF_IDF_Vec"][0]))

print first row matrix TF_IDF_Vec Series

[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2199601102972785, 0.0, 0.0, 0.22999663077827417, 0.0, 0.2353492899467363, 0.0, 0.0, 0.0, 0.2503403900656905, 0.0, 0.0, 0.2565368848091658, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.29783301251384003, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]

matrix size :  50


In [10]:
# Stack the per-tweet vectors (a Series of lists) into one NumPy array,
# one row per tweet and one column per term of unique_term.
TF_IDF_Vec_List = np.array(TWEET_DATA["TF_IDF_Vec"].to_list())

# Sum along axis=0, i.e. over tweets, giving one total per term column.
sums = TF_IDF_Vec_List.sum(axis=0)

# (term, summed TF-IDF) pairs; column `col` of the matrix belongs to unique_term[col].
data = []

for col, term in enumerate(unique_term):
    data.append((term, sums[col]))

# The sorted frame is only displayed as the cell output; `ranking` itself stays unsorted.
ranking = pd.DataFrame(data, columns=['term', 'rank'])
ranking.sort_values('rank', ascending=False)

Unnamed: 0,term,rank
0,jokowi,885.006775
1,banjir,392.070549
3,nkri,342.263361
2,bakar,329.04863
4,aniesbaswedan,316.057243
5,hutan,314.320327
6,indonesia,300.811656
8,ya,277.269327
7,jakarta,274.140503
11,anies,262.612076


# --------------------------------------------------------
# TF-IDF menggunakan Scikit-Learn

In [11]:
import pandas as pd 
import numpy as np

# Reload the two columns needed for the scikit-learn section.
# `usecols` ignores the order of the names it is given (columns come back in
# file order), so the stemmed-token column is renamed by NAME rather than by
# position; a positional rename would silently swap "label" and "tweet" if the
# CSV stored them the other way round.
TWEET_DATA = pd.read_csv(
    "Text_Preprocessing.csv", usecols=["label", "tweet_tokens_stemmed"]
).rename(columns={"tweet_tokens_stemmed": "tweet"})

TWEET_DATA.head()

Unnamed: 0,label,tweet
0,0,"['maspiyuuu', 'bantu', 'bekas', 'anies', 'basw..."
1,1,"['lisaamartatara', 'fadlizon', 'bangga', 'bang..."
2,1,"['rustamibrahim', 'polling', 'gubernur', 'jawa..."
3,1,"['ojok', 'ampe', 'anies', 'baswedan', 'nikmat'..."
4,0,"['ghanieierfan', 'temu', 'haru', 'aniesbasweda..."


In [12]:
# join the list of tokens into a single document string
import ast

def join_text_list(texts):
    """Parse the text of a list literal and join its tokens with spaces.

    ``texts`` is a string such as ``"['a', 'b']"``. It is evaluated with
    ``ast.literal_eval`` and the resulting tokens are concatenated into one
    space-separated string (``'a b'``).
    """
    tokens = ast.literal_eval(texts)
    return ' '.join(tokens)
# Store each tweet as one space-separated string in a "tweet_join" column;
# the scikit-learn vectorizers in the cells that follow are fitted on it.
TWEET_DATA["tweet_join"] = TWEET_DATA["tweet"].apply(join_text_list)

# Preview the first rows of the joined text.
TWEET_DATA["tweet_join"].head()

0    maspiyuuu bantu bekas anies baswedan layan war...
1    lisaamartatara fadlizon bangga banggain buzzer...
2    rustamibrahim polling gubernur jawa ridwan kam...
3    ojok ampe anies baswedan nikmat hsil proyek mo...
4    ghanieierfan temu haru aniesbaswedan tsuneo ya...
Name: tweet_join, dtype: object

### TF-IDF Scikit-Learn L2 Normalization

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Number of terms to keep: the top `max_features` terms,
# ranked by term frequency across the whole corpus.
max_features = 1000

# Feature Engineering
print ("------- TF-IDF on Tweet data -------")

# NOTE(review): `norm` is not passed, so the library default applies (the
# section title says L2 - confirm against the scikit-learn docs).
# NOTE(review): binary=True is passed; per the scikit-learn docs this makes the
# term-frequency part 0/1 rather than a count - confirm this is intended.
tf_idf = TfidfVectorizer(max_features=max_features, binary=True)
# Fit on the joined tweets and convert the result to a dense NumPy array.
tfidf_mat = tf_idf.fit_transform(TWEET_DATA["tweet_join"]).toarray()

print("TF-IDF ", type(tfidf_mat), tfidf_mat.shape)

------- TF-IDF on Tweet data -------
TF-IDF  <class 'numpy.ndarray'> (23225, 1000)


In [20]:
# Vocabulary terms, in the column order of `tfidf_mat`.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# use `get_feature_names_out()` when available and fall back on older releases.
if hasattr(tf_idf, "get_feature_names_out"):
    terms = list(tf_idf.get_feature_names_out())
else:
    terms = tf_idf.get_feature_names()

# sum the TF-IDF weight of each term over all documents
sums = tfidf_mat.sum(axis=0)

# pair each term with its summed weight (column `i` of the matrix is terms[i])
data = list(zip(terms, sums))

# The sorted frame is only displayed as the cell output; `ranking` itself stays unsorted.
ranking = pd.DataFrame(data, columns=['term','rank'])
ranking.sort_values('rank', ascending=False)

Unnamed: 0,term,rank
406,jokowi,1127.836120
102,banjir,467.411138
647,nkri,373.962467
92,bakar,363.434469
53,aniesbaswedan,362.952288
...,...,...
973,wafiul,17.451018
71,asal,17.266324
278,free,15.746759
983,west,14.867343


### TF-IDF Scikit-Learn L1 Normalization

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import normalize

# Number of vocabulary terms kept by both vectorizers below.
max_features = 1000

# calc TF vector: raw term counts per tweet
cvect = CountVectorizer(max_features=max_features)
TF_vector = cvect.fit_transform(TWEET_DATA["tweet_join"])

# normalize TF vector: L1 norm along axis=1, i.e. per row (per tweet)
normalized_TF_vector = normalize(TF_vector, norm='l1', axis=1)

# calc IDF: this vectorizer is fitted to obtain its `idf_` attribute;
# `tfs` is not used again within this cell.
tfidf = TfidfVectorizer(max_features=max_features, smooth_idf=False)
tfs = tfidf.fit_transform(TWEET_DATA["tweet_join"])
IDF_vector = tfidf.idf_

# multiply TF by IDF to obtain the TF-IDF matrix / vector
# NOTE(review): assumes `cvect` and `tfidf` produce the same vocabulary in the
# same column order (same data, same max_features) - TODO confirm.
tfidf_mat = normalized_TF_vector.multiply(IDF_vector).toarray()

### TF-IDF Scikit-Learn L1 Norm unigram, bigram, trigram

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import normalize

# Number of vocabulary entries kept by both vectorizers below.
max_features = 1000

# ngram_range (1, 3) to use unigram, bigram, trigram
cvect = CountVectorizer(max_features=max_features, ngram_range=(1,3))
# Raw n-gram counts per tweet.
counts = cvect.fit_transform(TWEET_DATA["tweet_join"])

# L1-normalise along axis=1, i.e. per row (per tweet).
normalized_counts = normalize(counts, norm='l1', axis=1)

# Fitted to obtain `idf_`; `tfs` is not used again within this cell.
tfidf = TfidfVectorizer(max_features=max_features, ngram_range=(1,3), smooth_idf=False)
tfs = tfidf.fit_transform(TWEET_DATA["tweet_join"])

# TF x IDF as a dense array.
# NOTE(review): assumes `cvect` and `tfidf` produce the same vocabulary in the
# same column order (same data, same parameters) - TODO confirm.
tfidf_mat = normalized_counts.multiply(tfidf.idf_).toarray()

In [24]:
# Display the fitted vocabulary of `tfidf`.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# use `get_feature_names_out()` when available (wrapped in list() so the cell
# still shows a plain Python list) and fall back on older releases.
(
    list(tfidf.get_feature_names_out())
    if hasattr(tfidf, "get_feature_names_out")
    else tfidf.get_feature_names()
)

['abud',
 'acara',
 'action',
 'ada',
 'adil',
 'adu',
 'aewin',
 'agama',
 'agung',
 'ahli',
 'ahok',
 'aibon',
 'aiekesthreem',
 'air',
 'airinnz',
 'aja',
 'ajak',
 'ajar',
 'akibat',
 'akibat bakar',
 'aksi',
 'aku',
 'akun',
 'alam',
 'alami',
 'alas',
 'alat',
 'alhamdulillah',
 'alih',
 'alir',
 'allah',
 'ama',
 'aman',
 'ambil',
 'amin',
 'amp',
 'an',
 'anak',
 'anakkolong',
 'anaklolina',
 'ancam',
 'andre',
 'anggap',
 'anggar',
 'anggota',
 'angkasapura',
 'angkasapura jokowi',
 'angkasapura jokowi kemenbumn',
 'angkat',
 'anies',
 'anies baswedan',
 'aniesbaswedan',
 'aniesbaswedan dkijakarta',
 'aniesbaswedan jokowi',
 'aniesbeliberitamediaonline',
 'aniesfokuskerja',
 'aniesgabisakerja',
 'anieskerjanyata',
 'aniesmundur',
 'anis',
 'anti',
 'antisipasi',
 'anyendia',
 'apa',
 'api',
 'arah',
 'ariestariico',
 'artfleck',
 'aryprasetyo',
 'asal',
 'asap',
 'asing',
 'asli',
 'atas',
 'atas banjir',
 'atur',
 'au',
 'australia',
 'awas',
 'ayo',
 'baca',
 'bacot',
 'badj