# TF-IDF From Scratch

In [1]:
import pandas as pd 
import numpy as np

TWEET_DATA = pd.read_csv("Text_Preprocessing_v2.csv", usecols=["label", "tweet_tokens_stemmed"])
TWEET_DATA.columns = ["label", "tweet"]

TWEET_DATA.head()

Unnamed: 0,label,tweet
0,1,"['lisaamartatara', 'fadlizon', 'bangga', 'bang..."
1,1,"['rustamibrahim', 'polling', 'gubernur', 'jawa..."
2,1,"['ojok', 'ampe', 'anies', 'baswedan', 'nikmat'..."
3,0,"['ghanieierfan', 'temu', 'haru', 'aniesbasweda..."
4,1,"['sebastianteza', 'jangn', 'nympek', 'anies', ..."


In [2]:
# convert list formated string to list
import ast

def convert_text_list(texts):
    texts = ast.literal_eval(texts)
    return [text for text in texts]

TWEET_DATA["tweet_list"] = TWEET_DATA["tweet"].apply(convert_text_list)


print(TWEET_DATA["tweet_list"][90])

print("\ntype : ", type(TWEET_DATA["tweet_list"][90]))

['idtodayco', 'media', 'bungkam', 'tutup', 'dampak', 'banjir', 'jateng', 'gubernur', 'pdipnhttpstcoxoamyep']

type :  <class 'list'>


In [3]:
def calc_TF(document):
    # Counts the number of times the word appears in review
    TF_dict = {}
    for term in document:
        if term in TF_dict:
            TF_dict[term] += 1
        else:
            TF_dict[term] = 1
    # Computes tf for each word
    for term in TF_dict:
        TF_dict[term] = TF_dict[term] / len(document)
    return TF_dict

TWEET_DATA["TF_dict"] = TWEET_DATA['tweet_list'].apply(calc_TF)

TWEET_DATA["TF_dict"].head()

0    {'maspiyuuu': 0.07142857142857142, 'bantu': 0....
1    {'lisaamartatara': 0.07142857142857142, 'fadli...
2    {'rustamibrahim': 0.07142857142857142, 'pollin...
3    {'ojok': 0.08333333333333333, 'ampe': 0.083333...
4    {'ghanieierfan': 0.07692307692307693, 'temu': ...
Name: TF_dict, dtype: object

In [4]:
# Check TF result
index = 90

print('%20s' % "term", "\t", "TF\n")
for key in TWEET_DATA["TF_dict"][index]:
    print('%20s' % key, "\t", TWEET_DATA["TF_dict"][index][key])

                term 	 TF

       geiszchalifah 	 0.047619047619047616
              bangun 	 0.09523809523809523
                 era 	 0.047619047619047616
               anies 	 0.047619047619047616
             penting 	 0.047619047619047616
          masyarakat 	 0.047619047619047616
              gratis 	 0.047619047619047616
                 utk 	 0.047619047619047616
              rakyat 	 0.047619047619047616
                uang 	 0.047619047619047616
                 nya 	 0.047619047619047616
                  dr 	 0.047619047619047616
               nbeda 	 0.047619047619047616
                 ono 	 0.047619047619047616
                 kpd 	 0.047619047619047616
                 amp 	 0.047619047619047616
              untung 	 0.047619047619047616
              cukong 	 0.047619047619047616
            nmakanya 	 0.047619047619047616
           downgrade 	 0.047619047619047616


In [5]:
def calc_DF(tfDict):
    count_DF = {}
    # Run through each document's tf dictionary and increment countDict's (term, doc) pair
    for document in tfDict:
        for term in document:
            if term in count_DF:
                count_DF[term] += 1
            else:
                count_DF[term] = 1
    return count_DF

DF = calc_DF(TWEET_DATA["TF_dict"])

In [6]:
n_document = len(TWEET_DATA)

def calc_IDF(__n_document, __DF):
    IDF_Dict = {}
    for term in __DF:
        IDF_Dict[term] = np.log(__n_document / (__DF[term] + 1))
    return IDF_Dict
  
#Stores the idf dictionary
IDF = calc_IDF(n_document, DF)

In [7]:
#calc TF-IDF
def calc_TF_IDF(TF):
    TF_IDF_Dict = {}
    #For each word in the review, we multiply its tf and its idf.
    for key in TF:
        TF_IDF_Dict[key] = TF[key] * IDF[key]
    return TF_IDF_Dict

#Stores the TF-IDF Series
TWEET_DATA["TF-IDF_dict"] = TWEET_DATA["TF_dict"].apply(calc_TF_IDF)

In [8]:
# Check TF-IDF result
index = 90

print('%20s' % "term", "\t", '%10s' % "TF", "\t", '%20s' % "TF-IDF\n")
for key in TWEET_DATA["TF-IDF_dict"][index]:
    print('%20s' % key, "\t", TWEET_DATA["TF_dict"][index][key] ,"\t" , TWEET_DATA["TF-IDF_dict"][index][key])


                term 	         TF 	              TF-IDF

       geiszchalifah 	 0.047619047619047616 	 0.3565731050581192
              bangun 	 0.09523809523809523 	 0.4547495147096688
                 era 	 0.047619047619047616 	 0.28295765235755843
               anies 	 0.047619047619047616 	 0.15689952663115753
             penting 	 0.047619047619047616 	 0.2649689616166619
          masyarakat 	 0.047619047619047616 	 0.20988274600184767
              gratis 	 0.047619047619047616 	 0.3183661301759793
                 utk 	 0.047619047619047616 	 0.19737626788157656
              rakyat 	 0.047619047619047616 	 0.17481754982266384
                uang 	 0.047619047619047616 	 0.2322302300527719
                 nya 	 0.047619047619047616 	 0.1754665417776927
                  dr 	 0.047619047619047616 	 0.21247607538910393
               nbeda 	 0.047619047619047616 	 0.426398679762568
                 ono 	 0.047619047619047616 	 0.3860511626012727
                 kpd 	 0.0476

In [9]:
# sort descending by value for DF dictionary 
sorted_DF = sorted(DF.items(), key=lambda kv: kv[1], reverse=True)[:50]

# Create a list of unique words from sorted dictionay `sorted_DF`
unique_term = [item[0] for item in sorted_DF]

def calc_TF_IDF_Vec(__TF_IDF_Dict):
    TF_IDF_vector = [0.0] * len(unique_term)

    # For each unique word, if it is in the review, store its TF-IDF value.
    for i, term in enumerate(unique_term):
        if term in __TF_IDF_Dict:
            TF_IDF_vector[i] = __TF_IDF_Dict[term]
    return TF_IDF_vector

TWEET_DATA["TF_IDF_Vec"] = TWEET_DATA["TF-IDF_dict"].apply(calc_TF_IDF_Vec)

print("print first row matrix TF_IDF_Vec Series\n")
print(TWEET_DATA["TF_IDF_Vec"][0])

print("\nmatrix size : ", len(TWEET_DATA["TF_IDF_Vec"][0]))

print first row matrix TF_IDF_Vec Series

[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2199601102972785, 0.0, 0.0, 0.22999663077827417, 0.0, 0.2353492899467363, 0.0, 0.0, 0.0, 0.2503403900656905, 0.0, 0.0, 0.2565368848091658, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.29783301251384003, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]

matrix size :  50


In [10]:
# Convert Series to List
TF_IDF_Vec_List = np.array(TWEET_DATA["TF_IDF_Vec"].to_list())

# Sum element vector in axis=0 
sums = TF_IDF_Vec_List.sum(axis=0)

data = []

for col, term in enumerate(unique_term):
    data.append((term, sums[col]))
    
ranking = pd.DataFrame(data, columns=['term', 'rank'])
ranking.sort_values('rank', ascending=False)

Unnamed: 0,term,rank
0,jokowi,885.006775
1,banjir,392.070549
3,nkri,342.263361
2,bakar,329.04863
4,aniesbaswedan,316.057243
5,hutan,314.320327
6,indonesia,300.811656
8,ya,277.269327
7,jakarta,274.140503
11,anies,262.612076


# --------------------------------------------------------
# TF-IDF menggunakan Scikit-Learn

In [1]:
import pandas as pd 
import numpy as np

TWEET_DATA = pd.read_csv("Text_Preprocessing.csv", usecols=["label", "tweet_tokens_stemmed"])
TWEET_DATA.columns = ["label", "tweet"]

TWEET_DATA.head()

Unnamed: 0,label,tweet
0,0,"['maspiyuuu', 'bantu', 'bekas', 'anies', 'basw..."
1,1,"['lisaamartatara', 'fadlizon', 'bangga', 'bang..."
2,1,"['rustamibrahim', 'polling', 'gubernur', 'jawa..."
3,1,"['ojok', 'ampe', 'anies', 'baswedan', 'nikmat'..."
4,0,"['ghanieierfan', 'temu', 'haru', 'aniesbasweda..."


In [2]:
# join list of token as single document string
import ast

def join_text_list(texts):
    texts = ast.literal_eval(texts)
    return ' '.join([text for text in texts])
TWEET_DATA["tweet_join"] = TWEET_DATA["tweet"].apply(join_text_list)

TWEET_DATA["tweet_join"].head()

0    maspiyuuu bantu bekas anies baswedan layan war...
1    lisaamartatara fadlizon bangga banggain buzzer...
2    rustamibrahim polling gubernur jawa ridwan kam...
3    ojok ampe anies baswedan nikmat hsil proyek mo...
4    ghanieierfan temu haru aniesbaswedan tsuneo ya...
Name: tweet_join, dtype: object

### TF-IDF Scikit-Learn L2 Normalization

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

# banyaknya term yang akan digunakan, 
# di pilih berdasarkan top max_features 
# yang diurutkan berdasarkan term frequency seluruh corpus
max_features = 1000

# Feature Engineering 
print ("------- TF-IDF on Tweet data -------")

tf_idf = TfidfVectorizer(max_features=max_features, binary=True)
tfidf_mat = tf_idf.fit_transform(TWEET_DATA["tweet_join"]).toarray()

print("TF-IDF ", type(tfidf_mat), tfidf_mat.shape)

------- TF-IDF on Tweet data -------
TF-IDF  <class 'numpy.ndarray'> (23225, 1000)


In [55]:
terms = tf_idf.get_feature_names()

# sum tfidf frequency of each term through documents
sums = tfidf_mat.sum(axis=0)

# connecting term to its sums frequency
data = []
for col, term in enumerate(terms):
    data.append((term, sums[col] ))

ranking = pd.DataFrame(data, columns=['term','rank'])
ranking.sort_values('rank', ascending=False)

NameError: name 'tf_idf' is not defined

### TF-IDF Scikit-Learn L1 Normalization

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import normalize

max_features = 1000

# calc TF vector
cvect = CountVectorizer(max_features=max_features)
TF_vector = cvect.fit_transform(TWEET_DATA["tweet_join"])

# normalize TF vector
normalized_TF_vector = normalize(TF_vector, norm='l1', axis=1)

# calc IDF
tfidf = TfidfVectorizer(max_features=max_features, smooth_idf=False)
tfs = tfidf.fit_transform(TWEET_DATA["tweet_join"])
IDF_vector = tfidf.idf_

# hitung TF x IDF sehingga dihasilkan TFIDF matrix / vector
tfidf_mat = normalized_TF_vector.multiply(IDF_vector).toarray()

### TF-IDF Scikit-Learn L1 Norm unigram, bigram, trigram

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import normalize

max_features = 1000

# ngram_range (1, 3) to use unigram, bigram, trigram
cvect = CountVectorizer(max_features=max_features, ngram_range=(1,3))
counts = cvect.fit_transform(TWEET_DATA["tweet_join"])

normalized_counts = normalize(counts, norm='l1', axis=1)

tfidf = TfidfVectorizer(max_features=max_features, ngram_range=(1,3), smooth_idf=False)
tfs = tfidf.fit_transform(TWEET_DATA["tweet_join"])

tfidf_mat = normalized_counts.multiply(tfidf.idf_).toarray()

### TF-IDF vector unigram only / bigram only / trigram only

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import normalize

max_features = 1000


def generate_tfidf_mat(min_gram, max_gram):
    cvect = CountVectorizer(max_features=max_features, ngram_range=(min_gram, max_gram))
    counts = cvect.fit_transform(TWEET_DATA["tweet_join"])

    normalized_counts = normalize(counts, norm='l1', axis=1)

    tfidf = TfidfVectorizer(max_features=max_features, ngram_range=(min_gram, max_gram), smooth_idf=False)
    tfs = tfidf.fit_transform(TWEET_DATA["tweet_join"])

    tfidf_mat = normalized_counts.multiply(tfidf.idf_).toarray()
    
    TF = normalized_counts.toarray()
    IDF = tfidf.idf_
    TF_IDF = tfidf_mat
    return TF, IDF, TF_IDF, tfidf.get_feature_names()

# ngram_range (1, 1) to use unigram only
tf_mat_unigram, idf_mat_unigram, tfidf_mat_unigram, terms_unigram = generate_tfidf_mat(1,1)

# ngram_range (2, 2) to use bigram only
tf_mat_bigram, idf_mat_bigram, tfidf_mat_bigram, terms_bigram = generate_tfidf_mat(2,2)

# ngram_range (3, 3) to use trigram only
tf_mat_trigram, idf_mat_trigram, tfidf_mat_bigram, terms_trigram = generate_tfidf_mat(3,3)

In [20]:
# print tf-idf unigram
tfidf_mat_unigram

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [35]:
idx_sample = 0

print("Show TFIDF sample ke-" + str(idx_sample), "\n")
print(TWEET_DATA["tweet"][idx_sample], "\n")

print("\t\t\t", "TF", "\t\t", "IDF", "\t\t", "TF-IDF", "\t", "Term\n")
for i, item in enumerate(zip(tf_mat_unigram[idx_sample], idf_mat_unigram, tfidf_mat_unigram[idx_sample], terms_unigram)):
    if(item[2] != 0.0):
        print ("array position " + str(i) + "\t", 
               "%.6f" % item[0], "\t", 
               "%.6f" % item[1], "\t", 
               "%.6f" % item[2], "\t", 
               item[3])

Show TFIDF sample ke-0 

['maspiyuuu', 'bantu', 'bekas', 'anies', 'baswedan', 'layan', 'warga', 'negara', 'indonesia', 'httpstcobiwxvhsvknntinggal', 'lantik', 'aja', 'inimah', 'pru'] 

			 TF 		 IDF 		 TF-IDF 	 Term

array position 15	 0.100000 	 4.219953 	 0.421995 	 aja
array position 50	 0.100000 	 4.296052 	 0.429605 	 anies
array position 104	 0.100000 	 5.172452 	 0.517245 	 bantu
array position 108	 0.100000 	 5.507807 	 0.550781 	 baswedan
array position 118	 0.100000 	 6.975447 	 0.697545 	 bekas
array position 358	 0.100000 	 4.080378 	 0.408038 	 indonesia
array position 520	 0.100000 	 6.942111 	 0.694211 	 layan
array position 566	 0.100000 	 6.658535 	 0.665854 	 maspiyuuu
array position 624	 0.100000 	 4.504765 	 0.450477 	 negara
array position 981	 0.100000 	 4.593080 	 0.459308 	 warga


# Save to Excel

In [36]:
# check Matrix size

tf_mat_unigram.shape, tfidf_mat_unigram.shape, idf_mat_unigram.shape

((23225, 1000), (23225, 1000), (1000,))

In [56]:
def get_TF_unigram(row):
    idx = row.name
    return [tf for tf in tf_mat_unigram[idx] if tf != 0.0]

TWEET_DATA["TF_UNIGRAM"] = TWEET_DATA.apply(get_TF_unigram, axis=1)

def get_IDF_unigram(row):
    idx = row.name
    return [item[1] for item in zip(tf_mat_unigram[idx], idf_mat_unigram) if item[0] != 0.0]

TWEET_DATA["IDF_UNIGRAM"] = TWEET_DATA.apply(get_TF_unigram, axis=1)

def get_TFIDF_unigram(row):
    idx = row.name
    return [tfidf for tfidf in tfidf_mat_unigram if tfidf != 0.0]

TWEET_DATA["TFIDF_UNIGRAM"] = TWEET_DATA.apply(get_TF_unigram, axis=1)

TWEET_DATA[["tweet", "TF_UNIGRAM", "IDF_UNIGRAM", "TFIDF_UNIGRAM"]].head()

Unnamed: 0,tweet,TF_UNIGRAM,IDF_UNIGRAM,TFIDF_UNIGRAM
0,"['maspiyuuu', 'bantu', 'bekas', 'anies', 'basw...","[0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, ...","[0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, ...","[0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, ..."
1,"['lisaamartatara', 'fadlizon', 'bangga', 'bang...","[0.16666666666666666, 0.16666666666666666, 0.1...","[0.16666666666666666, 0.16666666666666666, 0.1...","[0.16666666666666666, 0.16666666666666666, 0.1..."
2,"['rustamibrahim', 'polling', 'gubernur', 'jawa...","[0.25, 0.25, 0.25, 0.25]","[0.25, 0.25, 0.25, 0.25]","[0.25, 0.25, 0.25, 0.25]"
3,"['ojok', 'ampe', 'anies', 'baswedan', 'nikmat'...","[0.14285714285714285, 0.14285714285714285, 0.1...","[0.14285714285714285, 0.14285714285714285, 0.1...","[0.14285714285714285, 0.14285714285714285, 0.1..."
4,"['ghanieierfan', 'temu', 'haru', 'aniesbasweda...","[0.25, 0.25, 0.25, 0.25]","[0.25, 0.25, 0.25, 0.25]","[0.25, 0.25, 0.25, 0.25]"


In [57]:
# save TFIDF Unigram to Excel

TWEET_DATA[["tweet", "TF_UNIGRAM", "IDF_UNIGRAM", "TFIDF_UNIGRAM"]].to_excel("TFIDF_Unigram.xlsx")

In [58]:
def get_TF_bigram(row):
    idx = row.name
    return [tf for tf in tf_mat_bigram[idx] if tf != 0.0]

TWEET_DATA["TF_BIGRAM"] = TWEET_DATA.apply(get_TF_bigram, axis=1)

def get_IDF_bigram(row):
    idx = row.name
    return [item[1] for item in zip(tf_mat_bigram[idx], idf_mat_bigram) if item[0] != 0.0]

TWEET_DATA["IDF_BIGRAM"] = TWEET_DATA.apply(get_TF_bigram, axis=1)

def get_TFIDF_bigram(row):
    idx = row.name
    return [tfidf for tfidf in tfidf_mat_bigram if tfidf != 0.0]

TWEET_DATA["TFIDF_BIGRAM"] = TWEET_DATA.apply(get_TF_bigram, axis=1)

TWEET_DATA[["tweet", "TF_BIGRAM", "IDF_BIGRAM", "TFIDF_BIGRAM"]].head()

Unnamed: 0,tweet,TF_BIGRAM,IDF_BIGRAM,TFIDF_BIGRAM
0,"['maspiyuuu', 'bantu', 'bekas', 'anies', 'basw...","[0.3333333333333333, 0.3333333333333333, 0.333...","[0.3333333333333333, 0.3333333333333333, 0.333...","[0.3333333333333333, 0.3333333333333333, 0.333..."
1,"['lisaamartatara', 'fadlizon', 'bangga', 'bang...",[],[],[]
2,"['rustamibrahim', 'polling', 'gubernur', 'jawa...",[1.0],[1.0],[1.0]
3,"['ojok', 'ampe', 'anies', 'baswedan', 'nikmat'...","[0.14285714285714285, 0.14285714285714285, 0.1...","[0.14285714285714285, 0.14285714285714285, 0.1...","[0.14285714285714285, 0.14285714285714285, 0.1..."
4,"['ghanieierfan', 'temu', 'haru', 'aniesbasweda...",[1.0],[1.0],[1.0]


In [59]:
# save TFIDF Bigram to Excel

TWEET_DATA[["tweet", "TF_BIGRAM", "IDF_BIGRAM", "TFIDF_BIGRAM"]].to_excel("TFIDF_Bigram.xlsx")

In [60]:
def get_TF_trigram(row):
    idx = row.name
    return [tf for tf in tf_mat_trigram[idx] if tf != 0.0]

TWEET_DATA["TF_trigram"] = TWEET_DATA.apply(get_TF_trigram, axis=1)

def get_IDF_trigram(row):
    idx = row.name
    return [item[1] for item in zip(tf_mat_trigram[idx], idf_mat_trigram) if item[0] != 0.0]

TWEET_DATA["IDF_trigram"] = TWEET_DATA.apply(get_TF_trigram, axis=1)

def get_TFIDF_trigram(row):
    idx = row.name
    return [tfidf for tfidf in tfidf_mat_trigram if tfidf != 0.0]

TWEET_DATA["TFIDF_trigram"] = TWEET_DATA.apply(get_TF_trigram, axis=1)

TWEET_DATA[["tweet", "TF_trigram", "IDF_trigram", "TFIDF_trigram"]].head()

Unnamed: 0,tweet,TF_trigram,IDF_trigram,TFIDF_trigram
0,"['maspiyuuu', 'bantu', 'bekas', 'anies', 'basw...",[1.0],[1.0],[1.0]
1,"['lisaamartatara', 'fadlizon', 'bangga', 'bang...",[],[],[]
2,"['rustamibrahim', 'polling', 'gubernur', 'jawa...",[],[],[]
3,"['ojok', 'ampe', 'anies', 'baswedan', 'nikmat'...","[0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.1...","[0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.1...","[0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.1..."
4,"['ghanieierfan', 'temu', 'haru', 'aniesbasweda...",[],[],[]


In [62]:
# save TFIDF Trigram to Excel

TWEET_DATA[["tweet", "TF_trigram", "IDF_trigram", "TFIDF_trigram"]].to_excel("TFIDF_Trigram.xlsx")