In [45]:
import math
from sklearn import preprocessing
import numpy as np
from scipy.sparse import csr_matrix


class TfidfVectorizer_manual:

    def fit(self, dataset):
        vocabulary = set()
        if isinstance(dataset, (list,)):
            for data_point in dataset:
                words_list = data_point.split(" ")
                for each_word in words_list:
                    vocabulary.add(each_word)
        vocabulary = list(vocabulary)
        vocabulary.sort()
        idf_values = self.idf_values(dataset, vocabulary)
        return vocabulary, idf_values

    def transform(self, dataset, vocabulary, idf_vocabulary):
        vocabulary_dict = {k: v for v, k in enumerate(vocabulary)}
        col = []
        row = []
        values = []
        for row_index in range(len(dataset)):
            values_row = []
            data_point_unique = list(set(dataset[row_index].split(" ")))
            data_points = dataset[row_index].split(" ")
            for each_feature in data_point_unique:
                col.append(vocabulary_dict[each_feature])
                row.append(row_index)
                idf = idf_vocabulary[vocabulary_dict[each_feature]]
                values_row.append(idf * (data_points.count(each_feature) / len(dataset[row_index].split(" "))))
            values_row_array = np.array(values_row).reshape(1, -1)
            values_normalized = preprocessing.normalize(values_row_array, norm='l2').tolist()
            values.extend(values_normalized[0])
        return csr_matrix((values, (row, col)), shape=(len(dataset), len(vocabulary)))

    def idf_values(self, dataset, sorted_vocabulary):
        idf_values = []
        no_of_document = len(dataset)
        for each_feature in sorted_vocabulary:
            each_feature_occurance = self.get_occurance(each_feature, dataset)
            idf_values.append(1 + math.log((no_of_document + 1) / (each_feature_occurance + 1)))
        return idf_values

    def get_occurance(self, feature, dataset):
        count = 0
        for each_datapoint in dataset:
            features_set = set(each_datapoint.split(" "))
            if feature in features_set:
                count += 1
        return count



In [46]:
corpus = [
    'this is the first document',
    'this document is the second document',
    'and this is the third one',
    'is this the first document',
]

tfidf = TfidfVectorizer_manual()
vocab, idf_values = tfidf.fit(corpus)

In [47]:
print(vocab)
print(idf_values)


['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
[1.916290731874155, 1.2231435513142097, 1.5108256237659907, 1.0, 1.916290731874155, 1.916290731874155, 1.0, 1.916290731874155, 1.0]


In [48]:
csr_matrix = tfidf.transform(corpus, vocab, idf_values)
print(csr_matrix)

  (0, 1)	0.4697913855799205
  (0, 2)	0.580285823684436
  (0, 3)	0.3840852409148149
  (0, 6)	0.3840852409148149
  (0, 8)	0.3840852409148149
  (1, 1)	0.6876235979836937
  (1, 3)	0.2810886740337529
  (1, 5)	0.5386476208856762
  (1, 6)	0.2810886740337529
  (1, 8)	0.2810886740337529
  (2, 0)	0.511848512707169
  (2, 3)	0.267103787642168
  (2, 4)	0.511848512707169
  (2, 6)	0.267103787642168
  (2, 7)	0.511848512707169
  (2, 8)	0.267103787642168
  (3, 1)	0.4697913855799205
  (3, 2)	0.580285823684436
  (3, 3)	0.3840852409148149
  (3, 6)	0.3840852409148149
  (3, 8)	0.3840852409148149


### Sklearn Results

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
vectorizer.fit(corpus)
print(vectorizer.get_feature_names())
vectorizer.idf_

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']


array([1.91629073, 1.22314355, 1.51082562, 1.        , 1.91629073,
       1.91629073, 1.        , 1.91629073, 1.        ])

In [29]:
csr_matrix = vectorizer.transform(corpus)

In [30]:
print(csr_matrix)

  (0, 8)	0.38408524091481483
  (0, 6)	0.38408524091481483
  (0, 3)	0.38408524091481483
  (0, 2)	0.5802858236844359
  (0, 1)	0.46979138557992045
  (1, 8)	0.281088674033753
  (1, 6)	0.281088674033753
  (1, 5)	0.5386476208856763
  (1, 3)	0.281088674033753
  (1, 1)	0.6876235979836938
  (2, 8)	0.267103787642168
  (2, 7)	0.511848512707169
  (2, 6)	0.267103787642168
  (2, 4)	0.511848512707169
  (2, 3)	0.267103787642168
  (2, 0)	0.511848512707169
  (3, 8)	0.38408524091481483
  (3, 6)	0.38408524091481483
  (3, 3)	0.38408524091481483
  (3, 2)	0.5802858236844359
  (3, 1)	0.46979138557992045


# Task 2

In [37]:
import math
from sklearn import preprocessing
import numpy as np
from scipy.sparse import csr_matrix
from itertools import islice
import pandas as pd


class TfidfVectorizer_limit:

    def fit(self, dataset):
        vocabulary = set()
        if isinstance(dataset, (list,)):
            for data_point in dataset:
                words_list = data_point.split(" ")
                for each_word in words_list:
                    vocabulary.add(each_word)
        vocabulary = list(vocabulary)
        vocabulary.sort()
        idf_values = self.idf_values(dataset, vocabulary)
        sorted_vocab_idf = {k: v for k, v in sorted(idf_values.items(), key=lambda item: item[1], reverse=True)}
        sorted_vocab_idf_50_elements = dict(islice(sorted_vocab_idf.items(), 50))
        vocabulary = sorted_vocab_idf_50_elements.keys()
        idf_values = list(sorted_vocab_idf_50_elements.values())
        return vocabulary, idf_values

    def idf_values(self, dataset, vocabulary):
        idf_values_vocab = {}
        no_of_document = len(dataset)
        for index in range(len(vocabulary)):
            each_feature_occurance = self.get_occurance(vocabulary[index], dataset)
            idf_values_vocab[vocabulary[index]] = (1 + math.log((no_of_document + 1) / (each_feature_occurance + 1)))
        return idf_values_vocab

    def get_occurance(self, feature, dataset):
        count = 0
        for each_datapoint in dataset:
            features_set = set(each_datapoint.split(" "))
            if feature in features_set:
                count += 1
        return count

    def transform(self, dataset, vocabulary, idf_vocabulary):
        vocabulary_dict = {k: v for v, k in enumerate(vocabulary)}
        col = []
        row = []
        values = []
        for row_index in range(len(dataset)):
            values_row = []
            data_point_split_list = list(set(dataset[row_index].split(" ")))
            for each_feature in data_point_split_list:
                if each_feature in vocabulary:
                    col.append(vocabulary_dict[each_feature])
                    row.append(row_index)
                    idf = idf_vocabulary[vocabulary_dict[each_feature]]
                    values_row.append(idf * (data_point_split_list.count(each_feature) / len(data_point_split_list)))
            if len(values_row) > 0:
                values_row_array = np.array(values_row).reshape(1, -1)
                values_normalized = preprocessing.normalize(values_row_array, norm='l2').tolist()
                values.extend(values_normalized[0])
        return csr_matrix((values, (row, col)), shape=(len(dataset), len(vocabulary)))

In [38]:
data = pd.read_csv("/Users/kunalbudhiraja/Desktop/Code/AppliedAI/cleaned_string.csv", header=None).to_numpy()
corpus_global = []

for each in data:
    corpus_global.append(each[0])

tfidf = TfidfVectorizer_limit()
vocab, idf = tfidf.fit(corpus_global)
sparse_matrix = tfidf.transform(corpus_global, vocab, idf)

In [39]:
print(vocab)

dict_keys(['aailiyah', 'abandoned', 'abroad', 'abstruse', 'academy', 'accents', 'accessible', 'acclaimed', 'accolades', 'accurate', 'accurately', 'achille', 'ackerman', 'actions', 'adams', 'add', 'added', 'admins', 'admiration', 'admitted', 'adrift', 'adventure', 'aesthetically', 'affected', 'affleck', 'afternoon', 'aged', 'ages', 'agree', 'agreed', 'aimless', 'aired', 'akasha', 'akin', 'alert', 'alike', 'allison', 'allow', 'allowing', 'alongside', 'amateurish', 'amaze', 'amazed', 'amazingly', 'amusing', 'amust', 'anatomist', 'angel', 'angela', 'angelina'])


In [40]:
print(idf)

[6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872]


In [41]:
print(sparse_matrix)

  (0, 30)	1.0
  (68, 24)	1.0
  (72, 29)	1.0
  (74, 31)	1.0
  (119, 33)	1.0
  (135, 3)	0.3779644730092272
  (135, 10)	0.3779644730092272
  (135, 18)	0.3779644730092272
  (135, 20)	0.3779644730092272
  (135, 36)	0.3779644730092272
  (135, 40)	0.3779644730092272
  (135, 41)	0.3779644730092272
  (176, 49)	1.0
  (181, 13)	1.0
  (192, 21)	1.0
  (193, 23)	1.0
  (216, 2)	1.0
  (222, 47)	1.0
  (225, 19)	1.0
  (227, 17)	1.0
  (241, 44)	1.0
  (270, 1)	1.0
  (290, 25)	1.0
  (333, 26)	1.0
  (334, 15)	1.0
  (341, 43)	1.0
  (344, 42)	1.0
  (348, 8)	1.0
  (377, 37)	1.0
  (409, 5)	1.0
  (430, 39)	1.0
  (457, 45)	1.0
  (461, 4)	1.0
  (465, 38)	1.0
  (475, 35)	1.0
  (493, 6)	1.0
  (500, 48)	1.0
  (548, 0)	0.7071067811865475
  (548, 32)	0.7071067811865475
  (608, 14)	1.0
  (612, 11)	1.0
  (620, 46)	1.0
  (632, 7)	1.0
  (644, 12)	0.7071067811865475
  (644, 27)	0.7071067811865475
  (664, 28)	1.0
  (667, 22)	1.0
  (691, 34)	1.0
  (697, 9)	1.0
  (722, 16)	1.0
