In [1]:
import pandas as pd
import numpy as np

In [2]:
corpus = ['A computer is a machine that can be programmed to carry out sequences of arithmetic or logical operations (computation) automatically.',
          'Modern digital electronic computers can perform generic sets of operations known as programs.',
          'These programs enable computers to perform a wide range of tasks' ]

In [3]:
words_set = set()

for doc in  corpus:
    words = doc.split(' ')
    words_set = words_set.union(set(words))

print('Number of words in the corpus:',len(words_set))
print('The words in the corpus: \n', words_set)

Number of words in the corpus: 36
The words in the corpus: 
 {'arithmetic', 'known', 'be', 'is', 'computer', 'can', 'programmed', 'a', 'sequences', 'perform', 'electronic', 'to', 'programs', 'or', 'that', 'out', 'wide', 'These', 'tasks', 'generic', 'of', 'sets', 'Modern', 'as', 'programs.', 'automatically.', 'computers', 'enable', 'carry', '(computation)', 'operations', 'logical', 'A', 'machine', 'range', 'digital'}


In [4]:
n_docs = len(corpus)         # Number of documents in the corpus
n_words_set = len(words_set) # Number of unique words in the set

df_tf = pd.DataFrame(np.zeros((n_docs, n_words_set)), columns=list(words_set))

# Compute Term Frequency (TF)
for i in range(n_docs):
    words = corpus[i].split(' ')  # Words in the document
    for w in words:
        df_tf[w][i] = df_tf[w][i] + (1 / len(words))

df_tf


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df_tf[w][i] = df_tf[w][i] + (1 / len(words))


Unnamed: 0,arithmetic,known,be,is,computer,can,programmed,a,sequences,perform,...,computers,enable,carry,(computation),operations,logical,A,machine,range,digital
0,0.05,0.0,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.0,...,0.0,0.0,0.05,0.05,0.05,0.05,0.05,0.05,0.0,0.0
1,0.0,0.076923,0.0,0.0,0.0,0.076923,0.0,0.0,0.0,0.076923,...,0.076923,0.0,0.0,0.0,0.076923,0.0,0.0,0.0,0.0,0.076923
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.090909,0.0,0.090909,...,0.090909,0.090909,0.0,0.0,0.0,0.0,0.0,0.0,0.090909,0.0


In [5]:
print("IDF of: ")

idf = {}

for w in words_set:
    k = 0    # number of documents in the corpus that contain this word

    for i in range(n_docs):
        if w in corpus[i].split():
            k += 1

    idf[w] =  np.log10(n_docs / k)

    print(f'{w:>15}: {idf[w]:>10}' )

IDF of: 
     arithmetic: 0.47712125471966244
          known: 0.47712125471966244
             be: 0.47712125471966244
             is: 0.47712125471966244
       computer: 0.47712125471966244
            can: 0.17609125905568124
     programmed: 0.47712125471966244
              a: 0.17609125905568124
      sequences: 0.47712125471966244
        perform: 0.17609125905568124
     electronic: 0.47712125471966244
             to: 0.17609125905568124
       programs: 0.47712125471966244
             or: 0.47712125471966244
           that: 0.47712125471966244
            out: 0.47712125471966244
           wide: 0.47712125471966244
          These: 0.47712125471966244
          tasks: 0.47712125471966244
        generic: 0.47712125471966244
             of:        0.0
           sets: 0.47712125471966244
         Modern: 0.47712125471966244
             as: 0.47712125471966244
      programs.: 0.47712125471966244
 automatically.: 0.47712125471966244
      computers: 0.17609125905568124
 

In [6]:
df_tf_idf = df_tf.copy()

for w in words_set:
    for i in range(n_docs):
        df_tf_idf[w][i] = df_tf[w][i] * idf[w]

df_tf_idf

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df_tf_idf[w][i] = df_tf[w][i] * idf[w]


Unnamed: 0,arithmetic,known,be,is,computer,can,programmed,a,sequences,perform,...,computers,enable,carry,(computation),operations,logical,A,machine,range,digital
0,0.023856,0.0,0.023856,0.023856,0.023856,0.008805,0.023856,0.008805,0.023856,0.0,...,0.0,0.0,0.023856,0.023856,0.008805,0.023856,0.023856,0.023856,0.0,0.0
1,0.0,0.036702,0.0,0.0,0.0,0.013545,0.0,0.0,0.0,0.013545,...,0.013545,0.0,0.0,0.0,0.013545,0.0,0.0,0.0,0.0,0.036702
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.016008,0.0,0.016008,...,0.016008,0.043375,0.0,0.0,0.0,0.0,0.0,0.0,0.043375,0.0


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [8]:
tr_idf_model  = TfidfVectorizer()
tf_idf_vector = tr_idf_model.fit_transform(corpus)

In [9]:
print(type(tf_idf_vector), tf_idf_vector.shape)

<class 'scipy.sparse._csr.csr_matrix'> (3, 33)


In [10]:
tf_idf_array = tf_idf_vector.toarray()

print(tf_idf_array)

[[0.24934612 0.         0.24934612 0.24934612 0.18963415 0.24934612
  0.24934612 0.24934612 0.         0.         0.         0.
  0.         0.24934612 0.         0.24934612 0.24934612 0.
  0.14726784 0.18963415 0.24934612 0.24934612 0.         0.24934612
  0.         0.         0.24934612 0.         0.         0.24934612
  0.         0.18963415 0.        ]
 [0.         0.31248742 0.         0.         0.23765474 0.
  0.         0.         0.23765474 0.31248742 0.31248742 0.
  0.31248742 0.         0.31248742 0.         0.         0.31248742
  0.1845601  0.23765474 0.         0.         0.23765474 0.
  0.23765474 0.         0.         0.31248742 0.         0.
  0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.27474564 0.         0.         0.3612575
  0.         0.         0.         0.         0.         0.
  0.2133645  0.         0.         0.         0.27474564 0.
  0.27474564 0.3612575  0.         0.         0.3

In [11]:
words_set = tr_idf_model.get_feature_names_out()

print(words_set)

['arithmetic' 'as' 'automatically' 'be' 'can' 'carry' 'computation'
 'computer' 'computers' 'digital' 'electronic' 'enable' 'generic' 'is'
 'known' 'logical' 'machine' 'modern' 'of' 'operations' 'or' 'out'
 'perform' 'programmed' 'programs' 'range' 'sequences' 'sets' 'tasks'
 'that' 'these' 'to' 'wide']
