In [2]:
import pandas as pd
import math #important library , calculate log
import string

In [3]:
# Sample collection of documents
documents = [
    "Scientists have discovered a new species of marine life in the deep ocean.",
    "NASA's Mars rover is searching for signs of ancient life on the Red Planet.",
    "The stock market experienced a significant drop in trading today.",
    "Astronomers have identified a distant galaxy with unusual star formations.",
    "The government announced new measures to combat climate change."
]
documents

['Scientists have discovered a new species of marine life in the deep ocean.',
 "NASA's Mars rover is searching for signs of ancient life on the Red Planet.",
 'The stock market experienced a significant drop in trading today.',
 'Astronomers have identified a distant galaxy with unusual star formations.',
 'The government announced new measures to combat climate change.']

In [4]:
# Create a dictionary for lemmatization (a simple example, not comprehensive)
lemmatization_dict = {
    "species": "specie",
    "species": "species",
    "oceans": "ocean",
    "ocean's": "ocean",
    "rover": "rover",
    "discovered":"discover",
    "experienced":"experience",
    "rovers": "rover",
    "trading": "trade",
    "identified": "identify",
    "identifies": "identify",
    "formations": "formation",
    "governments": "government",
    "measures": "measure"
}
lemmatization_dict

{'species': 'species',
 'oceans': 'ocean',
 "ocean's": 'ocean',
 'rover': 'rover',
 'discovered': 'discover',
 'experienced': 'experience',
 'rovers': 'rover',
 'trading': 'trade',
 'identified': 'identify',
 'identifies': 'identify',
 'formations': 'formation',
 'governments': 'government',
 'measures': 'measure'}

In [11]:
# terms = [lemmatization_dict.get(term, term) for term in terms]
# terms

In [5]:
# Tokenize documents into words (terms), remove punctuation, and lemmatize
def preprocess_text(document):
    terms = document.lower().split()
    terms = [term.strip(string.punctuation) for term in terms]
    terms = [lemmatization_dict.get(term, term) for term in terms]
    return terms


In [6]:
# Create a set of unique terms (vocabulary)
vocabulary = set()
for document in documents:
    terms = preprocess_text(document)
    vocabulary.update(terms)

vocabulary

{'a',
 'ancient',
 'announced',
 'astronomers',
 'change',
 'climate',
 'combat',
 'deep',
 'discover',
 'distant',
 'drop',
 'experience',
 'for',
 'formation',
 'galaxy',
 'government',
 'have',
 'identify',
 'in',
 'is',
 'life',
 'marine',
 'market',
 'mars',
 'measure',
 "nasa's",
 'new',
 'ocean',
 'of',
 'on',
 'planet',
 'red',
 'rover',
 'scientists',
 'searching',
 'significant',
 'signs',
 'species',
 'star',
 'stock',
 'the',
 'to',
 'today',
 'trade',
 'unusual',
 'with'}

In [7]:
# Create a dictionary to store the term frequency (TF) for each term in each document
tf_values = {term: [0] * len(documents) for term in vocabulary}
tf_values

{'have': [0, 0, 0, 0, 0],
 'experience': [0, 0, 0, 0, 0],
 'identify': [0, 0, 0, 0, 0],
 'planet': [0, 0, 0, 0, 0],
 'galaxy': [0, 0, 0, 0, 0],
 'red': [0, 0, 0, 0, 0],
 'the': [0, 0, 0, 0, 0],
 'a': [0, 0, 0, 0, 0],
 'ancient': [0, 0, 0, 0, 0],
 'astronomers': [0, 0, 0, 0, 0],
 'government': [0, 0, 0, 0, 0],
 'on': [0, 0, 0, 0, 0],
 "nasa's": [0, 0, 0, 0, 0],
 'trade': [0, 0, 0, 0, 0],
 'ocean': [0, 0, 0, 0, 0],
 'formation': [0, 0, 0, 0, 0],
 'discover': [0, 0, 0, 0, 0],
 'drop': [0, 0, 0, 0, 0],
 'mars': [0, 0, 0, 0, 0],
 'marine': [0, 0, 0, 0, 0],
 'to': [0, 0, 0, 0, 0],
 'deep': [0, 0, 0, 0, 0],
 'climate': [0, 0, 0, 0, 0],
 'distant': [0, 0, 0, 0, 0],
 'unusual': [0, 0, 0, 0, 0],
 'significant': [0, 0, 0, 0, 0],
 'market': [0, 0, 0, 0, 0],
 'signs': [0, 0, 0, 0, 0],
 'species': [0, 0, 0, 0, 0],
 'in': [0, 0, 0, 0, 0],
 'new': [0, 0, 0, 0, 0],
 'scientists': [0, 0, 0, 0, 0],
 'measure': [0, 0, 0, 0, 0],
 'rover': [0, 0, 0, 0, 0],
 'for': [0, 0, 0, 0, 0],
 'stock': [0, 0, 0, 0, 0],

In [9]:
# Calculate Term Frequency (TF)
for i, document in enumerate(documents):
    pre_doc = preprocess_text(document)
    for term in pre_doc:
        if term in pre_doc:
            tf_values[term][0] += 1
        tf_values[term][0] = tf_values[term][0] / len(pre_doc)
        
tf_values

{'have': [0.11538461538461538, 0, 0, 1, 0],
 'experience': [0.1, 0, 1, 0, 0],
 'identify': [0.1, 0, 0, 1, 0],
 'planet': [0.07142857142857142, 1, 0, 0, 0],
 'galaxy': [0.1, 0, 0, 1, 0],
 'red': [0.07142857142857142, 1, 0, 0, 0],
 'the': [0.12313797313797314, 1, 1, 0, 1],
 'a': [0.11153846153846154, 0, 1, 1, 0],
 'ancient': [0.07142857142857142, 1, 0, 0, 0],
 'astronomers': [0.1, 0, 0, 1, 0],
 'government': [0.1111111111111111, 0, 0, 0, 1],
 'on': [0.07142857142857142, 1, 0, 0, 0],
 "nasa's": [0.07142857142857142, 1, 0, 0, 0],
 'trade': [0.1, 0, 1, 0, 0],
 'ocean': [0.08547008547008547, 0, 0, 0, 0],
 'formation': [0.1, 0, 0, 1, 0],
 'discover': [0.15384615384615385, 0, 0, 0, 0],
 'drop': [0.1, 0, 1, 0, 0],
 'mars': [0.07142857142857142, 1, 0, 0, 0],
 'marine': [0.15384615384615385, 0, 0, 0, 0],
 'to': [0.1111111111111111, 0, 0, 0, 1],
 'deep': [0.15384615384615385, 0, 0, 0, 0],
 'climate': [0.1111111111111111, 0, 0, 0, 1],
 'distant': [0.1, 0, 0, 1, 0],
 'unusual': [0.1, 0, 0, 1, 0],
 '

In [10]:
# Calculate Inverse Document Frequency (IDF)
idf_values = {}
total_documents = len(documents)
for term in vocabulary:
    document_occurred = sum([term in preprocess_text(document) for document in documents])
    idf_values[term] = math.log(total_documents / document_occurred)

idf_values

{'have': 0.9162907318741551,
 'experience': 1.6094379124341003,
 'identify': 1.6094379124341003,
 'planet': 1.6094379124341003,
 'galaxy': 1.6094379124341003,
 'red': 1.6094379124341003,
 'the': 0.22314355131420976,
 'a': 0.5108256237659907,
 'ancient': 1.6094379124341003,
 'astronomers': 1.6094379124341003,
 'government': 1.6094379124341003,
 'on': 1.6094379124341003,
 "nasa's": 1.6094379124341003,
 'trade': 1.6094379124341003,
 'ocean': 1.6094379124341003,
 'formation': 1.6094379124341003,
 'discover': 1.6094379124341003,
 'drop': 1.6094379124341003,
 'mars': 1.6094379124341003,
 'marine': 1.6094379124341003,
 'to': 1.6094379124341003,
 'deep': 1.6094379124341003,
 'climate': 1.6094379124341003,
 'distant': 1.6094379124341003,
 'unusual': 1.6094379124341003,
 'significant': 1.6094379124341003,
 'market': 1.6094379124341003,
 'signs': 1.6094379124341003,
 'species': 1.6094379124341003,
 'in': 0.9162907318741551,
 'new': 0.9162907318741551,
 'scientists': 1.6094379124341003,
 'measure'

In [17]:
# Calculate TF-IDF values
tfidf_values = []
for i, document in enumerate(documents):
    terms = preprocess_text(document)
    tfidf_document = []
    for term in vocabulary:
        tf = tf_values[term][i]
        idf = idf_values[term]
        tfidf = tf * idf
        tfidf_document.append(tfidf)
    tfidf_values.append(tfidf_document)
tfidf_values

[[1.6094379124341003,
  0.0,
  0.0,
  0.0,
  0.0,
  0.5108256237659907,
  0.0,
  1.6094379124341003,
  0.0,
  0.9162907318741551,
  0.0,
  0.0,
  0.0,
  0.9162907318741551,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.6094379124341003,
  0.0,
  0.0,
  1.6094379124341003,
  0.0,
  0.9162907318741551,
  0.0,
  0.9162907318741551,
  0.0,
  0.0,
  0.0,
  1.6094379124341003,
  1.6094379124341003,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.22314355131420976,
  0.9162907318741551,
  0.0],
 [0.0,
  0.0,
  1.6094379124341003,
  1.6094379124341003,
  1.6094379124341003,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.6094379124341003,
  0.0,
  0.0,
  0.0,
  1.6094379124341003,
  1.6094379124341003,
  1.6094379124341003,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.6094379124341003,
  0.9162907318741551,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.6094379124341003,
  0.0,
  0.0,
  0.0,
  1.6094379124341003,
  0.22314355131

In [18]:
# Convert TF-IDF values to a DataFrame
df_tfidf = pd.DataFrame(tfidf_values, columns=list(vocabulary))

# Display TF-IDF results
print("TF-IDF:")
print(df_tfidf)

TF-IDF:
   scientists      with      mars    nasa's       red         a      drop  \
0    1.609438  0.000000  0.000000  0.000000  0.000000  0.510826  0.000000   
1    0.000000  0.000000  1.609438  1.609438  1.609438  0.000000  0.000000   
2    0.000000  0.000000  0.000000  0.000000  0.000000  0.510826  1.609438   
3    0.000000  1.609438  0.000000  0.000000  0.000000  0.510826  0.000000   
4    0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   

   discover    change      have  ...   measure     today   ancient     stock  \
0  1.609438  0.000000  0.916291  ...  0.000000  0.000000  0.000000  0.000000   
1  0.000000  0.000000  0.000000  ...  0.000000  0.000000  1.609438  0.000000   
2  0.000000  0.000000  0.000000  ...  0.000000  1.609438  0.000000  1.609438   
3  0.000000  0.000000  0.916291  ...  0.000000  0.000000  0.000000  0.000000   
4  0.000000  1.609438  0.000000  ...  1.609438  0.000000  0.000000  0.000000   

   announced   climate  searching       the     

In [19]:
# Save TF-IDF results to a CSV file (optional)
# df_tfidf.to_csv("tfidf_custom_preprocessed_news.csv", index=False)

# Using Libraries for Lemmatization and Tf-Idf

In [20]:
# if your machine doesn't have these libraries, you need to install them
import pandas as pd
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

# download the punkt
nltk.download('punkt')

[nltk_data] Downloading package punkt to C:\Users\Solita
[nltk_data]     Pun\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [21]:

# Initialize NLTK's lemmatizer and download stopwords
nltk.download('wordnet')
nltk.download('stopwords')
# Initialize NLTK's lemmatizer
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
lemmatizer = WordNetLemmatizer()
ps = PorterStemmer()

# Tokenize documents into words (terms), remove punctuation, lemmatize, and remove stopwords
def preprocess_text(document):
    terms = nltk.word_tokenize(document)
    terms = [term.strip(string.punctuation) for term in terms]
    terms = [ps.stem(term) for term in terms]
    terms = [term.lower() for term in terms if term not in stopwords.words('english')]
    return ' '.join(terms)

[nltk_data] Downloading package wordnet to C:\Users\Solita
[nltk_data]     Pun\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Solita
[nltk_data]     Pun\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Solita
[nltk_data]     Pun\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [27]:
# Preprocess the text in the documents
preprocessed_documents = []
for document in documents:
    preprocessed_documents.append(preprocess_text(document))


# Create a TfidfVectorizer instance
vectorizer = TfidfVectorizer()
preprocessed_documents


['scientist discov new speci marin life deep ocean ',
 'nasa mar rover search sign ancient life red planet ',
 'stock market experienc signific drop trade today ',
 'astronom identifi distant galaxi unusu star format ',
 'govern announc new measur combat climat chang ']

In [28]:
# Fit and transform the preprocessed documents to compute TF-IDF values CADT@0zJanZ!
tfidf_matrix = vectorizer.fit_transform(preprocessed_documents)  



# Convert the TF-IDF matrix to a DataFrame
df_tfidf = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())

In [29]:
# Display TF-IDF results
print("TF-IDF:")
print(df_tfidf)

TF-IDF:
    ancient   announc  astronom     chang    climat    combat     deep  \
0  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.37007   
1  0.339992  0.000000  0.000000  0.000000  0.000000  0.000000  0.00000   
2  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.00000   
3  0.000000  0.000000  0.377964  0.000000  0.000000  0.000000  0.00000   
4  0.000000  0.387757  0.000000  0.387757  0.387757  0.387757  0.00000   

    discov   distant      drop  ...  scientist    search      sign  signific  \
0  0.37007  0.000000  0.000000  ...    0.37007  0.000000  0.000000  0.000000   
1  0.00000  0.000000  0.000000  ...    0.00000  0.339992  0.339992  0.000000   
2  0.00000  0.000000  0.377964  ...    0.00000  0.000000  0.000000  0.377964   
3  0.00000  0.377964  0.000000  ...    0.00000  0.000000  0.000000  0.000000   
4  0.00000  0.000000  0.000000  ...    0.00000  0.000000  0.000000  0.000000   

     speci      star     stock     today     trade     unusu  
0  