In [2]:
import pandas as pd
import math #important library , calculate log
import string

In [3]:
# Sample collection of documents: five short, single-sentence news-style
# strings. Every later cell treats each list element as one document, so the
# list index doubles as the document id.
documents = [
    "Scientists have discovered a new species of marine life in the deep ocean.",
    "NASA's Mars rover is searching for signs of ancient life on the Red Planet.",
    "The stock market experienced a significant drop in trading today.",
    "Astronomers have identified a distant galaxy with unusual star formations.",
    "The government announced new measures to combat climate change."
]

In [4]:
# Hand-written lemmatization table (a simple example, not comprehensive).
# Keys are lowercase surface forms, values are the lemma they are replaced by.
# Words that are missing from the table are left unchanged by the lookup in
# preprocess_text, so identity entries such as "rover" are purely illustrative.
lemmatization_dict = {
    # "species" is both singular and plural, so it maps to itself.
    # (A second, conflicting "species": "specie" entry used to precede this
    # one; Python keeps only the last value for a repeated key, so it was dead.)
    "species": "species",
    "oceans": "ocean",
    "ocean's": "ocean",
    "rover": "rover",
    "discovered": "discover",
    "experienced": "experience",
    "rovers": "rover",
    "trading": "trade",
    "identified": "identify",
    "identifies": "identify",
    "formations": "formation",
    "governments": "government",
    "measures": "measure",
}

In [5]:
# terms = [lemmatization_dict.get(term, term) for term in terms]
# terms

In [6]:
# Tokenize documents into words (terms), remove punctuation, and lemmatize
def preprocess_text(document):
    """Turn one raw document string into a list of normalized terms.

    The document is lowercased and split on whitespace. Each token then has
    leading/trailing punctuation stripped (inner punctuation such as the
    apostrophe in "nasa's" is kept) and is mapped through
    ``lemmatization_dict``; tokens without an entry are kept as they are.

    Tokens that consist only of punctuation (for example a free-standing
    "-" or "...") become empty after stripping and are dropped, so the
    result never contains the empty string.

    Args:
        document: The text of a single document.

    Returns:
        A list of terms in their original order; duplicates are preserved.
    """
    terms = []
    for token in document.lower().split():
        token = token.strip(string.punctuation)
        if not token:
            # Punctuation-only token: nothing left to count as a term.
            continue
        terms.append(lemmatization_dict.get(token, token))
    return terms

In [7]:
# Build the vocabulary: every distinct term that occurs in any document
vocabulary = {
    term
    for document in documents
    for term in preprocess_text(document)
}

print(vocabulary)

{'of', 'scientists', 'astronomers', 'with', 'discover', 'life', 'distant', 'ocean', 'today', 'stock', 'the', 'climate', "nasa's", 'mars', 'measure', 'significant', 'a', 'species', 'change', 'to', 'rover', 'in', 'signs', 'for', 'marine', 'combat', 'government', 'drop', 'new', 'identify', 'ancient', 'unusual', 'star', 'galaxy', 'announced', 'red', 'experience', 'have', 'is', 'market', 'formation', 'deep', 'searching', 'planet', 'on', 'trade'}


In [8]:
# Term-frequency (TF) table: each vocabulary term maps to its own list of
# per-document counts (one slot per document), all starting at zero
tf_values = {term: [0 for _ in documents] for term in vocabulary}

In [9]:
# Calculate Term Frequency (TF)
# The table is rebuilt from zero here so that this cell is idempotent:
# re-running it must not stack a second round of counts on top of the first.
tf_values = {term: [0] * len(documents) for term in vocabulary}

for doc_index, document in enumerate(documents):
    # Every occurrence counts, so a term repeated within a document scores > 1.
    for term in preprocess_text(document):
        tf_values[term][doc_index] += 1

tf_values

{'of': [1, 1, 0, 0, 0],
 'scientists': [1, 0, 0, 0, 0],
 'astronomers': [0, 0, 0, 1, 0],
 'with': [0, 0, 0, 1, 0],
 'discover': [1, 0, 0, 0, 0],
 'life': [1, 1, 0, 0, 0],
 'distant': [0, 0, 0, 1, 0],
 'ocean': [1, 0, 0, 0, 0],
 'today': [0, 0, 1, 0, 0],
 'stock': [0, 0, 1, 0, 0],
 'the': [1, 1, 1, 0, 1],
 'climate': [0, 0, 0, 0, 1],
 "nasa's": [0, 1, 0, 0, 0],
 'mars': [0, 1, 0, 0, 0],
 'measure': [0, 0, 0, 0, 1],
 'significant': [0, 0, 1, 0, 0],
 'a': [1, 0, 1, 1, 0],
 'species': [1, 0, 0, 0, 0],
 'change': [0, 0, 0, 0, 1],
 'to': [0, 0, 0, 0, 1],
 'rover': [0, 1, 0, 0, 0],
 'in': [1, 0, 1, 0, 0],
 'signs': [0, 1, 0, 0, 0],
 'for': [0, 1, 0, 0, 0],
 'marine': [1, 0, 0, 0, 0],
 'combat': [0, 0, 0, 0, 1],
 'government': [0, 0, 0, 0, 1],
 'drop': [0, 0, 1, 0, 0],
 'new': [1, 0, 0, 0, 1],
 'identify': [0, 0, 0, 1, 0],
 'ancient': [0, 1, 0, 0, 0],
 'unusual': [0, 0, 0, 1, 0],
 'star': [0, 0, 0, 1, 0],
 'galaxy': [0, 0, 0, 1, 0],
 'announced': [0, 0, 0, 0, 1],
 'red': [0, 1, 0, 0, 0],
 'exp

In [10]:
# Calculate Inverse Document Frequency (IDF)
idf_values = {}
total_documents = len(documents)

# Tokenize every document exactly once up front (instead of once per
# vocabulary term); sets make the membership test below constant-time.
document_term_sets = [set(preprocess_text(document)) for document in documents]

for term in vocabulary:
    # Number of documents that contain the term at least once.
    document_frequency = sum(term in term_set for term_set in document_term_sets)

    # IDF variant used here: log(1 + N / df).
    # Division binds tighter than addition, so this is NOT log(N / (df + 1)).
    # df is at least 1 because the vocabulary is built from these documents.
    idf_values[term] = math.log(1 + total_documents / document_frequency)

idf_values

{'of': 1.252762968495368,
 'scientists': 1.791759469228055,
 'astronomers': 1.791759469228055,
 'with': 1.791759469228055,
 'discover': 1.791759469228055,
 'life': 1.252762968495368,
 'distant': 1.791759469228055,
 'ocean': 1.791759469228055,
 'today': 1.791759469228055,
 'stock': 1.791759469228055,
 'the': 0.8109302162163288,
 'climate': 1.791759469228055,
 "nasa's": 1.791759469228055,
 'mars': 1.791759469228055,
 'measure': 1.791759469228055,
 'significant': 1.791759469228055,
 'a': 0.9808292530117263,
 'species': 1.791759469228055,
 'change': 1.791759469228055,
 'to': 1.791759469228055,
 'rover': 1.791759469228055,
 'in': 1.252762968495368,
 'signs': 1.791759469228055,
 'for': 1.791759469228055,
 'marine': 1.791759469228055,
 'combat': 1.791759469228055,
 'government': 1.791759469228055,
 'drop': 1.791759469228055,
 'new': 1.252762968495368,
 'identify': 1.791759469228055,
 'ancient': 1.791759469228055,
 'unusual': 1.791759469228055,
 'star': 1.791759469228055,
 'galaxy': 1.79175946

In [11]:
# Calculate TF-IDF values
# Result: one row per document. Within a row the values follow the iteration
# order of `vocabulary`, which is the same order `list(vocabulary)` yields as
# long as the set is not modified in between.
tfidf_values = [
    [tf_values[term][doc_index] * idf_values[term] for term in vocabulary]
    for doc_index in range(len(documents))
]
tfidf_values

[[1.252762968495368,
  1.791759469228055,
  0.0,
  0.0,
  1.791759469228055,
  1.252762968495368,
  0.0,
  1.791759469228055,
  0.0,
  0.0,
  0.8109302162163288,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.9808292530117263,
  1.791759469228055,
  0.0,
  0.0,
  0.0,
  1.252762968495368,
  0.0,
  0.0,
  1.791759469228055,
  0.0,
  0.0,
  0.0,
  1.252762968495368,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.252762968495368,
  0.0,
  0.0,
  0.0,
  1.791759469228055,
  0.0,
  0.0,
  0.0,
  0.0],
 [1.252762968495368,
  0.0,
  0.0,
  0.0,
  0.0,
  1.252762968495368,
  0.0,
  0.0,
  0.0,
  0.0,
  0.8109302162163288,
  0.0,
  1.791759469228055,
  1.791759469228055,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.791759469228055,
  0.0,
  1.791759469228055,
  1.791759469228055,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.791759469228055,
  0.0,
  0.0,
  0.0,
  0.0,
  1.791759469228055,
  0.0,
  0.0,
  1.791759469228055,
  0.0,
  0.0,
  0.0,
  1.791759469228055,
  1.791759469228055,
  

In [12]:
# Wrap the TF-IDF rows in a DataFrame: one row per document, one column per
# vocabulary term
df_tfidf = pd.DataFrame(data=tfidf_values, columns=list(vocabulary))

# Show the heading and the table on separate lines
print("TF-IDF:", df_tfidf, sep="\n")

TF-IDF:
         of  scientists  astronomers      with  discover      life   distant  \
0  1.252763    1.791759     0.000000  0.000000  1.791759  1.252763  0.000000   
1  1.252763    0.000000     0.000000  0.000000  0.000000  1.252763  0.000000   
2  0.000000    0.000000     0.000000  0.000000  0.000000  0.000000  0.000000   
3  0.000000    0.000000     1.791759  1.791759  0.000000  0.000000  1.791759   
4  0.000000    0.000000     0.000000  0.000000  0.000000  0.000000  0.000000   

      ocean     today     stock  ...  experience      have        is  \
0  1.791759  0.000000  0.000000  ...    0.000000  1.252763  0.000000   
1  0.000000  0.000000  0.000000  ...    0.000000  0.000000  1.791759   
2  0.000000  1.791759  1.791759  ...    1.791759  0.000000  0.000000   
3  0.000000  0.000000  0.000000  ...    0.000000  1.252763  0.000000   
4  0.000000  0.000000  0.000000  ...    0.000000  0.000000  0.000000   

     market  formation      deep  searching    planet        on     trade  
0 

In [13]:
# Save TF-IDF results to a CSV file (optional)
# df_tfidf.to_csv("tfidf_custom_preprocessed_news.csv", index=False)

# Using Libraries for Lemmatization and TF-IDF

In [16]:
# if your machine doesn't have these libraries, you need to install them
import pandas as pd
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

# Download the 'punkt' resource through NLTK's downloader.
# NOTE(review): presumably needed by nltk.word_tokenize, which the NLTK-based
# preprocess_text later in this notebook calls; confirm for the installed NLTK version.
nltk.download('punkt')

ModuleNotFoundError: No module named 'nltk'

In [None]:

# Download the NLTK resources this section needs, once each:
# 'wordnet' for the lemmatizer and 'stopwords' for the stopword lists
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
# Create the lemmatizer and the Porter stemmer
lemmatizer = WordNetLemmatizer()
ps = PorterStemmer()

# Tokenize documents into words (terms), remove punctuation and stopwords, and stem
def preprocess_text(document):
    """Normalize one document into a space-separated string of stemmed terms.

    Pipeline, in order:
      1. tokenize with ``nltk.word_tokenize``;
      2. strip leading/trailing punctuation and lowercase each token;
      3. drop tokens that are empty (pure punctuation) or English stopwords;
      4. stem the remaining tokens with the Porter stemmer ``ps``.

    Stopwords are removed *before* stemming: the stopword list holds unstemmed
    lowercase words, so a stemmed form would no longer match its list entry
    and would slip through the filter.

    Args:
        document: The text of a single document.

    Returns:
        The surviving stems joined by single spaces (an empty string when
        nothing survives).
    """
    # Fetch the stopword list once per call and keep it as a set, rather than
    # re-reading it and scanning a list for every token.
    stop_words = set(stopwords.words('english'))

    terms = []
    for token in nltk.word_tokenize(document):
        token = token.strip(string.punctuation).lower()
        if not token or token in stop_words:
            continue
        terms.append(ps.stem(token))
    return ' '.join(terms)

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/nattkorat/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nattkorat/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/nattkorat/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Preprocess the text in the documents: preprocess_text returns one
# space-separated string of terms per document, the input format that
# fit_transform is given in the next cell
preprocessed_documents = [preprocess_text(document) for document in documents]

# Create a TfidfVectorizer instance
vectorizer = TfidfVectorizer()

In [None]:
# Fit the vectorizer on the preprocessed documents and transform them into a
# TF-IDF matrix
tfidf_matrix = vectorizer.fit_transform(preprocessed_documents)

# Turn the matrix into a DataFrame whose columns are the vectorizer's
# feature names
df_tfidf = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    columns=vectorizer.get_feature_names_out(),
)

In [None]:
# Show the heading and the TF-IDF table on separate lines
print("TF-IDF:", df_tfidf, sep="\n")

TF-IDF:
    ancient   announc  astronom     chang    climat    combat     deep  \
0  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.37007   
1  0.339992  0.000000  0.000000  0.000000  0.000000  0.000000  0.00000   
2  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.00000   
3  0.000000  0.000000  0.377964  0.000000  0.000000  0.000000  0.00000   
4  0.000000  0.387757  0.000000  0.387757  0.387757  0.387757  0.00000   

    discov   distant      drop  ...  scientist    search      sign  signific  \
0  0.37007  0.000000  0.000000  ...    0.37007  0.000000  0.000000  0.000000   
1  0.00000  0.000000  0.000000  ...    0.00000  0.339992  0.339992  0.000000   
2  0.00000  0.000000  0.377964  ...    0.00000  0.000000  0.000000  0.377964   
3  0.00000  0.377964  0.000000  ...    0.00000  0.000000  0.000000  0.000000   
4  0.00000  0.000000  0.000000  ...    0.00000  0.000000  0.000000  0.000000   

     speci      star     stock     today     trade     unusu  
0  