In [2]:
import pandas as pd
import math #important library , calculate log
import string

In [3]:
# Sample collection of documents
documents = [
    "Scientists have discovered a new species of marine life in the deep ocean.",
    "NASA's Mars rover is searching for signs of ancient life on the Red Planet.",
    "The stock market experienced a significant drop in trading today.",
    "Astronomers have identified a distant galaxy with unusual star formations.",
    "The government announced new measures to combat climate change."
]

In [4]:
# Create a dictionary for lemmatization (a simple example, not comprehensive)
lemmatization_dict = {
    "species": "specie",
    "species": "species",
    "oceans": "ocean",
    "ocean's": "ocean",
    "rover": "rover",
    "discovered":"discover",
    "experienced":"experience",
    "rovers": "rover",
    "trading": "trade",
    "identified": "identify",
    "identifies": "identify",
    "formations": "formation",
    "governments": "government",
    "measures": "measure"
}

In [5]:
terms = [lemmatization_dict.get(term, term) for term in lemmatization_dict]

In [6]:
# Tokenize documents into words (terms), remove punctuation, and lemmatize
def preprocess_text(document):
    terms = document.lower().split()
    terms = [term.strip(string.punctuation) for term in terms]
    terms = [lemmatization_dict.get(term, term) for term in terms]
    return terms

In [7]:
# Create a set of unique terms (vocabulary)
vocabulary = set()
# you code here ...
for document in documents:
    vocabulary.update(preprocess_text(document))

In [11]:
# Create a dictionary to store the term frequency (TF) for each term in each document
tf_values = {term: [0] * len(documents) for term in vocabulary}
# Calculate Term Frequency (TF)
for i, document in enumerate(documents):
   for term in preprocess_text(document):
       tf_values[term][i] = preprocess_text(document).count(term) / len(document)

tf_values

{'searching': [0, 0.013333333333333334, 0, 0, 0],
 'is': [0, 0.013333333333333334, 0, 0, 0],
 'in': [0.013513513513513514, 0, 0.015384615384615385, 0, 0],
 'announced': [0, 0, 0, 0, 0.015873015873015872],
 'scientists': [0.013513513513513514, 0, 0, 0, 0],
 'of': [0.013513513513513514, 0.013333333333333334, 0, 0, 0],
 'drop': [0, 0, 0.015384615384615385, 0, 0],
 'stock': [0, 0, 0.015384615384615385, 0, 0],
 'trade': [0, 0, 0.015384615384615385, 0, 0],
 'identify': [0, 0, 0, 0.013513513513513514, 0],
 'with': [0, 0, 0, 0.013513513513513514, 0],
 'climate': [0, 0, 0, 0, 0.015873015873015872],
 'ocean': [0.013513513513513514, 0, 0, 0, 0],
 'have': [0.013513513513513514, 0, 0, 0.013513513513513514, 0],
 'the': [0.013513513513513514,
  0.013333333333333334,
  0.015384615384615385,
  0,
  0.015873015873015872],
 'formation': [0, 0, 0, 0.013513513513513514, 0],
 'discover': [0.013513513513513514, 0, 0, 0, 0],
 'deep': [0.013513513513513514, 0, 0, 0, 0],
 'life': [0.013513513513513514, 0.013333

In [13]:
# Calculate DF
total_documents = len(documents)
df_values = {term: 0 for term in vocabulary}
for document in documents:
    for term in vocabulary:
        if term in preprocess_text(document):
            df_values[term] += 1
# Calculate Inverse Document Frequency (IDF)
idf_values = {term: 0 for term in vocabulary}

for term in vocabulary:
    idf_values[term] = math.log((total_documents / df_values[term]), 10)

idf_values

{'searching': 0.6989700043360187,
 'is': 0.6989700043360187,
 'in': 0.3979400086720376,
 'announced': 0.6989700043360187,
 'scientists': 0.6989700043360187,
 'of': 0.3979400086720376,
 'drop': 0.6989700043360187,
 'stock': 0.6989700043360187,
 'trade': 0.6989700043360187,
 'identify': 0.6989700043360187,
 'with': 0.6989700043360187,
 'climate': 0.6989700043360187,
 'ocean': 0.6989700043360187,
 'have': 0.3979400086720376,
 'the': 0.0969100130080564,
 'formation': 0.6989700043360187,
 'discover': 0.6989700043360187,
 'deep': 0.6989700043360187,
 'life': 0.3979400086720376,
 'government': 0.6989700043360187,
 "nasa's": 0.6989700043360187,
 'on': 0.6989700043360187,
 'measure': 0.6989700043360187,
 'species': 0.6989700043360187,
 'a': 0.22184874961635637,
 'to': 0.6989700043360187,
 'unusual': 0.6989700043360187,
 'market': 0.6989700043360187,
 'rover': 0.6989700043360187,
 'galaxy': 0.6989700043360187,
 'change': 0.6989700043360187,
 'distant': 0.6989700043360187,
 'mars': 0.698970004336

In [14]:
# Calculate TF-IDF values
tfidf_values = []
for i, document in enumerate(documents):
    terms = preprocess_text(document)
    tfidf_document = []
    for term in vocabulary:
        tf = tf_values[term][i]
        idf = idf_values[term]
        tfidf = tf * idf
        tfidf_document.append(tfidf)
    tfidf_values.append(tfidf_document)

In [15]:
# Convert TF-IDF values to a DataFrame
df_tfidf = pd.DataFrame(tfidf_values, columns=list(vocabulary))

# Display TF-IDF results
print("TF-IDF:")
print(df_tfidf)

TF-IDF:
   searching       is        in  announced  scientists        of      drop  \
0    0.00000  0.00000  0.005378   0.000000    0.009446  0.005378  0.000000   
1    0.00932  0.00932  0.000000   0.000000    0.000000  0.005306  0.000000   
2    0.00000  0.00000  0.006122   0.000000    0.000000  0.000000  0.010753   
3    0.00000  0.00000  0.000000   0.000000    0.000000  0.000000  0.000000   
4    0.00000  0.00000  0.000000   0.011095    0.000000  0.000000  0.000000   

      stock     trade  identify  ...       new      for      star    combat  \
0  0.000000  0.000000  0.000000  ...  0.005378  0.00000  0.000000  0.000000   
1  0.000000  0.000000  0.000000  ...  0.000000  0.00932  0.000000  0.000000   
2  0.010753  0.010753  0.000000  ...  0.000000  0.00000  0.000000  0.000000   
3  0.000000  0.000000  0.009446  ...  0.000000  0.00000  0.009446  0.000000   
4  0.000000  0.000000  0.000000  ...  0.006317  0.00000  0.000000  0.011095   

     marine   planet      red  ancient    signs 

In [130]:
# Save TF-IDF results to a CSV file (optional)
df_tfidf.to_csv("tfidf_custom_preprocessed_news.csv", index=False)

# Using Libraries for Lemmatization and Tf-Idf

In [131]:
# if your machine doesn't have these libraries, you need to install them
import pandas as pd
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

# download the punkt
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\norak\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [132]:

# Initialize NLTK's lemmatizer and download stopwords
nltk.download('wordnet')
nltk.download('stopwords')
# Initialize NLTK's lemmatizer
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
lemmatizer = WordNetLemmatizer()
ps = PorterStemmer()

# Tokenize documents into words (terms), remove punctuation, lemmatize, and remove stopwords
def preprocess_text(document):
    terms = nltk.word_tokenize(document)
    terms = [term.strip(string.punctuation) for term in terms]
    terms = [ps.stem(term) for term in terms]
    terms = [term.lower() for term in terms if term not in stopwords.words('english')]
    return ' '.join(terms)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\norak\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\norak\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\norak\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [133]:
# Preprocess the text in the documents
preprocessed_documents = [preprocess_text(document) for document in documents]
print(preprocessed_documents)
# Create a TfidfVectorizer instance
vectorizer = TfidfVectorizer()

['scientist discov new speci marin life deep ocean ', 'nasa mar rover search sign ancient life red planet ', 'stock market experienc signific drop trade today ', 'astronom identifi distant galaxi unusu star format ', 'govern announc new measur combat climat chang ']


In [134]:
# Fit and transform the preprocessed documents to compute TF-IDF values CADT@0zJanZ!
tfidf_matrix = vectorizer.fit_transform(preprocessed_documents)

# Convert the TF-IDF matrix to a DataFrame
df_tfidf = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())

In [135]:
# Display TF-IDF results
print("TF-IDF:")
print(df_tfidf)

TF-IDF:
    ancient   announc  astronom     chang    climat    combat     deep  \
0  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.37007   
1  0.339992  0.000000  0.000000  0.000000  0.000000  0.000000  0.00000   
2  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.00000   
3  0.000000  0.000000  0.377964  0.000000  0.000000  0.000000  0.00000   
4  0.000000  0.387757  0.000000  0.387757  0.387757  0.387757  0.00000   

    discov   distant      drop  ...  scientist    search      sign  signific  \
0  0.37007  0.000000  0.000000  ...    0.37007  0.000000  0.000000  0.000000   
1  0.00000  0.000000  0.000000  ...    0.00000  0.339992  0.339992  0.000000   
2  0.00000  0.000000  0.377964  ...    0.00000  0.000000  0.000000  0.377964   
3  0.00000  0.377964  0.000000  ...    0.00000  0.000000  0.000000  0.000000   
4  0.00000  0.000000  0.000000  ...    0.00000  0.000000  0.000000  0.000000   

     speci      star     stock     today     trade     unusu  
0  

In [136]:
!git add .



In [137]:
!git commit -m "Submitted"

[main 49d015a] Submitted
 2 files changed, 109 insertions(+), 68 deletions(-)
 create mode 100644 tfidf_custom_preprocessed_news.csv


In [138]:
!git push

To https://github.com/IR-CSGen8/lab3-tf-idf-Norakpichit.git
   3bdc70a..49d015a  main -> main


In [139]:
df = pd.read_csv("tfidf_custom_preprocessed_news.csv")

In [140]:
df

Unnamed: 0,government,drop,scientists,trade,to,in,signs,identify,is,market,...,deep,ocean,experience,ancient,of,with,the,rover,species,combat
0,0.0,0.0,0.69897,0.0,0.0,0.39794,0.0,0.0,0.0,0.0,...,0.69897,0.69897,0.0,0.0,0.39794,0.0,0.09691,0.0,0.69897,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.69897,0.0,0.69897,0.0,...,0.0,0.0,0.0,0.69897,0.39794,0.0,0.09691,0.69897,0.0,0.0
2,0.0,0.69897,0.0,0.69897,0.0,0.39794,0.0,0.0,0.0,0.69897,...,0.0,0.0,0.69897,0.0,0.0,0.0,0.09691,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.69897,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.69897,0.0,0.0,0.0,0.0
4,0.69897,0.0,0.0,0.0,0.69897,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.09691,0.0,0.0,0.69897
