In [62]:
import pandas as pd
import math #important library , calculate log
import string

In [105]:
# Sample collection of documents: five short, single-sentence news-style texts.
# A document's position in this list is the index used for the per-document
# count lists built later in the notebook (tf_values[term][i]).
documents = [
    "Scientists have discovered a new species of marine life in the deep ocean.",
    "NASA's Mars rover is searching for signs of ancient life on the Red Planet.",
    "The stock market experienced a significant drop in trading today.",
    "Astronomers have identified a distant galaxy with unusual star formations.",
    "The government announced new measures to combat climate change."
]

In [106]:
# Create a dictionary for lemmatization (a simple example, not comprehensive).
# Keys are lower-cased surface forms; values are the form they are replaced by.
# Words without an entry are left unchanged by the `.get(term, term)` lookups.
# Keep every key unique: a repeated key in a dict literal is silently
# overridden by its last occurrence, so an earlier duplicate never takes effect.
lemmatization_dict = {
    "species": "species",
    "oceans": "ocean",
    "ocean's": "ocean",
    "rover": "rover",
    "discovered": "discover",
    "experienced": "experience",
    "rovers": "rover",
    "trading": "trade",
    "identified": "identify",
    "identifies": "identify",
    "formations": "formation",
    "governments": "government",
    "measures": "measure",
}

In [114]:
# Replacement forms from the lemma table, in table order. Looking a key up in
# its own dict always hits, so this is simply the dict's values.
terms = list(lemmatization_dict.values())

In [115]:
# Tokenize documents into words (terms), remove punctuation, and lemmatize
def preprocess_text(document):
    """Turn one raw document string into a list of normalized terms.

    Steps, applied to each whitespace-separated token:
      1. lower-case the text,
      2. strip leading/trailing ASCII punctuation (inner characters such as
         the apostrophe in "nasa's" are kept),
      3. drop tokens that are empty after stripping (e.g. a stand-alone "-"),
      4. replace the token with its entry in ``lemmatization_dict`` when one
         exists, otherwise keep it unchanged.

    Parameters
    ----------
    document : str
        Raw text of a single document.

    Returns
    -------
    list of str
        The normalized terms in their original order (duplicates preserved).
    """
    terms = []
    for raw_token in document.lower().split():
        token = raw_token.strip(string.punctuation)
        if not token:
            # Punctuation-only token: nothing left to index.
            continue
        terms.append(lemmatization_dict.get(token, token))
    return terms

In [116]:
# Create a set of unique terms (vocabulary): the union of the preprocessed
# terms of every document.
vocabulary = set()
for document in documents:
    vocabulary |= set(preprocess_text(document))

In [119]:
# Term frequency (TF) table: for every vocabulary term, one raw occurrence
# count per document, stored at the document's index in `documents`.
tf_values = {}
for term in vocabulary:
    tf_values[term] = [0] * len(documents)

# Count each occurrence of each term in each document
for i, document in enumerate(documents):
    for term in preprocess_text(document):
        tf_values[term][i] = tf_values[term][i] + 1

tf_values

{'government': [0, 0, 0, 0, 1],
 'drop': [0, 0, 1, 0, 0],
 'scientists': [1, 0, 0, 0, 0],
 'trade': [0, 0, 1, 0, 0],
 'to': [0, 0, 0, 0, 1],
 'in': [1, 0, 1, 0, 0],
 'signs': [0, 1, 0, 0, 0],
 'identify': [0, 0, 0, 1, 0],
 'is': [0, 1, 0, 0, 0],
 'market': [0, 0, 1, 0, 0],
 'discover': [1, 0, 0, 0, 0],
 'planet': [0, 1, 0, 0, 0],
 'unusual': [0, 0, 0, 1, 0],
 "nasa's": [0, 1, 0, 0, 0],
 'have': [1, 0, 0, 1, 0],
 'red': [0, 1, 0, 0, 0],
 'mars': [0, 1, 0, 0, 0],
 'change': [0, 0, 0, 0, 1],
 'announced': [0, 0, 0, 0, 1],
 'stock': [0, 0, 1, 0, 0],
 'a': [1, 0, 1, 1, 0],
 'galaxy': [0, 0, 0, 1, 0],
 'new': [1, 0, 0, 0, 1],
 'significant': [0, 0, 1, 0, 0],
 'climate': [0, 0, 0, 0, 1],
 'life': [1, 1, 0, 0, 0],
 'today': [0, 0, 1, 0, 0],
 'marine': [1, 0, 0, 0, 0],
 'on': [0, 1, 0, 0, 0],
 'astronomers': [0, 0, 0, 1, 0],
 'star': [0, 0, 0, 1, 0],
 'for': [0, 1, 0, 0, 0],
 'searching': [0, 1, 0, 0, 0],
 'distant': [0, 0, 0, 1, 0],
 'formation': [0, 0, 0, 1, 0],
 'measure': [0, 0, 0, 0, 1],
 

In [120]:
# Calculate DF: the number of documents in which each vocabulary term occurs
total_documents = len(documents)
df_values = {term: 0 for term in vocabulary}
for document in documents:
    # Preprocess once per document (not once per vocabulary term) and keep the
    # terms in a set so each membership check below is a hash lookup.
    document_terms = set(preprocess_text(document))
    for term in vocabulary:
        if term in document_terms:
            df_values[term] += 1

# Calculate Inverse Document Frequency (IDF): log_base(total_documents / df)
# NOTE(review): base 100 is an unusual choice. The TF-IDF table displayed
# further down (e.g. 0.69897 for a term found in 1 of 5 documents) matches
# base 10, so that output looks stale relative to this cell -- confirm which
# base is intended and re-run the notebook top to bottom.
IDF_LOG_BASE = 100
idf_values = {
    term: math.log(total_documents / df_values[term], IDF_LOG_BASE)
    for term in vocabulary
}

idf_values

{'government': 0.3494850021680094,
 'drop': 0.3494850021680094,
 'scientists': 0.3494850021680094,
 'trade': 0.3494850021680094,
 'to': 0.3494850021680094,
 'in': 0.1989700043360188,
 'signs': 0.3494850021680094,
 'identify': 0.3494850021680094,
 'is': 0.3494850021680094,
 'market': 0.3494850021680094,
 'discover': 0.3494850021680094,
 'planet': 0.3494850021680094,
 'unusual': 0.3494850021680094,
 "nasa's": 0.3494850021680094,
 'have': 0.1989700043360188,
 'red': 0.3494850021680094,
 'mars': 0.3494850021680094,
 'change': 0.3494850021680094,
 'announced': 0.3494850021680094,
 'stock': 0.3494850021680094,
 'a': 0.11092437480817818,
 'galaxy': 0.3494850021680094,
 'new': 0.1989700043360188,
 'significant': 0.3494850021680094,
 'climate': 0.3494850021680094,
 'life': 0.1989700043360188,
 'today': 0.3494850021680094,
 'marine': 0.3494850021680094,
 'on': 0.3494850021680094,
 'astronomers': 0.3494850021680094,
 'star': 0.3494850021680094,
 'for': 0.3494850021680094,
 'searching': 0.34948500

In [85]:
# Calculate TF-IDF values: one row per document, one entry per vocabulary term
# (raw term count * IDF). The entries follow the iteration order of
# `vocabulary`, which is the same order `list(vocabulary)` yields for the
# DataFrame columns as long as the set is not modified in between.
# The documents are not preprocessed again here: the per-document counts are
# already available in `tf_values`.
tfidf_values = [
    [tf_values[term][i] * idf_values[term] for term in vocabulary]
    for i in range(len(documents))
]

In [86]:
# Wrap the TF-IDF rows in a DataFrame: one row per document, one column per
# vocabulary term.
df_tfidf = pd.DataFrame(data=tfidf_values, columns=[*vocabulary])

# Show the heading and the table as plain text
print("TF-IDF:", df_tfidf, sep="\n")

TF-IDF:
   government     drop  scientists    trade       to       in    signs  \
0     0.00000  0.00000     0.69897  0.00000  0.00000  0.39794  0.00000   
1     0.00000  0.00000     0.00000  0.00000  0.00000  0.00000  0.69897   
2     0.00000  0.69897     0.00000  0.69897  0.00000  0.39794  0.00000   
3     0.00000  0.00000     0.00000  0.00000  0.00000  0.00000  0.00000   
4     0.69897  0.00000     0.00000  0.00000  0.69897  0.00000  0.00000   

   identify       is   market  ...     deep    ocean  experience  ancient  \
0   0.00000  0.00000  0.00000  ...  0.69897  0.69897     0.00000  0.00000   
1   0.00000  0.69897  0.00000  ...  0.00000  0.00000     0.00000  0.69897   
2   0.00000  0.00000  0.69897  ...  0.00000  0.00000     0.69897  0.00000   
3   0.69897  0.00000  0.00000  ...  0.00000  0.00000     0.00000  0.00000   
4   0.00000  0.00000  0.00000  ...  0.00000  0.00000     0.00000  0.00000   

        of     with      the    rover  species   combat  
0  0.39794  0.00000  0.096

In [87]:
# Save TF-IDF results to a CSV file (optional)
# df_tfidf.to_csv("tfidf_custom_preprocessed_news.csv", index=False)

# Using Libraries for Stemming/Lemmatization and TF-IDF

In [88]:
# if your machine doesn't have these libraries, you need to install them
import pandas as pd
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

# Download NLTK's 'punkt' tokenizer data (presumably needed by the
# nltk.word_tokenize call used in a later cell -- TODO confirm for the
# installed NLTK version).
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\norak\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [89]:

# Download the NLTK corpora this section uses: WordNet (for the lemmatizer)
# and the stopword lists. Each resource is requested once.
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
# Create the lemmatizer and the Porter stemmer. Only the stemmer (`ps`) is
# used by the preprocessing function defined in this cell.
lemmatizer = WordNetLemmatizer()
ps = PorterStemmer()

# Tokenize documents into words (terms), remove punctuation and stopwords, and stem
def preprocess_text(document):
    """Normalize one document into a space-separated string of word stems.

    NOTE: this redefines the `preprocess_text` from the hand-rolled TF-IDF
    section and returns a *string* instead of a list, so the earlier cells
    must not be re-run after this one without re-running the first definition.

    Steps:
      1. tokenize with ``nltk.word_tokenize``,
      2. strip leading/trailing ASCII punctuation and lower-case each token,
      3. drop empty tokens and English stopwords,
      4. stem the remaining tokens with the Porter stemmer ``ps``.

    Stopwords are removed *before* stemming: the stopword list contains full
    word forms, so a stemmed stopword (e.g. "has" -> "ha") would no longer
    match it.

    Parameters
    ----------
    document : str
        Raw text of a single document.

    Returns
    -------
    str
        The kept stems joined by single spaces (no leading/trailing space).
    """
    # Load the stopword list once per call and use a set for O(1) lookups.
    english_stopwords = set(stopwords.words('english'))
    kept_stems = []
    for raw_token in nltk.word_tokenize(document):
        token = raw_token.strip(string.punctuation).lower()
        if not token or token in english_stopwords:
            continue
        kept_stems.append(ps.stem(token))
    return ' '.join(kept_stems)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\norak\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\norak\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\norak\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [90]:
# Run every raw document through the preprocessing function and show the result
preprocessed_documents = list(map(preprocess_text, documents))
print(preprocessed_documents)
# Vectorizer built with its default constructor arguments
vectorizer = TfidfVectorizer()

['scientist discov new speci marin life deep ocean ', 'nasa mar rover search sign ancient life red planet ', 'stock market experienc signific drop trade today ', 'astronom identifi distant galaxi unusu star format ', 'govern announc new measur combat climat chang ']


In [91]:
# Fit and transform the preprocessed documents to compute TF-IDF values
tfidf_matrix = vectorizer.fit_transform(preprocessed_documents)

# Convert the TF-IDF matrix to a DataFrame with one column per feature name.
# Note: this rebinds `df_tfidf`, replacing the DataFrame built in the
# hand-rolled TF-IDF section above.
df_tfidf = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())

In [92]:
# Show the heading followed by the library-computed TF-IDF table
print("TF-IDF:", df_tfidf, sep="\n")

TF-IDF:
    ancient   announc  astronom     chang    climat    combat     deep  \
0  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.37007   
1  0.339992  0.000000  0.000000  0.000000  0.000000  0.000000  0.00000   
2  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.00000   
3  0.000000  0.000000  0.377964  0.000000  0.000000  0.000000  0.00000   
4  0.000000  0.387757  0.000000  0.387757  0.387757  0.387757  0.00000   

    discov   distant      drop  ...  scientist    search      sign  signific  \
0  0.37007  0.000000  0.000000  ...    0.37007  0.000000  0.000000  0.000000   
1  0.00000  0.000000  0.000000  ...    0.00000  0.339992  0.339992  0.000000   
2  0.00000  0.000000  0.377964  ...    0.00000  0.000000  0.000000  0.377964   
3  0.00000  0.377964  0.000000  ...    0.00000  0.000000  0.000000  0.000000   
4  0.00000  0.000000  0.000000  ...    0.00000  0.000000  0.000000  0.000000   

     speci      star     stock     today     trade     unusu  
0  

In [93]:
# Shell escape: stage every change under the current working directory.
!git add .



In [94]:
# Shell escape: commit the staged changes with the message "Submitted".
!git commit -m "Submitted"

[main 3bdc70a] Submitted
 1 file changed, 101 insertions(+), 23 deletions(-)


In [95]:
# Shell escape: push the local commits to the configured remote.
!git push

To https://github.com/IR-CSGen8/lab3-tf-idf-Norakpichit.git
   cf05eff..3bdc70a  main -> main
