In [72]:
import pandas as pd
import math #important library , calculate log
import string
import numpy as np

In [102]:
# Toy corpus: five single-sentence documents. The position of each sentence in
# this list is the document index used by the per-document TF lists and the
# rows of the TF-IDF table built later in the notebook.
documents = [
    "Scientists have discovered a new species of marine life in the deep ocean.",
    "NASA's Mars rover is searching for signs of ancient life on the Red Planet.",
    "The stock market experienced a significant drop in trading today.",
    "Astronomers have identified a distant galaxy with unusual star formations.",
    "The government announced new measures to combat climate change."
]


In [103]:
# Hand-written lookup table mapping an inflected word form to its lemma.
# It is a small illustrative sample, not a comprehensive lemmatizer: any word
# that is missing from the table is left unchanged by the lookup.
# Each key must appear only once -- in a dict literal a repeated key silently
# overwrites the earlier entry.
lemmatization_dict = {
    "species": "species",  # singular and plural share the same form
    "oceans": "ocean",
    "ocean's": "ocean",
    "rover": "rover",
    "discovered": "discover",
    "experienced": "experience",
    "rovers": "rover",
    "trading": "trade",
    "identified": "identify",
    "identifies": "identify",
    "formations": "formation",
    "governments": "government",
    "measures": "measure",
}

lemmatization_dict

{'species': 'species',
 'oceans': 'ocean',
 "ocean's": 'ocean',
 'rover': 'rover',
 'discovered': 'discover',
 'experienced': 'experience',
 'rovers': 'rover',
 'trading': 'trade',
 'identified': 'identify',
 'identifies': 'identify',
 'formations': 'formation',
 'governments': 'government',
 'measures': 'measure'}

In [None]:
# terms = [lemmatization_dict.get(term, term) for term in terms]
# terms

In [104]:
# Tokenize a document into words (terms), remove punctuation, and lemmatize
def preprocess_text(document, lemmas=None):
    """Turn one raw document string into a list of normalized terms.

    Steps, in order:
      1. lowercase the text and split it on whitespace;
      2. trim leading/trailing punctuation from every token (punctuation
         inside a token, such as the apostrophe in "nasa's", is kept);
      3. drop tokens that are empty after trimming (e.g. a lone "--");
      4. replace each token by its lemma if the lookup table has one.

    Parameters
    ----------
    document : str
        Raw text of a single document.
    lemmas : dict[str, str] | None
        Optional word -> lemma table. When omitted, the notebook-level
        ``lemmatization_dict`` is used.

    Returns
    -------
    list[str]
        Normalized terms in their original order (duplicates preserved).
    """
    if lemmas is None:
        # Resolved at call time so edits to the notebook-level table are picked up.
        lemmas = lemmatization_dict
    trimmed = (token.strip(string.punctuation) for token in document.lower().split())
    # `if token` discards tokens that consisted solely of punctuation.
    return [lemmas.get(token, token) for token in trimmed if token]

In [105]:
# Vocabulary: every distinct term that appears in any preprocessed document
vocabulary = set().union(*(preprocess_text(text) for text in documents))
vocabulary

{'a',
 'ancient',
 'announced',
 'astronomers',
 'change',
 'climate',
 'combat',
 'deep',
 'discover',
 'distant',
 'drop',
 'experience',
 'for',
 'formation',
 'galaxy',
 'government',
 'have',
 'identify',
 'in',
 'is',
 'life',
 'marine',
 'market',
 'mars',
 'measure',
 "nasa's",
 'new',
 'ocean',
 'of',
 'on',
 'planet',
 'red',
 'rover',
 'scientists',
 'searching',
 'significant',
 'signs',
 'species',
 'star',
 'stock',
 'the',
 'to',
 'today',
 'trade',
 'unusual',
 'with'}

In [127]:
# Term-frequency table: one entry per vocabulary term, holding a separate
# zero-initialised counter list with one slot per document
tf_values = {word: [0 for _ in documents] for word in vocabulary}
tf_values

{'searching': [0, 0, 0, 0, 0],
 'formation': [0, 0, 0, 0, 0],
 'mars': [0, 0, 0, 0, 0],
 'scientists': [0, 0, 0, 0, 0],
 'with': [0, 0, 0, 0, 0],
 'life': [0, 0, 0, 0, 0],
 'trade': [0, 0, 0, 0, 0],
 'unusual': [0, 0, 0, 0, 0],
 'astronomers': [0, 0, 0, 0, 0],
 'change': [0, 0, 0, 0, 0],
 'galaxy': [0, 0, 0, 0, 0],
 'drop': [0, 0, 0, 0, 0],
 'have': [0, 0, 0, 0, 0],
 'deep': [0, 0, 0, 0, 0],
 'ancient': [0, 0, 0, 0, 0],
 'climate': [0, 0, 0, 0, 0],
 'new': [0, 0, 0, 0, 0],
 'rover': [0, 0, 0, 0, 0],
 'ocean': [0, 0, 0, 0, 0],
 'the': [0, 0, 0, 0, 0],
 'measure': [0, 0, 0, 0, 0],
 'for': [0, 0, 0, 0, 0],
 'identify': [0, 0, 0, 0, 0],
 'experience': [0, 0, 0, 0, 0],
 'market': [0, 0, 0, 0, 0],
 'in': [0, 0, 0, 0, 0],
 'significant': [0, 0, 0, 0, 0],
 'discover': [0, 0, 0, 0, 0],
 'star': [0, 0, 0, 0, 0],
 'species': [0, 0, 0, 0, 0],
 'is': [0, 0, 0, 0, 0],
 'on': [0, 0, 0, 0, 0],
 'distant': [0, 0, 0, 0, 0],
 'combat': [0, 0, 0, 0, 0],
 'a': [0, 0, 0, 0, 0],
 'signs': [0, 0, 0, 0, 0],
 '

In [128]:
# Calculate Term Frequency (TF): the raw number of times each term occurs in
# each document (tf_values[term][i] is the count of `term` in documents[i]).
# The counters are rebuilt from zero here so that re-running this cell gives
# the same result instead of adding on top of counts from a previous run.
tf_values = {term: [0] * len(documents) for term in vocabulary}
for i, document in enumerate(documents):
    terms = preprocess_text(document)
    for term in terms:
        tf_values[term][i] += 1
tf_values

{'searching': [0, 1, 0, 0, 0],
 'formation': [0, 0, 0, 1, 0],
 'mars': [0, 1, 0, 0, 0],
 'scientists': [1, 0, 0, 0, 0],
 'with': [0, 0, 0, 1, 0],
 'life': [1, 1, 0, 0, 0],
 'trade': [0, 0, 1, 0, 0],
 'unusual': [0, 0, 0, 1, 0],
 'astronomers': [0, 0, 0, 1, 0],
 'change': [0, 0, 0, 0, 1],
 'galaxy': [0, 0, 0, 1, 0],
 'drop': [0, 0, 1, 0, 0],
 'have': [1, 0, 0, 1, 0],
 'deep': [1, 0, 0, 0, 0],
 'ancient': [0, 1, 0, 0, 0],
 'climate': [0, 0, 0, 0, 1],
 'new': [1, 0, 0, 0, 1],
 'rover': [0, 1, 0, 0, 0],
 'ocean': [1, 0, 0, 0, 0],
 'the': [1, 1, 1, 0, 1],
 'measure': [0, 0, 0, 0, 1],
 'for': [0, 1, 0, 0, 0],
 'identify': [0, 0, 0, 1, 0],
 'experience': [0, 0, 1, 0, 0],
 'market': [0, 0, 1, 0, 0],
 'in': [1, 0, 1, 0, 0],
 'significant': [0, 0, 1, 0, 0],
 'discover': [1, 0, 0, 0, 0],
 'star': [0, 0, 0, 1, 0],
 'species': [1, 0, 0, 0, 0],
 'is': [0, 1, 0, 0, 0],
 'on': [0, 1, 0, 0, 0],
 'distant': [0, 0, 0, 1, 0],
 'combat': [0, 0, 0, 0, 1],
 'a': [1, 0, 1, 1, 0],
 'signs': [0, 1, 0, 0, 0],
 '

In [137]:
# Calculate Inverse Document Frequency (IDF)
#   idf(term) = ln(N / (df(term) + 1))
# where N is the number of documents and df(term) is the number of documents
# containing the term. With this "+1" variant a term found in N-1 documents
# scores exactly 0, and a term found in every document scores below 0.
idf_values = {}
total_documents = len(documents)
# Tokenize each document once (as a set for fast membership tests) instead of
# re-running the preprocessing for every vocabulary term.
document_term_sets = [set(preprocess_text(document)) for document in documents]
for term in vocabulary:
    document_occurences = sum(1 for term_set in document_term_sets if term in term_set)
    idf_values[term] = math.log(total_documents / (document_occurences+1))
    print(term,":",idf_values[term])


searching : 0.9162907318741551
formation : 0.9162907318741551
mars : 0.9162907318741551
scientists : 0.9162907318741551
with : 0.9162907318741551
life : 0.5108256237659907
trade : 0.9162907318741551
unusual : 0.9162907318741551
astronomers : 0.9162907318741551
change : 0.9162907318741551
galaxy : 0.9162907318741551
drop : 0.9162907318741551
have : 0.5108256237659907
deep : 0.9162907318741551
ancient : 0.9162907318741551
climate : 0.9162907318741551
new : 0.5108256237659907
rover : 0.9162907318741551
ocean : 0.9162907318741551
the : 0.0
measure : 0.9162907318741551
for : 0.9162907318741551
identify : 0.9162907318741551
experience : 0.9162907318741551
market : 0.9162907318741551
in : 0.5108256237659907
significant : 0.9162907318741551
discover : 0.9162907318741551
star : 0.9162907318741551
species : 0.9162907318741551
is : 0.9162907318741551
on : 0.9162907318741551
distant : 0.9162907318741551
combat : 0.9162907318741551
a : 0.22314355131420976
signs : 0.9162907318741551
red : 0.91629073

In [138]:
# Calculate TF-IDF values: one row per document, one entry per vocabulary term,
# each entry being tf(term, document) * idf(term).
# Every row iterates `vocabulary` in the same order, so column j refers to the
# same term in all rows (and matches list(vocabulary) as long as the set is
# not modified in between).
tfidf_values = [
    [tf_values[term][i] * idf_values[term] for term in vocabulary]
    for i in range(len(documents))
]
tfidf_values

[[0.0,
  0.0,
  0.0,
  0.9162907318741551,
  0.0,
  0.5108256237659907,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.5108256237659907,
  0.9162907318741551,
  0.0,
  0.0,
  0.5108256237659907,
  0.0,
  0.9162907318741551,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.5108256237659907,
  0.0,
  0.9162907318741551,
  0.0,
  0.9162907318741551,
  0.0,
  0.0,
  0.0,
  0.0,
  0.22314355131420976,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.5108256237659907,
  0.9162907318741551,
  0.0,
  0.0],
 [0.9162907318741551,
  0.0,
  0.9162907318741551,
  0.0,
  0.0,
  0.5108256237659907,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.9162907318741551,
  0.0,
  0.0,
  0.9162907318741551,
  0.0,
  0.0,
  0.0,
  0.9162907318741551,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.9162907318741551,
  0.9162907318741551,
  0.0,
  0.0,
  0.0,
  0.9162907318741551,
  0.9162907318741551,
  0.9162907318741551,
  0.9162907318741551,
  0.0,
  0.0,
  0.0,
  0.5108256237659907,


In [140]:
# Wrap the TF-IDF rows in a DataFrame: one row per document, one column per term
df_tfidf = pd.DataFrame(data=tfidf_values, columns=[*vocabulary])

# Show the scores of a single term across all documents
print("TF-IDF:")
df_tfidf.loc[:, 'scientists']

TF-IDF:


0    0.916291
1    0.000000
2    0.000000
3    0.000000
4    0.000000
Name: scientists, dtype: float64

In [111]:
# Save TF-IDF results to a CSV file (optional).
# The file is written relative to the current working directory;
# index=False leaves the row index (document number) out of the file.
df_tfidf.to_csv("tfidf_custom_preprocessed_news.csv", index=False)

# Using Libraries for Lemmatization and TF-IDF

In [79]:
# if your machine doesn't have these libraries, you need to install them
import pandas as pd
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

# Download the 'punkt' tokenizer data from NLTK; nltk.word_tokenize is called
# later in this notebook.
# NOTE(review): newer NLTK releases may look for a resource named 'punkt_tab'
# instead -- confirm against the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\davan\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [83]:

# Download the NLTK resources used in this section. Each resource is requested
# once; NLTK reports "already up-to-date" when it is present locally.
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
# NOTE(review): `lemmatizer` is created here, but the preprocessing function in
# this cell stems with `ps` and never calls the lemmatizer -- confirm which
# one is intended.
lemmatizer = WordNetLemmatizer()
ps = PorterStemmer()

# Tokenize a document, remove punctuation and stopwords, and stem what is left
def preprocess_text(document):
    """Normalize one document and return it as a single space-joined string.

    Steps, in order:
      1. tokenize with ``nltk.word_tokenize``;
      2. trim leading/trailing punctuation and lowercase each token;
      3. drop empty tokens and English stopwords;
      4. reduce each remaining token to its Porter stem.

    Stopwords are removed *before* stemming: the stopword list holds full
    word forms, so filtering stemmed tokens would let stems that are not in
    the list (for instance the stem of "was") pass through.

    NOTE(review): this definition replaces the earlier ``preprocess_text``
    in this notebook, which returns a list of terms rather than a string.
    Re-running the earlier cells after this one will therefore use this
    version -- consider giving the two functions distinct names.

    Parameters
    ----------
    document : str
        Raw text of a single document.

    Returns
    -------
    str
        Stemmed, stopword-free tokens joined by single spaces.
    """
    # Build the lookup once per call; a set gives constant-time membership tests.
    stop_words = set(stopwords.words('english'))
    cleaned = (
        token.strip(string.punctuation).lower()
        for token in nltk.word_tokenize(document)
    )
    kept = [token for token in cleaned if token and token not in stop_words]
    return ' '.join(ps.stem(token) for token in kept)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\davan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\davan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\davan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [84]:
# Run every raw document through the preprocessing function
preprocessed_documents = list(map(preprocess_text, documents))

# Vectorizer with default settings; it is fitted in a later cell
vectorizer = TfidfVectorizer()

In [85]:
# Learn the vocabulary and IDF weights from the preprocessed documents and
# compute their TF-IDF matrix in a single step
tfidf_matrix = vectorizer.fit_transform(preprocessed_documents)
# Expand the matrix into a dense DataFrame whose columns are labelled with
# the vectorizer's feature names
df_tfidf = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    columns=vectorizer.get_feature_names_out(),
)

In [86]:
# Print a heading followed by the plain-text rendering of the TF-IDF table
print("TF-IDF:", df_tfidf, sep="\n")

TF-IDF:
    ancient   announc  astronom     chang    climat    combat     deep  \
0  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.37007   
1  0.339992  0.000000  0.000000  0.000000  0.000000  0.000000  0.00000   
2  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.00000   
3  0.000000  0.000000  0.377964  0.000000  0.000000  0.000000  0.00000   
4  0.000000  0.387757  0.000000  0.387757  0.387757  0.387757  0.00000   

    discov   distant      drop  ...  scientist    search      sign  signific  \
0  0.37007  0.000000  0.000000  ...    0.37007  0.000000  0.000000  0.000000   
1  0.00000  0.000000  0.000000  ...    0.00000  0.339992  0.339992  0.000000   
2  0.00000  0.000000  0.377964  ...    0.00000  0.000000  0.000000  0.377964   
3  0.00000  0.377964  0.000000  ...    0.00000  0.000000  0.000000  0.000000   
4  0.00000  0.000000  0.000000  ...    0.00000  0.000000  0.000000  0.000000   

     speci      star     stock     today     trade     unusu  
0  