In [25]:
import pandas as pd
import math #important library , calculate log
import string

In [2]:
# Toy corpus: five one-sentence news headlines, one string per document.
# The position in this list is the document index used by every later cell.
documents = [
    # 0
    "Scientists have discovered a new species of marine life in the deep ocean.",
    # 1
    "NASA's Mars rover is searching for signs of ancient life on the Red Planet.",
    # 2
    "The stock market experienced a significant drop in trading today.",
    # 3
    "Astronomers have identified a distant galaxy with unusual star formations.",
    # 4
    "The government announced new measures to combat climate change.",
]

In [7]:
# Hand-written lemmatization lookup (a simple example, not comprehensive).
# Keys are lower-cased surface forms, values are the lemma to substitute.
# Words that are not listed are left unchanged by the lookup in preprocess_text.
lemmatization_dict = {
    # "species" is both singular and plural, so it maps to itself.
    # (A second, conflicting "species" entry used to precede this one; in a
    # dict literal the last duplicate key wins, so it never had any effect.)
    "species": "species",
    "oceans": "ocean",
    "ocean's": "ocean",
    "rover": "rover",
    "discovered": "discover",
    "experienced": "experience",
    "rovers": "rover",
    "trading": "trade",
    "identified": "identify",
    "identifies": "identify",
    "formations": "formation",
    "governments": "government",
    "measures": "measure",
}

In [8]:
# terms = [lemmatization_dict.get(term, term) for term in terms]
# terms

In [10]:
# Tokenize documents into words (terms), remove punctuation, and lemmatize
def preprocess_text(document, lemmas=None):
    """Turn one raw document string into a list of normalized terms.

    Steps: lower-case, split on whitespace, strip leading/trailing
    punctuation from each token, drop tokens that end up empty, then
    replace each token by its lemma when the lookup table has one.

    Args:
        document: The raw text of a single document.
        lemmas: Optional mapping from surface form to lemma. Defaults to
            the notebook-level ``lemmatization_dict``.

    Returns:
        A list of terms in their original order (duplicates are kept).
    """
    if lemmas is None:
        lemmas = lemmatization_dict
    # strip() only trims the ends, so inner apostrophes ("nasa's") survive.
    stripped = (token.strip(string.punctuation) for token in document.lower().split())
    # A token made only of punctuation (e.g. "--") strips down to "" and
    # would otherwise show up as an empty term in the vocabulary.
    return [lemmas.get(token, token) for token in stripped if token]

In [14]:
# Vocabulary = union of the preprocessed terms of every document
vocabulary = set().union(*map(preprocess_text, documents))
# Alphabetically ordered copy of the vocabulary
sorted_vocabulary = sorted(vocabulary)

vocabulary

{'a',
 'ancient',
 'announced',
 'astronomers',
 'change',
 'climate',
 'combat',
 'deep',
 'discover',
 'distant',
 'drop',
 'experience',
 'for',
 'formation',
 'galaxy',
 'government',
 'have',
 'identify',
 'in',
 'is',
 'life',
 'marine',
 'market',
 'mars',
 'measure',
 "nasa's",
 'new',
 'ocean',
 'of',
 'on',
 'planet',
 'red',
 'rover',
 'scientists',
 'searching',
 'significant',
 'signs',
 'species',
 'star',
 'stock',
 'the',
 'to',
 'today',
 'trade',
 'unusual',
 'with'}

In [27]:
# Term-frequency table: term -> list of counts, one slot per document, all starting at 0.
# Each term gets its own list object so counts can be updated independently.
tf_values = {term: [0 for _ in documents] for term in vocabulary}

In [28]:
# Calculate Term Frequency (TF): raw count of each term in each document.
# The table is rebuilt from zeros here so that re-running this cell gives the
# same counts instead of adding to the counts left by a previous run.
tf_values = {term: [0] * len(documents) for term in vocabulary}
for i, document in enumerate(documents):
    for term in preprocess_text(document):
        # tf_values[term][i] is the number of times `term` occurs in document i
        tf_values[term][i] += 1
tf_values

{'announced': [0, 0, 0, 0, 1],
 'market': [0, 0, 1, 0, 0],
 'drop': [0, 0, 1, 0, 0],
 'red': [0, 1, 0, 0, 0],
 'ocean': [1, 0, 0, 0, 0],
 'distant': [0, 0, 0, 1, 0],
 'combat': [0, 0, 0, 0, 1],
 'unusual': [0, 0, 0, 1, 0],
 'deep': [1, 0, 0, 0, 0],
 'measure': [0, 0, 0, 0, 1],
 'climate': [0, 0, 0, 0, 1],
 'species': [1, 0, 0, 0, 0],
 'a': [1, 0, 1, 1, 0],
 'have': [1, 0, 0, 1, 0],
 'galaxy': [0, 0, 0, 1, 0],
 'in': [1, 0, 1, 0, 0],
 'star': [0, 0, 0, 1, 0],
 'identify': [0, 0, 0, 1, 0],
 'the': [1, 1, 1, 0, 1],
 'signs': [0, 1, 0, 0, 0],
 'astronomers': [0, 0, 0, 1, 0],
 'scientists': [1, 0, 0, 0, 0],
 'with': [0, 0, 0, 1, 0],
 "nasa's": [0, 1, 0, 0, 0],
 'trade': [0, 0, 1, 0, 0],
 'on': [0, 1, 0, 0, 0],
 'stock': [0, 0, 1, 0, 0],
 'new': [1, 0, 0, 0, 1],
 'experience': [0, 0, 1, 0, 0],
 'life': [1, 1, 0, 0, 0],
 'change': [0, 0, 0, 0, 1],
 'planet': [0, 1, 0, 0, 0],
 'ancient': [0, 1, 0, 0, 0],
 'government': [0, 0, 0, 0, 1],
 'mars': [0, 1, 0, 0, 0],
 'rover': [0, 1, 0, 0, 0],
 'sea

In [42]:
# Calculate Inverse Document Frequency (IDF):
#   idf(term) = ln(total_documents / number of documents containing term)
idf_values = {}
total_documents = len(documents)
# Preprocess every document once and keep its terms as a set, instead of
# re-tokenizing each document for every vocabulary term.
document_term_sets = [set(preprocess_text(doc)) for doc in documents]
for term in vocabulary:
    # Every vocabulary term was taken from these documents, so the count is
    # at least 1 and the division below cannot hit zero.
    doc_freq = sum(1 for term_set in document_term_sets if term in term_set)
    idf_values[term] = math.log(total_documents / doc_freq)
idf_values

{'announced': 1.6094379124341003,
 'market': 1.6094379124341003,
 'drop': 1.6094379124341003,
 'red': 1.6094379124341003,
 'ocean': 1.6094379124341003,
 'distant': 1.6094379124341003,
 'combat': 1.6094379124341003,
 'unusual': 1.6094379124341003,
 'deep': 1.6094379124341003,
 'measure': 1.6094379124341003,
 'climate': 1.6094379124341003,
 'species': 1.6094379124341003,
 'a': 0.5108256237659907,
 'have': 0.9162907318741551,
 'galaxy': 1.6094379124341003,
 'in': 0.9162907318741551,
 'star': 1.6094379124341003,
 'identify': 1.6094379124341003,
 'the': 0.22314355131420976,
 'signs': 1.6094379124341003,
 'astronomers': 1.6094379124341003,
 'scientists': 1.6094379124341003,
 'with': 1.6094379124341003,
 "nasa's": 1.6094379124341003,
 'trade': 1.6094379124341003,
 'on': 1.6094379124341003,
 'stock': 1.6094379124341003,
 'new': 0.9162907318741551,
 'experience': 1.6094379124341003,
 'life': 0.9162907318741551,
 'change': 1.6094379124341003,
 'planet': 1.6094379124341003,
 'ancient': 1.60943791

In [43]:
# Calculate TF-IDF values: tfidf(term, doc) = tf(term, doc) * idf(term).
# Result is one row per document; within a row the scores follow the
# iteration order of `vocabulary`, which is also the order used for the
# DataFrame columns later on.
tfidf_values = [
    [tf_values[term][i] * idf_values[term] for term in vocabulary]
    for i in range(len(documents))
]
tfidf_values

[[0.0,
  0.0,
  0.0,
  0.0,
  1.6094379124341003,
  0.0,
  0.0,
  0.0,
  1.6094379124341003,
  0.0,
  0.0,
  1.6094379124341003,
  0.5108256237659907,
  0.9162907318741551,
  0.0,
  0.9162907318741551,
  0.0,
  0.0,
  0.22314355131420976,
  0.0,
  0.0,
  1.6094379124341003,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.9162907318741551,
  0.0,
  0.9162907318741551,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.6094379124341003,
  0.9162907318741551,
  1.6094379124341003,
  0.0,
  0.0,
  0.0,
  0.0],
 [0.0,
  0.0,
  0.0,
  1.6094379124341003,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.22314355131420976,
  1.6094379124341003,
  0.0,
  0.0,
  0.0,
  1.6094379124341003,
  0.0,
  1.6094379124341003,
  0.0,
  0.0,
  0.0,
  0.9162907318741551,
  0.0,
  1.6094379124341003,
  1.6094379124341003,
  0.0,
  1.6094379124341003,
  1.6094379124341003,
  1.6094379124341003,
  0.0,
  0.0,
  0.0,
  0.9162907318741551,
  0.0,
  0.0,

In [44]:
# Wrap the TF-IDF rows in a DataFrame: one row per document, one column per term
df_tfidf = pd.DataFrame(data=tfidf_values, columns=[*vocabulary])

# Print a caption followed by the table
print("TF-IDF:", df_tfidf, sep="\n")

TF-IDF:
   announced    market      drop       red     ocean   distant    combat  \
0   0.000000  0.000000  0.000000  0.000000  1.609438  0.000000  0.000000   
1   0.000000  0.000000  0.000000  1.609438  0.000000  0.000000  0.000000   
2   0.000000  1.609438  1.609438  0.000000  0.000000  0.000000  0.000000   
3   0.000000  0.000000  0.000000  0.000000  0.000000  1.609438  0.000000   
4   1.609438  0.000000  0.000000  0.000000  0.000000  0.000000  1.609438   

    unusual      deep   measure  ...  searching        to  formation  \
0  0.000000  1.609438  0.000000  ...   0.000000  0.000000   0.000000   
1  0.000000  0.000000  0.000000  ...   1.609438  0.000000   0.000000   
2  0.000000  0.000000  0.000000  ...   0.000000  0.000000   0.000000   
3  1.609438  0.000000  0.000000  ...   0.000000  0.000000   1.609438   
4  0.000000  0.000000  1.609438  ...   0.000000  1.609438   0.000000   

   discover        of    marine  significant       for     today        is  
0  1.609438  0.916291  1.

In [46]:
# Save TF-IDF results to a CSV file (optional)
# df_tfidf.to_csv("tfidf_custom_preprocessed_news.csv", index=False)

# Using Libraries (NLTK, scikit-learn) for Stemming/Lemmatization and TF-IDF

In [None]:
# if your machine doesn't have these libraries, you need to install them
import pandas as pd
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

# Download the tokenizer models needed by nltk.word_tokenize.
# Older NLTK releases load 'punkt'; recent releases look for 'punkt_tab'
# instead, so fetch both to keep the notebook working on either.
nltk.download('punkt')
nltk.download('punkt_tab')

In [None]:

# Download the WordNet and stop-word corpora (each one only needs to be requested once)
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
# Create the lemmatizer and the Porter stemmer instances
lemmatizer = WordNetLemmatizer()
ps = PorterStemmer()

# Tokenize documents into words (terms), remove punctuation and stopwords, then stem
def preprocess_text(document):
    """Normalize one document into a space-separated string of stems.

    Steps: tokenize with NLTK, strip leading/trailing punctuation and
    lower-case each token, drop empty tokens and English stop words, then
    reduce the remaining tokens with the Porter stemmer ``ps``.

    Args:
        document: The raw text of a single document.

    Returns:
        The surviving stems joined by single spaces.
    """
    # Load the stop-word list once per call (not once per token) and use a
    # set for fast membership tests.
    stop_words = set(stopwords.words('english'))
    cleaned = (
        token.strip(string.punctuation).lower()
        for token in nltk.word_tokenize(document)
    )
    # Filter BEFORE stemming: the stop-word list holds full words, so a
    # stemmed form such as "wa" (from "was") would no longer match it.
    # Punctuation-only tokens strip down to "" and are dropped here too.
    kept = [token for token in cleaned if token and token not in stop_words]
    return ' '.join(ps.stem(token) for token in kept)

In [45]:
# Run every document through preprocess_text
preprocessed_documents = list(map(preprocess_text, documents))

# Build a TfidfVectorizer with its default settings
vectorizer = TfidfVectorizer()

NameError: name 'TfidfVectorizer' is not defined

In [24]:
# Learn the vocabulary from the preprocessed documents and compute their TF-IDF matrix
tfidf_matrix = vectorizer.fit_transform(preprocessed_documents)

# Densify the matrix and label the columns with the vectorizer's feature names
df_tfidf = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    columns=vectorizer.get_feature_names_out(),
)

In [25]:
# Print a caption followed by the TF-IDF table
print("TF-IDF:", df_tfidf, sep="\n")

TF-IDF:
    ancient   announc  astronom     chang    climat    combat     deep  \
0  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.37007   
1  0.339992  0.000000  0.000000  0.000000  0.000000  0.000000  0.00000   
2  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.00000   
3  0.000000  0.000000  0.377964  0.000000  0.000000  0.000000  0.00000   
4  0.000000  0.387757  0.000000  0.387757  0.387757  0.387757  0.00000   

    discov   distant      drop  ...  scientist    search      sign  signific  \
0  0.37007  0.000000  0.000000  ...    0.37007  0.000000  0.000000  0.000000   
1  0.00000  0.000000  0.000000  ...    0.00000  0.339992  0.339992  0.000000   
2  0.00000  0.000000  0.377964  ...    0.00000  0.000000  0.000000  0.377964   
3  0.00000  0.377964  0.000000  ...    0.00000  0.000000  0.000000  0.000000   
4  0.00000  0.000000  0.000000  ...    0.00000  0.000000  0.000000  0.000000   

     speci      star     stock     today     trade     unusu  
0  