In [None]:
import re

In [None]:
import pandas as pd
input_file = "posts_first_targil.xlsx"

# Read the Excel file with multiple sheets
df = pd.read_excel(input_file, sheet_name=None)

for sheet_name, data in df.items():
    print(f"Sheet name: {sheet_name} Headlines:, {list(data.columns)}")

Sheet name: A-J Headlines:, ['sub_title', 'date', 'Newspaper', 'Body Text', 'title']
Sheet name: BBC Headlines:, ['date', 'Newspaper', 'Body Text', 'title']
Sheet name: J-P Headlines:, ['date', 'Newspaper', 'Body', 'title']
Sheet name: NY-T Headlines:, ['date', 'Newspaper', 'Body Text', 'title']


In [None]:
if "J-P" in df:
    df["J-P"].rename(columns={"Body": "Body Text"}, inplace=True)

# Verify the change
for sheet_name, data in df.items():
    print(f"Sheet name: {sheet_name} Headlines:, {list(data.columns)}")

Sheet name: A-J Headlines:, ['sub_title', 'date', 'Newspaper', 'Body Text', 'title']
Sheet name: BBC Headlines:, ['date', 'Newspaper', 'Body Text', 'title']
Sheet name: J-P Headlines:, ['date', 'Newspaper', 'Body Text', 'title']
Sheet name: NY-T Headlines:, ['date', 'Newspaper', 'Body Text', 'title']


**Function to clean the data text**

In [None]:
def clean_text(text):
    if not isinstance(text, str):
        return ""
    regx = r"((?<!\w)[^\s\w]|[^\s\w](?!\w))"
    dot_pattern = r"(?<!\w)([a-zA-Z]{2,})\.([a-zA-Z]{2,})(?!\w)"
    clean_t = re.sub(regx, r" \1 ", text)
    clean_t = re.sub(dot_pattern, r"\1 . \2", clean_t)
    return re.sub(r"\s+", " ", clean_t).strip()


**Part 2: Functions for processing data by lemmatize the text**

In [None]:
# Load spaCy's language model
import spacy
nlp = spacy.load("en_core_web_sm")

In [None]:
def lemmatize_text(text):
    if not isinstance(text, str):
        return ""
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc])

**Creating the processed files:**

*A.* clean_file.xlsx

*B.* lemma_file.xlsx

**By each of the functions:**

*A.* lemmatize_text()

*B.* clean_text()

In [None]:
clean_sheets = {}
lemma_sheets = {}
for sheet_name, data in df.items():
    # Apply clean_text to all string columns in the DataFrame
    processed_clean_df = data.map(clean_text)
    clean_sheets[sheet_name] = processed_clean_df
    processed_lemma_df = data.map(lemmatize_text)
    lemma_sheets[sheet_name] = processed_lemma_df

# Save each processed sheet to a separate Excel file
output_clean_file = "output_files/clean_file.xlsx"
with pd.ExcelWriter(output_clean_file) as writer:
    for sheet_name, processed_df in clean_sheets.items():
        processed_df.to_excel(writer, sheet_name=sheet_name, index=False)

output_lemma_file = "output_files/lemma_file.xlsx"
with pd.ExcelWriter(output_lemma_file) as writer:
    for sheet_name, processed_df in lemma_sheets.items():
        processed_df.to_excel(writer, sheet_name=sheet_name, index=False)

print(f"Processed clean Excel file saved as: {output_clean_file}")
print(f"Processed lemma Excel file saved as: {output_lemma_file}")


Processed clean Excel file saved as: output_files/clean_file.xlsx
Processed lemma Excel file saved as: output_files/lemma_file.xlsx


***Part 3: using TF-IDF BM25/Okapi***

In [47]:
from rank_bm25 import BM25Okapi
from nltk.corpus import stopwords
from scipy.sparse import csr_matrix
from scipy.sparse import save_npz
import nltk
import os
import pandas as pd



In [49]:
nltk.download('stopwords')

# Initialize the set of English stop words
stop_words = set(stopwords.words('english'))

# Define the function to filter out stopwords from text
def filter_stopwords(text):
    tokens = text.split()  # Split the input into tokens
    return [token.lower() for token in tokens if token.lower() not in stop_words]

# Process lemmatized documents and generate BM25 matrix
all_results = []
input_file = "/content/output/clean_file.xlsx"
output_file = "/content/output/bm25_clean_file.csv"
clean_df = pd.read_excel(input_file, sheet_name=None)
for sheet_name, data in clean_df.items():
    print(f"Processing file: {sheet_name}")

    # Construct corpus by removing stopwords and combining text fields
    if sheet_name == 'A-J':
        documents = [
            filter_stopwords(f'{record["title"]} {record["sub_title"]} {record["Body Text"]}')
            for _, record in data.iterrows()
        ]
    else:
        documents = [
            filter_stopwords(f'{record["title"]} {record["Body Text"]}')
            for _, record in data.iterrows()
        ]
   # Create BM25 model
    bm25_model = BM25Okapi(documents)

    # Get the words from the vocabulary (IDF keys)
    words_in_vocab = bm25_model.idf.keys()

    # Prepare the result data for Excel
    for doc_idx, doc in enumerate(documents):
        # Get BM25 scores for the document
        doc_scores = bm25_model.get_scores(doc)
        # Prepare the row for this document
        row = [sheet_name, doc_idx]
        row.extend(doc_scores)  # Append BM25 scores for the document
        all_results.append(row)

# Create a DataFrame for the results
header = ["Sheet", "RowIndex"] + list(words_in_vocab)
with open(output_file, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(header)
    writer.writerows(all_results)



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Processing file: A-J
Processing file: BBC
Processing file: J-P
Processing file: NY-T


In [None]:
from scipy.sparse import load_npz

# Load the sparse matrix
sparse_matrix = load_npz("bm25/lemma/BBC.npz")

# Inspect the sparse matrix
print("Sparse matrix shape:", sparse_matrix.shape)
print("Non-zero elements:", sparse_matrix.nnz)
print("Matrix contents:")
print(sparse_matrix.toarray())

Sparse matrix shape: (549, 13666)
Non-zero elements: 1471283
Matrix contents:
[[0.         0.         0.         ... 2.94411559 0.         2.94411559]
 [0.         0.         0.         ... 2.90927687 0.         2.90927687]
 [0.         0.         0.         ... 2.94688451 0.         2.94688451]
 ...
 [0.         0.         0.         ... 2.48177516 0.         2.48177516]
 [0.         0.         0.         ... 2.64277891 0.         2.64277891]
 [0.         0.         0.         ... 2.77756136 0.         2.77756136]]


PART 3 - Word2Vec

In [None]:
import pandas as pd
import numpy as np
import string
import re
import gensim.downloader as api
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.data import find
import csv
import nltk

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Daniel\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Daniel\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [40]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.data import find
nltk.download('punkt')
nltk.download('stopwords')
# File paths
input_file = "/content/output/lemma_file.xlsx"  # Replace with your Excel file path
output_file = "/content/output/w2v_lemma_withoutIdf_withoutStopWords.csv"

df = pd.read_excel(input_file, sheet_name=None)

if "J-P" in df:
    df["J-P"].rename(columns={"Body": "Body Text"}, inplace=True)

# Load GloVe vectors via gensim downloader
try:
     word2vec_model = api.load("word2vec-google-news-300")  # 300-dimensional GloVe vectors
except Exception as e:
    print(f"Error loading model: {e}")

# Preprocessing function
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"\d+", "", text)  # Remove digits and dates
    text = text.translate(str.maketrans("", "", string.punctuation))  # Remove punctuation
    tokens = word_tokenize(text)  # Tokenize the text
    stop_words = set(stopwords.words("english"))
    return [word for word in tokens if word not in stop_words]


# Process each sheet
results = []

for sheet_name, data in df.items():
    for index, row in data.iterrows():
        # Combine text from relevant columns
        if sheet_name == 'A-J':
            combined_text = " ".join(str(row[col]) for col in ['title', 'sub_title', 'Body Text'] if pd.notna(row[col]))
        else:
            combined_text = " ".join(str(row[col]) for col in ['title', 'Body Text'] if pd.notna(row[col]))

        # Preprocess text and get tokens
        tokens = preprocess_text(combined_text)
        print(tokens)
        # Extract vectors for each word
        vectors = []
        for word in tokens:
            if word in word2vec_model:
                vectors.append(word2vec_model[word])

        # If there are word vectors for the document, compute the average
        if vectors:
            avg_vector = np.mean(vectors, axis=0)
            results.append([sheet_name, index] + avg_vector.tolist())

# Save results to a CSV file
header = ["Sheet", "RowIndex"] + [f"Dim{i}" for i in range(word2vec_model.vector_size)]
with open(output_file, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(header)
    writer.writerows(results)

print(f"Word vectors saved to {output_file}")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['pope', 'renews', 'call', 'gaza', 'ceasefire', 'release', 'captive', 'easter', 'address', 'pope', 'francis', 'say', 'thought', 'go', 'face', 'war', 'especially', 'child', 'forget', 'smile', 'pope', 'francis', 'renew', 'call', 'immediate', 'ceasefire', 'gaza', 'release', 'israeli', 'captive', 'peace', 'focused', 'address', 'mark', 'easter', 'sunday', 'important', 'day', 'christian', 'calendar']
['biden', 'still', 'good', 'us', 'president', 'israel', 'could', 'wish', 'meaningless', 'ceasefire', 'resolution', 'administration', 'allow', 'un', 'security', 'council', 'pass', 'fool', 'anyone', 'united', 'states', 'president', 'ronald', 'reagan', '’', 'order', 'israeli', 'prime', 'minister', 'menachem', 'begin', 'put', 'end', 'holocaust', 'lebanon', 'perhaps', 'well', 'know', 'political', 'anecdote', 'israel', '’', 'invasion']
['israeli', 'air', 'strike', 'continue', 'across', 'gaza', 'truce', 'talk', 'struggle', 'israeli', 'strike', 'continue', 'kill', 'palestinians', 'pm', 'netanyahu', 'fac

In [None]:
import pandas as pd

# File path
output_file = "output_files/w2v_clean_vectors.csv"  # Replace with your file path

try:
    # Read the first 10 rows of the CSV file
    data = pd.read_csv(output_file)
    print(data.head())
except FileNotFoundError:
    print(f"File '{output_file}' not found. Please check the file path.")

  Sheet  RowIndex       Word      Dim0     Dim1     Dim2     Dim3     Dim4  \
0   A-J         0       pope -0.041353 -0.25456 -0.26952 -0.70652  0.19902   
1   A-J         0     renews  0.061107 -0.43574 -0.22176  0.21351  0.46934   
2   A-J         0       call  0.048021  0.13645 -0.33734  0.24853  0.34074   
3   A-J         0       gaza  0.350890 -0.74485  0.10422  0.16339 -0.60023   
4   A-J         0  ceasefire  1.192300  0.33878  0.36272  0.49263 -0.75752   

       Dim5      Dim6  ...    Dim290   Dim291   Dim292    Dim293   Dim294  \
0  0.124060 -0.288270  ...  0.033295 -0.33427  0.47666  0.276180 -0.17720   
1 -0.057211  0.009017  ... -0.520890  0.29620 -0.10997 -0.852040 -0.26484   
2 -0.310870  0.010738  ... -0.201300 -0.66498 -0.42536  0.077626  0.34800   
3  0.427600 -0.196340  ... -0.932260 -0.82188  0.10704  0.413850  0.33142   
4  0.396450  0.480260  ... -0.780520 -0.33792 -0.48446 -0.538560  0.28775   

    Dim295   Dim296   Dim297    Dim298    Dim299  
0 -0.51197 -0.594

In [None]:
import pandas as pd

# Load the Word2Vec results into a DataFrame
input_file = "output_files/w2v_clean_vectors.csv"
output_file = "output_files/word2vec_mean_vectors.csv"

# Load the word vectors
df = pd.read_csv(input_file)

# Group by Sheet and RowIndex and compute the mean for each dimension
dim_columns = [col for col in df.columns if col.startswith("Dim")]
doc_vectors = (
    df.groupby(["Sheet", "RowIndex"])[dim_columns]
    .mean()
    .reset_index()
)
doc_vectors.to_csv(output_file, index=False)

print(f"Averaged document vectors saved to {output_file}")


Averaged document vectors saved to output_files/word2vec_mean_vectors.csv


In [None]:
import pandas as pd
output_file = "output_files/word2vec_mean_vectors.csv"  # Replace with your file path

try:
    # Read the first 10 rows of the CSV file
    data = pd.read_csv(output_file)
    print(data.head())
except FileNotFoundError:
    print(f"File '{output_file}' not found. Please check the file path.")

  Sheet  RowIndex      Dim0      Dim1      Dim2      Dim3      Dim4      Dim5  \
0   A-J         0 -0.002207 -0.088804  0.013135 -0.075511 -0.040838  0.044297   
1   A-J         1  0.094648 -0.006871 -0.039487  0.051334  0.044967  0.019326   
2   A-J         2  0.142312  0.004286  0.119358 -0.019714 -0.060275  0.050093   
3   A-J         3  0.003800 -0.003572  0.028039 -0.053378 -0.006091 -0.205901   
4   A-J         4 -0.185984 -0.037074  0.066869 -0.093242 -0.048560 -0.171724   

       Dim6      Dim7  ...    Dim290    Dim291    Dim292    Dim293    Dim294  \
0 -0.164953 -0.063671  ... -0.244645 -0.184948  0.009335 -0.022688 -0.038492   
1 -0.197835 -0.015570  ... -0.116167 -0.195352  0.089020  0.008323  0.166662   
2 -0.092735 -0.079893  ... -0.355593 -0.281888 -0.068786  0.055006  0.080515   
3 -0.025491 -0.128688  ... -0.058705 -0.391982 -0.078338 -0.022778  0.211467   
4  0.105862  0.108491  ... -0.251308 -0.340048 -0.024320  0.148964  0.050160   

     Dim295    Dim296    Dim297 

Part 4: doc2vec

In [None]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [None]:
source_file = "posts_first_targil.xlsx"

# Load source Excel file
df = pd.read_excel(source_file, sheet_name=None)
if "J-P" in df:
    df["J-P"].rename(columns={"Body": "Body Text"}, inplace=True)

# Preprocessing function
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r"\d+", "", text)  # Remove digits
    text = re.sub(r"[^\w\s]", "", text)  # Remove punctuation
    return word_tokenize(text)  # Tokenize the text

# Prepare TaggedDocuments
tagged_documents = []
for sheet_name, data in df.items():
    for index, row in data.iterrows():
        if sheet_name == "A-J":
            combined_text = " ".join(str(row[col]) for col in ['title', 'sub_title', 'Body Text'] if pd.notna(row[col]))
        else:
            combined_text = " ".join(str(row[col]) for col in ['title', 'Body Text'] if pd.notna(row[col]))

        tokens = preprocess_text(combined_text)
        tagged_documents.append(TaggedDocument(words=tokens, tags=[f"{sheet_name}_{index}"]))

# Train Doc2Vec model
model = Doc2Vec(vector_size=300, min_count=2, epochs=40, workers=4)
model.build_vocab(tagged_documents)
model.train(tagged_documents, total_examples=model.corpus_count, epochs=model.epochs)

# Save document vectors to CSV
output_file = "output_files/doc2vec_vectors.csv"
header = "Sheet,RowIndex," + ",".join([f"Dim{i}" for i in range(model.vector_size)])
with open(output_file, "w", encoding="utf-8") as file:
    file.write(header + "\n")
    for doc_id, doc in enumerate(tagged_documents):
        # Extract sheet name and row index from doc.tags[0]
        sheet, row_index = doc.tags[0].split("_")
        vector = model.dv[doc.tags[0]].tolist()
        file.write(f"{sheet},{row_index}," + ",".join(map(str, vector)) + "\n")

print(f"Document vectors with RowIndex saved to {output_file}")

In [None]:
import pandas as pd
output_file = "output_files/doc2vec_vectors.csv"  # Replace with your file path

try:
    # Read the first 10 rows of the CSV file
    data = pd.read_csv(output_file)
    print(data.head())
except FileNotFoundError:
    print(f"File '{output_file}' not found. Please check the file path.")

  Sheet  RowIndex      Dim0      Dim1      Dim2      Dim3      Dim4      Dim5  \
0   A-J         0 -0.120381  0.379345  0.134694 -0.177616 -0.337673  0.041946   
1   A-J         1 -0.067182  0.138951  0.090573  0.097399  0.135496 -0.669350   
2   A-J         2  0.025135  0.409332  0.405838  0.012679 -0.301956 -0.096406   
3   A-J         3  0.036234  0.189337  0.372502  0.030741  0.225192 -0.578521   
4   A-J         4 -0.228523  0.438873  0.277208  0.343170 -0.130110 -0.245843   

       Dim6      Dim7  ...    Dim290    Dim291    Dim292    Dim293    Dim294  \
0  0.782558  0.720419  ...  0.071085  0.264771  0.607232  0.047377  0.579811   
1  0.080852  0.947449  ... -0.025642  0.406675  0.322913 -0.005030  0.661448   
2  0.210589  0.551659  ...  0.046759  0.205108  0.021800  0.378580  0.208742   
3  0.198960  0.444680  ... -0.048156  0.490379  0.326162 -0.107866  0.724978   
4  0.018259  0.585959  ...  0.031302  0.473778  0.751093 -0.053052  0.747913   

     Dim295    Dim296    Dim297 

Part 5: BERT

In [None]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel
from nltk.tokenize import word_tokenize
import nltk
nltk.download("punkt")

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Daniel\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
source_file = "posts_first_targil.xlsx"
output_file = "output_files/bert_vectors.csv"

# Load source documents
df = pd.read_excel(source_file, sheet_name=None)
if "J-P" in df:
    df["J-P"].rename(columns={"Body": "Body Text"}, inplace=True)

# Load pre-trained BERT model and tokenizer
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

# Function to preprocess text
def preprocess_text(text):
    text = text.lower()
    return word_tokenize(text)  # Simple tokenization for this example

# Function to generate BERT vectors
def get_bert_vector(text):
    # Tokenize and encode
    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True, padding="max_length")
    with torch.no_grad():  # Disable gradient computation
        outputs = model(**inputs)
    # Use the CLS token as the document vector
    cls_embedding = outputs.last_hidden_state[0, 0, :].numpy()
    return cls_embedding


results = []
for sheet_name, data in df.items():
    for index, row in data.iterrows():
        if sheet_name == "A-J":
             combined_text = " ".join(str(row[col]) for col in ['title', 'sub_title', 'Body Text'] if pd.notna(row[col]))
        else:
             combined_text = " ".join(str(row[col]) for col in ['title', 'Body Text'] if pd.notna(row[col]))

        # Preprocess and generate vector
        bert_vector = get_bert_vector(combined_text)
        # Add RowIndex column
        results.append([sheet_name, index] + bert_vector.tolist())

# Save vectors to CSV
header = ["Sheet", "RowIndex"] + [f"Dim{i}" for i in range(bert_vector.shape[0])]
with open(output_file, "w", encoding="utf-8") as file:
    file.write(",".join(header) + "\n")
    for row in results:
        file.write(",".join(map(str, row)) + "\n")

print(f"BERT vectors with RowIndex saved to {output_file}")


BERT vectors with RowIndex saved to bert_vectors.csv


In [None]:
import pandas as pd
output_file = "output_files/bert_vectors.csv"  # Replace with your file path

try:
    data = pd.read_csv(output_file)
    print(data.head())
except FileNotFoundError:
    print(f"File '{output_file}' not found. Please check the file path.")

  Sheet  RowIndex      Dim0      Dim1      Dim2      Dim3      Dim4      Dim5  \
0   A-J         0 -1.104568 -0.098170  0.124692 -0.288394 -0.783695  0.428429   
1   A-J         1 -0.534721  0.003136 -0.760571 -0.101115 -0.659938  0.141459   
2   A-J         2 -0.405472  0.057184 -0.284028 -0.544374 -0.807316 -0.168701   
3   A-J         3 -0.529135 -0.265460 -0.528176  0.095607 -0.412520  0.068867   
4   A-J         4 -0.370012 -0.037674 -0.243967 -0.144469 -0.041479  0.415056   

       Dim6      Dim7  ...    Dim758    Dim759    Dim760    Dim761    Dim762  \
0  0.633064  0.763848  ...  0.404507  0.029734  0.554607 -0.370775  0.848014   
1  0.375135  0.971600  ... -0.155197  0.185688  0.345379 -0.207762  0.251543   
2  0.673444  1.003882  ...  0.325744  0.410743  0.708710  0.052671  0.147199   
3  0.141250  0.351869  ...  0.651753 -0.375015  0.744730 -0.014957  0.029713   
4  0.162453  0.236116  ...  0.242417 -0.252672  0.484341 -0.148073  0.382533   

     Dim763    Dim764    Dim765 

Part 6: Ssentence_BERT


In [45]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from nltk.tokenize import sent_tokenize
import nltk

  from tqdm.autonotebook import tqdm, trange


In [52]:
# Download NLTK resources if needed
nltk.download("punkt")

# File path to source documents
source_file = "/content/input/update_posts_first_targil.xlsx"
output_file = "/content/output/sbert_vectors.csv"

# Load source documents
df = pd.read_excel(source_file, sheet_name=None)
if "J-P" in df:
    df["J-P"].rename(columns={"Body": "Body Text"}, inplace=True)

# Load pre-trained SBERT model
model = SentenceTransformer("all-MiniLM-L6-v2")

results = []

# Function to process each document and generate a document vector
def get_document_vector(text):
    # Split the text into sentences
    sentences = sent_tokenize(text)
    # Generate sentence vectors using SBERT
    sentence_vectors = model.encode(sentences)
    # Average the sentence vectors to get the document vector
    document_vector = sentence_vectors.mean(axis=0)
    # Normalize the vector by dividing by the number of sentences
    document_vector /= len(sentences)
    return document_vector


for sheet_name, data in df.items():
    for index, row in data.iterrows():
        if sheet_name == "A-J":
             combined_text = " ".join(str(row[col]) for col in ['title', 'sub_title', 'Body Text'] if pd.notna(row[col]))
        else:
             combined_text = " ".join(str(row[col]) for col in ['title', 'Body Text'] if pd.notna(row[col]))
        # Generate SBERT vector
        document_vector = get_document_vector(combined_text)
        # Append results
        results.append([sheet_name, index] + document_vector.tolist())

# Save results to CSV
header = ["Sheet", "RowIndex"] + [f"Dim{i}" for i in range(document_vector.shape[0])]

with open(output_file, "w", encoding="utf-8") as file:
    file.write(",".join(header) + "\n")
    for row in results:
        file.write(",".join(map(str, row)) + "\n")

print(f"SBERT vectors saved to {output_file}")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


KeyboardInterrupt: 

In [None]:
import pandas as pd
output_file = "output_files/bert_vectors.csv"  # Replace with your file path

try:
    data = pd.read_csv(output_file)
    print(data.head())
except FileNotFoundError:
    print(f"File '{output_file}' not found. Please check the file path.")

  Sheet  RowIndex      Dim0      Dim1      Dim2      Dim3      Dim4      Dim5  \
0   A-J         0 -1.104568 -0.098170  0.124692 -0.288394 -0.783695  0.428429   
1   A-J         1 -0.534721  0.003136 -0.760571 -0.101115 -0.659938  0.141459   
2   A-J         2 -0.405472  0.057184 -0.284028 -0.544374 -0.807316 -0.168701   
3   A-J         3 -0.529135 -0.265460 -0.528176  0.095607 -0.412520  0.068867   
4   A-J         4 -0.370012 -0.037674 -0.243967 -0.144469 -0.041479  0.415056   

       Dim6      Dim7  ...    Dim758    Dim759    Dim760    Dim761    Dim762  \
0  0.633064  0.763848  ...  0.404507  0.029734  0.554607 -0.370775  0.848014   
1  0.375135  0.971600  ... -0.155197  0.185688  0.345379 -0.207762  0.251543   
2  0.673444  1.003882  ...  0.325744  0.410743  0.708710  0.052671  0.147199   
3  0.141250  0.351869  ...  0.651753 -0.375015  0.744730 -0.014957  0.029713   
4  0.162453  0.236116  ...  0.242417 -0.252672  0.484341 -0.148073  0.382533   

     Dim763    Dim764    Dim765 

New Word2Vec

In [50]:
import pandas as pd

# Load the Excel file
input_file = "/content/input/posts_first_targil.xlsx"
df = pd.read_excel(input_file, sheet_name=None)  # Read all sheets into a dictionary

# Iterate through each sheet
for sheet_name, data in df.items():
    # Check if 'title' column exists and modify it
    if 'title' in data.columns:
        # Add a period to the end of each title if it doesn't already end with one
        data['title'] = data['title'].apply(lambda x: str(x).strip() + '.' if isinstance(x, str) and not x.endswith('.') else x)

# Save the updated file
output_file = "/content/input/update_posts_first_targil.xlsx"
with pd.ExcelWriter(output_file) as writer:
    for sheet_name, data in df.items():
        data.to_excel(writer, sheet_name=sheet_name, index=False)

print(f"Updated file saved to {output_file}")


Updated file saved to /content/input/update_posts_first_targil.xlsx


In [31]:
import pandas as pd
import string
import re
import gensim.downloader as api
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict
import csv
import nltk

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [44]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('stopwords')

input_file = "/content/output/lemma_file.xlsx"
output_file = "/content/output/w2v_lemma_withIDF_withoutStopWords.csv"

df = pd.read_excel(input_file, sheet_name=None)

if "J-P" in df:
    df["J-P"].rename(columns={"Body": "Body Text"}, inplace=True)

try:
     word2vec_model = api.load("word2vec-google-news-300")  # 300-dimensional GloVe vectors
except Exception as e:
    print(f"Error loading model: {e}")

# Preprocessing function
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"\d+", "", text)  # Remove digits and dates
    text = text.translate(str.maketrans("", "", string.punctuation))  # Remove punctuation
    tokens = word_tokenize(text)  # Tokenize the text
    stop_words = set(stopwords.words("english"))
    return [word for word in tokens if word not in stop_words]

def calculate_idf(corpus):
    vectorizer = TfidfVectorizer(use_idf=True, stop_words="english")
    vectorizer.fit(corpus)
    idf_dict = defaultdict(lambda: 0)
    for word, idf in zip(vectorizer.get_feature_names_out(), vectorizer.idf_):
        idf_dict[word] = idf
    return idf_dict

corpus = []
for sheet_name, data in df.items():
    for index, row in data.iterrows():
        if sheet_name == 'A-J':
            combined_text = " ".join(str(row[col]) for col in ['title', 'sub_title', 'Body Text'] if pd.notna(row[col]))
        else:
            combined_text = " ".join(str(row[col]) for col in ['title', 'Body Text'] if pd.notna(row[col]))
        corpus.append(combined_text)

idf_dict = calculate_idf(corpus)


results = []
for sheet_name, data in df.items():
    for index, row in data.iterrows():
        # Combine text from relevant columns
        if sheet_name == 'A-J':
            combined_text = " ".join(str(row[col]) for col in ['title', 'sub_title', 'Body Text'] if pd.notna(row[col]))
        else:
            combined_text = " ".join(str(row[col]) for col in ['title', 'Body Text'] if pd.notna(row[col]))

        # Preprocess text and get tokens
        tokens = preprocess_text(combined_text)
        vectors = []
        for word in tokens:
            if word in word2vec_model:
                vector = word2vec_model[word]
                idf_value = idf_dict[word]
                vectors.append(vector * idf_value)

        # If there are word vectors for the document, compute the average
        if vectors:
            avg_vector = np.mean(vectors, axis=0)
            results.append([sheet_name, index] + avg_vector.tolist())

# Save results to a CSV file
header = ["Sheet", "RowIndex"] + [f"Dim{i}" for i in range(word2vec_model.vector_size)]
with open(output_file, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(header)
    writer.writerows(results)



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
output_file = "output_files/new_word2vec_mean_vectors.csv"

try:
    data = pd.read_csv(output_file)
    print(data.head())
except FileNotFoundError:
    print(f"File '{output_file}' not found. Please check the file path.")

  Sheet  RowIndex      Dim0      Dim1      Dim2      Dim3      Dim4      Dim5  \
0   A-J         0 -0.000125 -0.333324 -0.341119 -0.585946  0.073495  0.275870   
1   A-J         1  0.361846 -0.050101 -0.036313  0.231563  0.349663  0.204377   
2   A-J         2  0.318587  0.066093  0.150641  0.077817 -0.115447  0.012682   
3   A-J         3  0.048476  0.098860 -0.045120 -0.251414  0.151417 -0.714273   
4   A-J         4 -0.546905 -0.067917  0.356877 -0.676202 -0.028615 -0.581645   

       Dim6      Dim7  ...    Dim290    Dim291    Dim292    Dim293    Dim294  \
0 -0.677872 -0.406010  ... -0.521829 -0.597251  0.289087  0.014901 -0.410971   
1 -0.375672 -0.355330  ... -0.123543 -0.379267  0.451864  0.115989  0.448655   
2 -0.127426 -0.112411  ... -0.915440 -0.545861 -0.277455 -0.059635  0.272248   
3 -0.400813 -0.723303  ... -0.049962 -1.223554 -0.251819 -0.093594  0.839417   
4  0.192781  0.093955  ... -0.704663 -0.861301  0.056560  0.265106  0.278741   

     Dim295    Dim296    Dim297 

New BERT

In [None]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict
import nltk
nltk.download("punkt")


In [None]:
source_file = "posts_first_targil.xlsx"
output_file = "output_files/new_bert_vectors.csv"

df = pd.read_excel(source_file, sheet_name=None)
if "J-P" in df:
    df["J-P"].rename(columns={"Body": "Body Text"}, inplace=True)

# Load pre-trained BERT model and tokenizer
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

def calculate_idf(corpus):
    vectorizer = TfidfVectorizer(use_idf=True, stop_words="english")
    vectorizer.fit(corpus)
    idf_dict = defaultdict(lambda: 0)
    for word, idf in zip(vectorizer.get_feature_names_out(), vectorizer.idf_):
        idf_dict[word] = idf
    return idf_dict

corpus = []
for sheet_name, data in df.items():
    for index, row in data.iterrows():
        if sheet_name == 'A-J':
            combined_text = " ".join(str(row[col]) for col in ['title', 'sub_title', 'Body Text'] if pd.notna(row[col]))
        else:
            combined_text = " ".join(str(row[col]) for col in ['title', 'Body Text'] if pd.notna(row[col]))
        corpus.append(combined_text)

idf_dict = calculate_idf(corpus)


def get_bert_vectors(text_chunk):
    inputs = tokenizer(text_chunk, return_tensors="pt", max_length=512, truncation=True, padding="max_length")
    with torch.no_grad():
        outputs = model(**inputs)
    token_embeddings = outputs.last_hidden_state.squeeze(0)  # Shape: [sequence_length, hidden_size]
    attention_mask = inputs["attention_mask"].squeeze(0)  # Shape: [sequence_length]
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"].squeeze(0))  # List of tokens

    return tokens, token_embeddings, attention_mask

# Function to process subwords into full word embeddings
def process_tokens(tokens, token_embeddings, attention_mask, idf_dict):
    word_embeddings = []
    current_word = ""
    current_word_vectors = []

    for token, embedding, mask in zip(tokens, token_embeddings, attention_mask):
        if mask == 0 or token in ["[CLS]", "[SEP]", "[PAD]"]:
            continue

        if token.startswith("##"):  # Subword continuation
            current_word += token[2:]
            current_word_vectors.append(embedding)
        else:  # New word starts
            if current_word:  # Combine previous word embeddings
                combined_embedding = torch.mean(torch.stack(current_word_vectors), dim=0)
                idf = idf_dict.get(current_word, 1.0)  # Default IDF to 1.0 if not found
                word_embeddings.append(combined_embedding * idf)

            # Start new word
            current_word = token
            current_word_vectors = [embedding]

    # Process the last word
    if current_word:
        combined_embedding = torch.mean(torch.stack(current_word_vectors), dim=0)
        idf = idf_dict.get(current_word, 1.0)
        word_embeddings.append(combined_embedding * idf)

    return word_embeddings
# Function to process an entire document
def process_document(text, idf_dict):
    tokens = tokenizer.tokenize(text)
    max_tokens = 512
    num_chunks = (len(tokens) + max_tokens - 1) // max_tokens  # Ceiling division
    all_word_embeddings = []

    for i in range(num_chunks):
        chunk_tokens = tokens[i * max_tokens : (i + 1) * max_tokens]
        chunk_text = tokenizer.convert_tokens_to_string(chunk_tokens)
        tokens, embeddings, attention_mask = get_bert_vectors(chunk_text)
        chunk_word_embeddings = process_tokens(tokens, embeddings, attention_mask, idf_dict)
        all_word_embeddings.extend(chunk_word_embeddings)

    # Aggregate all word embeddings for the document (e.g., by mean or sum)
    document_embedding = torch.mean(torch.stack(all_word_embeddings), dim=0)
    return document_embedding


results = []
for sheet_name, data in df.items():
    for index, row in data.iterrows():
        if sheet_name == "A-J":
            combined_text = " ".join(str(row[col]) for col in ['title', 'sub_title', 'Body Text'] if pd.notna(row[col]))
        else:
            combined_text = " ".join(str(row[col]) for col in ['title', 'Body Text'] if pd.notna(row[col]))

        # Generate BERT vectors for the document
        bert_vector = process_document(combined_text, idf_dict)
        vector_list = bert_vector.tolist()
        results.append([sheet_name, index] + vector_list)
        print(vector_list)



# Save vectors to CSV
header = ["Sheet", "RowIndex"] + [f"Dim{i}" for i in range(bert_vector.shape[0])]
with open(output_file, "w", encoding="utf-8") as file:
    file.write(",".join(header) + "\n")
    for row in results:
        file.write(",".join(map(str, row)) + "\n")

print(f"BERT vectors with RowIndex saved to {output_file}")

In [None]:
output_file = "output_files/new_bert_vectors.csv"

try:
    data = pd.read_csv(output_file)
    print(data.head())
except FileNotFoundError:
    print(f"File '{output_file}' not found. Please check the file path.")

  Sheet  RowIndex      Dim0      Dim1      Dim2      Dim3      Dim4      Dim5  \
0   A-J         0 -0.788050  0.056614  1.110174 -1.022818  0.147041 -0.474798   
1   A-J         1 -0.003478  0.293304 -1.017949 -0.549554  0.197699 -0.491028   
2   A-J         2 -0.168909  0.331912  0.046262 -0.966383 -0.039441 -1.237246   
3   A-J         3 -0.188200 -0.066150  0.061154 -0.881887  0.318995 -0.174752   
4   A-J         4  0.367581  0.150649 -0.167966 -0.829529  0.578362 -0.619802   

       Dim6      Dim7  ...    Dim758    Dim759    Dim760    Dim761    Dim762  \
0  0.684777  1.736403  ...  0.441260 -0.295122  1.369656 -0.888012  1.068354   
1 -0.230678  0.669581  ...  0.126593  0.276906  0.418110 -0.073043  0.532075   
2  0.228654  1.053331  ...  0.301428  0.401214  0.752306 -0.367575  0.183579   
3 -0.718530  1.247533  ...  0.775314  0.303543  0.816760 -0.345624 -0.053506   
4 -0.128312  0.784007  ...  0.429937  0.090282  0.748448 -0.224150  0.255825   

     Dim763    Dim764    Dim765 