What we need to do:
1) Go through each newspaper, create a corpus for it, and display the length of each corpus
2) Either a) find a Wikipedia dataset and train word2vec on it, so that non-context-related words like "bad",
"victim", "good", "attacker" can be compared to the representation of each side of the conflict in the newspaper corpora, or b) find a
pretrained word2vec model to do the same
3) Find a way to classify each article as pro-Israel or pro-Palestine

### Access and preprocess our data

In [None]:
import json

# Load the extracted news data from disk
with open('data/news-data-extracted.json', 'r') as file:
    data = json.load(file)

# "cnn.com" is the key to a value which is a list of dictionaries (one per article);
# take the first article's dictionary, pull out its "text" entry, and print the whole
# dictionary for inspection
first_article_data = data["cnn.com"][0]
first_article = first_article_data["text"]
print(first_article_data)

In [None]:
# Peek at the first element of the article text
# (presumably the first character, if "text" holds a string — TODO confirm)
first_article[0]

In [None]:
from gensim.utils import tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

# tokenizes article into sentences, which are also tokenized into words
def tokenize_article(article):
    """Split an article into sentences and each sentence into word tokens.

    Args:
        article: Raw article text as a single string.

    Returns:
        A list with one entry per sentence, in article order. Each entry is a
        list of the tokens that ``gensim.utils.tokenize`` produces for that
        sentence.
    """
    # Divide the article into sentences
    sentences = sent_tokenize(article, language="english")

    # ``tokenize`` yields its tokens lazily; materialize each sentence so the
    # result can be printed, indexed, and iterated more than once.
    return [list(tokenize(sentence)) for sentence in sentences]

# makes each word lowercase
def lowercase(tokenized_article):
    """Return a lowercased copy of a tokenized article.

    Args:
        tokenized_article: Iterable of sentences, each an iterable of string
            tokens.

    Returns:
        A new list of sentences (lists) in which every token has been passed
        through ``str.lower``. The input is not modified.
    """
    return [
        [token.lower() for token in sentence]
        for sentence in tokenized_article
    ]

# English stop words from NLTK, stored as a set for fast membership tests in remove_stopwords
stop_words = set(stopwords.words("english"))

def remove_stopwords(tokenized_article):
    """Drop every token found in the module-level ``stop_words`` set.

    The article is modified in place: each sentence is replaced by a new list
    holding only the tokens that are not stop words.

    Args:
        tokenized_article: List of sentences, each an iterable of tokens.

    Returns:
        The same ``tokenized_article`` list that was passed in.
    """
    for position, sentence in enumerate(tokenized_article):
        # Swap the sentence for its filtered version at the same position
        tokenized_article[position] = [
            token for token in sentence if token not in stop_words
        ]
    return tokenized_article

def lammetization(tokenized_article):
    """Lemmatize every token of a tokenized article.

    Args:
        tokenized_article: Iterable of sentences, each an iterable of tokens.

    Returns:
        A new list of sentences in which every token has been replaced by the
        result of ``WordNetLemmatizer.lemmatize`` called with its default
        arguments.
    """
    lemmatizer = WordNetLemmatizer()
    return [
        [lemmatizer.lemmatize(token) for token in sentence]
        for sentence in tokenized_article
    ]


def remove_punctuation(tokenized_article):
    """Strip non-word characters from every token and drop emptied tokens.

    Args:
        tokenized_article: Iterable of sentences, each an iterable of string
            tokens.

    Returns:
        A new list of sentences. Within each token every run of characters
        not matched by ``\\w`` is deleted; tokens that end up empty are
        omitted. A sentence left without tokens is kept as an empty list.
    """
    cleaned_article = []

    for sentence in tokenized_article:
        # Delete every run of non-word characters inside each token
        stripped_tokens = (re.sub(r"[^\w]+", "", token) for token in sentence)
        # Keep only the tokens that still have content
        cleaned_article.append([token for token in stripped_tokens if token])

    return cleaned_article

def preprocess_article(article):
    """Run the full preprocessing pipeline on one article.

    The article is tokenized into sentences of words, and the result is then
    passed through lowercasing, stop-word removal, lemmatization and
    punctuation removal, in that order.

    Args:
        article: Raw article text.

    Returns:
        The output of the last pipeline step (``remove_punctuation``).
    """
    processed = tokenize_article(article)
    # Each step takes the previous step's output as its only argument
    for step in (lowercase, remove_stopwords, lammetization, remove_punctuation):
        processed = step(processed)
    return processed

In [None]:
# Show the stop words three ways: the module-level set, NLTK's raw return value,
# and a set freshly built from that return value
print(stop_words)
print(stopwords.words("english"))
print(set(stopwords.words("english")))

In [None]:
# Preprocess the first article and show the result
preprocessed = preprocess_article(first_article)
print(preprocessed)

# Sanity check: report any stop word that is still present after preprocessing.
# `preprocessed` is a list of sentences (lists of tokens), so the tokens must be
# flattened first; testing `word in preprocessed` would compare a string against
# whole sentence lists and never match.
remaining_tokens = {token for sentence in preprocessed for token in sentence}
for word in stopwords.words("english"):
    if word in remaining_tokens:
        print(word)

Now create a function that preprocesses a newspaper

In [None]:
def preprocess_newspaper(newspaper, newspaper_name, newspaper_dict):
    """Preprocess every article of a newspaper and store the result.

    Any existing entry for the newspaper is replaced by a fresh list. The
    preprocessed output of each article is appended to that list with
    ``extend``, so the stored value is one flat list across all articles.
    A progress line is printed after each article.

    Args:
        newspaper: Iterable of article dictionaries, each with a 'text' entry.
        newspaper_name: Name used (formatted as a string) as the dictionary key.
        newspaper_dict: Dictionary that receives the result; modified in place.

    Returns:
        The same ``newspaper_dict`` that was passed in.
    """
    key = f"{newspaper_name}"
    collected = []
    newspaper_dict[key] = collected

    for i, article_data in enumerate(newspaper):
        collected.extend(preprocess_article(article_data['text']))
        print(f"{newspaper_name}: article {i} preprocessed")

    return newspaper_dict

In [None]:
# Let's try with CNN: preprocess all of its articles into a fresh dictionary
# and print the resulting dictionary
newspaper = data["cnn.com"]
newspaper_dict = {}
newspaper_dict = preprocess_newspaper(newspaper, "cnn.com", newspaper_dict)
print(newspaper_dict)

### Save and load preprocessed newspapers

In [None]:
import json
import os

def save_newspaper_dict(newspaper_dict):
    """Merge preprocessed newspapers into the JSON file on disk.

    If ``preprocessed_newspaper_articles.json`` already exists, its content is
    loaded and every newspaper from ``newspaper_dict`` that is not yet stored
    is added; newspapers already in the file keep their stored value. If the
    file does not exist, ``newspaper_dict`` is written as is.

    Args:
        newspaper_dict: Mapping of newspaper name to its preprocessed data.
            Must be JSON-serializable.

    Returns:
        None. The result is written to the JSON file.
    """
    # File path for the JSON file
    file_path = "preprocessed_newspaper_articles.json"

    # Step 1: Load existing data if the file exists, otherwise start from the given dictionary
    if os.path.exists(file_path):
        with open(file_path, "r") as json_file:
            data = json.load(json_file)

        # Step 2: Add only the newspapers that are not stored yet.
        # Iterate over items() to get (key, value) pairs, and store under the
        # actual key rather than the literal string "key".
        for key, value in newspaper_dict.items():
            if key not in data:
                data[key] = value
    else:
        data = newspaper_dict

    # Step 3: Write the updated data back to the file
    with open(file_path, "w") as json_file:
        json.dump(data, json_file, indent=4)


In [None]:
# Persist the preprocessed newspapers to preprocessed_newspaper_articles.json
save_newspaper_dict(newspaper_dict)

In [None]:
import json
# Read the saved preprocessed newspapers back from disk
with open("preprocessed_newspaper_articles.json", "r") as json_file:
    loaded_newspaper_dict = json.load(json_file)

# Show what was loaded
print(loaded_newspaper_dict)

### Train word2vec on cnn.com

In [None]:
from gensim.models import Word2Vec

# Prepare sentences for Word2Vec: the stored CNN entry is one flat list of
# tokenized sentences collected from all preprocessed CNN articles
sentences = loaded_newspaper_dict["cnn.com"]
print(sentences)

# Configure the model without handing it the corpus. Passing `sentences` to the
# constructor would already build the vocabulary and train the model, so a
# following train() call would train it a second time.
model = Word2Vec(vector_size=300, window=5, min_count=10, sg=1, workers=4, negative=20)

# Build the vocabulary from the corpus, then train exactly once for 20 epochs
model.build_vocab(sentences)
model.train(sentences, total_examples=model.corpus_count, epochs=20)

In [None]:
# model.save("cnn_w2v.model")
# Save just the word vectors (not the full model) in the word2vec text format
model.wv.save_word2vec_format("cnn_w2v_vectors.txt", binary=False)

# Save the same vectors in the word2vec binary format:
model.wv.save_word2vec_format("cnn_w2v_vectors.bin", binary=True)


### Load the model

In [None]:
from gensim.models import KeyedVectors

# Load the word vectors from the text-format file written by the save cell
# ("cnn_w2v_vectors.txt"); the previous name "word2vec_vectors.txt" is not
# produced anywhere in this notebook
word_vectors = KeyedVectors.load_word2vec_format("cnn_w2v_vectors.txt", binary=False)

In [None]:
# NOTE: every query below uses the in-memory `model`, not the `word_vectors`
# object loaded from disk.
# NOTE(review): the model is trained with min_count=10, so a word that occurs
# fewer times is presumably absent and the lookup would fail — confirm.

# Get the vector for a word (the value is not used further in this cell)
vector = model.wv["idf"]

# Find the words most similar to "genocide"
similar_words = model.wv.most_similar("genocide")
print(similar_words)

# Calculate similarity between "hamas" and "terrorist"
similarity = model.wv.similarity("hamas", "terrorist")
print(f"Similarity between 'hamas' and 'terrorist': {similarity}")

# Calculate similarity between "idf" and "terrorist"
similarity = model.wv.similarity("idf", "terrorist")
print(f"Similarity between 'idf' and 'terrorist': {similarity}")