What we need to do: 
1) Go through each newspaper, build a corpus for it, and display the size of each corpus
2) Either a) find a Wikipedia dataset and train word2vec on it, to compare non-context-related words like "bad", 
"victim", "good", "attacker" with the representation of each side of the conflict in the newspaper corpora, or b) find a 
pretrained word2vec model to do the same
3) Find a way to classify each article as pro-Israel or pro-Palestine

### Access and preprocess our data

In [None]:
import json

# Open and read the JSON file.
# JSON is UTF-8 by specification; passing the encoding explicitly avoids
# decoding with the platform's locale default (e.g. cp1252 on Windows).
with open('data/news-data-extracted.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# "cnn.com" is the key to a list of article dictionaries; take the first
# dictionary (article) of that list and pull out its text.
first_article_data = data["cnn.com"][0]
first_article = first_article_data["text"]
print(first_article_data)

In [None]:
# Peek at the first element of the article text (its first character, assuming "text" is a string).
first_article[0]

In [None]:
from gensim.utils import tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

# tokenizes article into sentences, which are also tokenized into words
def tokenize_article(article):
    """Split an article into sentences and each sentence into word tokens.

    Args:
        article: Raw article text as a single string.

    Returns:
        A list with one entry per sentence; each entry is a list of the
        word tokens of that sentence.
    """
    # gensim's ``tokenize`` yields tokens lazily; materialise each sentence
    # so the result can be printed, measured and iterated more than once.
    return [
        list(tokenize(sentence))  # divide sentences into words
        for sentence in sent_tokenize(article, language="english")
    ]

# makes each word lowercase
def lowercase(tokenized_article):
    """Return the tokenised article with every word lower-cased.

    The input is a sequence of sentences, each an iterable of word strings.
    A new nested list is built; the sentence structure is kept as is.
    """
    return [[token.lower() for token in sentence] for sentence in tokenized_article]

# English stopwords as a set, built once at module level; remove_stopwords tests membership against it.
stop_words = set(stopwords.words("english"))

def remove_stopwords(tokenized_article):
    """Drop every word found in ``stop_words`` from each sentence.

    The sentences are replaced in place inside ``tokenized_article`` and the
    same (now filtered) list is returned.
    """
    for position, sentence in enumerate(tokenized_article):
        # Swap the sentence for a filtered copy at the same index.
        tokenized_article[position] = [
            token for token in sentence if token not in stop_words
        ]
    return tokenized_article

def lammetization(tokenized_article):
    """Lemmatize every word of a tokenised article.

    Each word is passed to ``WordNetLemmatizer.lemmatize`` with its default
    arguments.  Returns a new list of sentences, each a list of lemmas.
    """
    lemmatize = WordNetLemmatizer().lemmatize
    return [[lemmatize(token) for token in sentence] for sentence in tokenized_article]


def remove_punctuation(tokenized_article):
    """Strip non-word characters from every token and drop emptied tokens.

    Every run of characters outside the regex word class is deleted from
    inside each word; a word that ends up empty is left out of its sentence.
    Sentences themselves are always kept, even when they become empty.
    """
    cleaned_article = []

    for sentence in tokenized_article:
        # Deleting the matches is the same as splitting on them and re-joining.
        stripped = (re.sub(r"[^\w]+", "", token) for token in sentence)
        cleaned_article.append([token for token in stripped if token])

    return cleaned_article

def preprocess_article(article):
    """Run the full preprocessing pipeline on one raw article.

    Stages, in order: sentence/word tokenisation, lower-casing, stopword
    removal, lemmatization, punctuation removal.  Returns the list of
    processed sentences produced by the last stage.
    """
    processed = tokenize_article(article)
    for stage in (lowercase, remove_stopwords, lammetization, remove_punctuation):
        processed = stage(processed)
    return processed

In [None]:
# Inspect the stopwords in three forms: the cached set, the raw NLTK list, and a freshly built set.
print(stop_words)
print(stopwords.words("english"))
print(set(stopwords.words("english")))

In [None]:
preprocessed = preprocess_article(first_article)
print(preprocessed)

# Sanity check: print any stopword that survived preprocessing.
# ``preprocessed`` is a list of sentences (lists of words), so the words have
# to be flattened first -- testing ``word in preprocessed`` would compare each
# stopword against whole sentences and never match.
surviving_words = {word for sentence in preprocessed for word in sentence}
for word in stopwords.words("english"):
    if word in surviving_words:
        print(word)

Now create a function that preprocesses a newspaper

In [None]:
def create_article_list(extracted_file, newspaper_name):
    """Return the raw text of every article of one newspaper.

    Args:
        extracted_file: Path to the JSON file of extracted news.  Its top
            level maps a newspaper name to a list of article dictionaries.
        newspaper_name: Key of the newspaper to read, e.g. ``"cnn.com"``.

    Returns:
        A list with the ``"text"`` value of each article, as strings, in
        file order.

    Raises:
        KeyError: If the newspaper, or an article's ``"text"`` key, is missing.
    """
    import json

    with open(extracted_file, "r", encoding="utf-8") as json_file:
        data = json.load(json_file)

    # ``str`` keeps the original coercion of the text to a string without
    # nesting double quotes inside an f-string (a syntax error before 3.12).
    # The list is returned (not just printed) because callers pass it
    # straight on to preprocess_newspaper.
    return [str(article["text"]) for article in data[newspaper_name]]

In [None]:
def preprocess_newspaper(article_list):
    """Preprocess every article and pool the sentences into one flat list.

    Each article goes through ``preprocess_article``; the resulting
    sentences (lists of words) of all articles are concatenated, so article
    boundaries are not kept.  A progress line is printed after each article.
    """
    all_sentences = []

    for i, article in enumerate(article_list):
        all_sentences += preprocess_article(article)
        print(f"article {i} preprocessed")

    return all_sentences

In [None]:
# Let's try with CNN: extract the raw article texts, preprocess them into
# tokenised sentences, and show the result.
article_list = create_article_list("data/news-data-extracted.json", "cnn.com")
preprocessed = preprocess_newspaper(article_list)
print(preprocessed)

In [None]:
# create get_article_number, corpussize, and other helper functions

def no_of_articles(article_list):
    """Return the number of articles, i.e. the length of ``article_list``."""
    return len(article_list)

def corpus_size_before(article_list):
    """Return the summed ``len`` of all raw articles.

    For articles given as strings this is the total number of characters
    in the corpus before preprocessing.
    """
    return sum(len(article) for article in article_list)


def corpus_size_after(preprocessed_article_list):
    """Return the total number of words across all preprocessed sentences."""
    return sum(1 for sentence in preprocessed_article_list for _ in sentence)


def no_of_unique_words(preprocessed_article_list):
    """Return the vocabulary size of the preprocessed corpus.

    Args:
        preprocessed_article_list: Iterable of sentences, each an iterable
            of (hashable) words.

    Returns:
        The number of distinct words.
    """
    # A set gives O(1) membership tests; the former list-based check made
    # this quadratic in the vocabulary size.
    return len({word for sentence in preprocessed_article_list for word in sentence})

def no_of_sentences(preprocessed_article_list):
    """Return the number of sentences, i.e. the length of the preprocessed list."""
    return len(preprocessed_article_list)

In [None]:
""" Occurance Counter """

# Shared counter: every ``occurance_*`` function below is this loop with a
# different target word, so adding a new target is a one-line wrapper.
def _count_word(preprocessed_article_list, target):
    """Count the tokens equal to ``target`` across all preprocessed sentences.

    Args:
        preprocessed_article_list: Iterable of sentences, each an iterable
            of words.
        target: The word to count (exact, case-sensitive match).

    Returns:
        The number of matching tokens as an int.
    """
    return sum(
        1
        for sentence in preprocessed_article_list
        for word in sentence
        if word == target
    )


# Palestine
def occurance_palestine(preprocessed_article_list):
    """Return how many times "palestine" occurs in the preprocessed corpus."""
    return _count_word(preprocessed_article_list, "palestine")


def occurance_palestinian(preprocessed_article_list):
    """Return how many times "palestinian" occurs in the preprocessed corpus."""
    return _count_word(preprocessed_article_list, "palestinian")


def occurance_hamas(preprocessed_article_list):
    """Return how many times "hamas" occurs in the preprocessed corpus."""
    return _count_word(preprocessed_article_list, "hamas")


def occurance_sinwar(preprocessed_article_list):
    """Return how many times "sinwar" occurs in the preprocessed corpus."""
    return _count_word(preprocessed_article_list, "sinwar")


# Israel
def occurance_israel(preprocessed_article_list):
    """Return how many times "israel" occurs in the preprocessed corpus."""
    return _count_word(preprocessed_article_list, "israel")


def occurance_israeli(preprocessed_article_list):
    """Return how many times "israeli" occurs in the preprocessed corpus."""
    return _count_word(preprocessed_article_list, "israeli")


def occurance_idf(preprocessed_article_list):
    """Return how many times "idf" occurs in the preprocessed corpus."""
    return _count_word(preprocessed_article_list, "idf")


def occurance_netanyahu(preprocessed_article_list):
    """Return how many times "netanyahu" occurs in the preprocessed corpus."""
    return _count_word(preprocessed_article_list, "netanyahu")

### Training function

In [None]:
def train(newspaper_name, sentence_list):
    """Train a skip-gram Word2Vec model on one newspaper and save it to disk.

    Args:
        newspaper_name: Name used as the prefix of every output file.
        sentence_list: Preprocessed corpus: a list of sentences, each a
            list of words.

    Returns:
        A tuple ``(model_path, vectors_txt_path, vectors_bin_path)`` with the
        locations the full model and the text/binary word vectors were
        written to.
    """
    from gensim.models import Word2Vec

    model_path = f"{newspaper_name}_w2v.model"
    vectors_txt_path = f"{newspaper_name}_w2v_vectors.txt"
    vectors_bin_path = f"{newspaper_name}_w2v_vectors.bin"

    # Passing ``sentences`` makes the constructor build the vocabulary and
    # train immediately, so the 20 epochs are requested here rather than by
    # a second ``model.train`` call (which would train on the corpus twice).
    model = Word2Vec(sentences=sentence_list, vector_size=300, window=5, min_count=10, sg=1, workers=4, negative=20, epochs=20)

    # Save the full model
    model.save(model_path)

    # Save just the word vectors in text and binary format.  The paths are
    # the ones returned below; the former ``{newspaper_name/newspaper_name}``
    # expression divided two strings and raised TypeError.
    model.wv.save_word2vec_format(vectors_txt_path, binary=False)
    model.wv.save_word2vec_format(vectors_bin_path, binary=True)

    return model_path, vectors_txt_path, vectors_bin_path


### Calculate portrayal

In [None]:
def calculate_portrayal(model, target_words, positive_portrayal_words, negative_portrayal_words): # target_words and portrayal_words are lists
    """Score how positively each target word is portrayed by the model.

    A word's score starts at 0, gains ``model.wv.similarity`` to every
    positive portrayal word and loses its similarity to every negative one.
    Returns a dict mapping each target word to its score.
    """
    portrayal_scores = {}

    for word in target_words:
        score = 0
        for positive in positive_portrayal_words:
            score += model.wv.similarity(f"{word}", f"{positive}")
        for negative in negative_portrayal_words:
            score -= model.wv.similarity(f"{word}", f"{negative}")
        portrayal_scores[word] = score

    return portrayal_scores


Should I include gaza? If yes, add an occurrence function for it and add it to the target word list and the portrayal calculation.

In [None]:
# Newspaper names to process; each is used as a key into the extracted-news JSON.
newspaper_list = ["cnn.com", "WashingtonPost.com"]

def master(extracted_file, newspaper_list):
    """Preprocess, describe and model every newspaper in ``newspaper_list``.

    For each newspaper the function extracts and preprocesses its articles,
    records corpus statistics and target-word counts, trains and saves a
    Word2Vec model through ``train``, and scores how the target words of
    both sides are portrayed through ``calculate_portrayal``.

    Args:
        extracted_file: Path to the JSON file of extracted news.
        newspaper_list: Names of the newspapers to process; a name that
            appears more than once is processed only once.

    Returns:
        A dict mapping each newspaper name to a dict with the keys
        ``no_of_articles``, ``corpus_size_before_preprocess``,
        ``corpus_size``, ``no_of_unique_words``, ``no_of_sentences``,
        ``occurance_<word>`` for every target word, ``model_location``,
        ``vectors_txt_location``, ``vectors_bin_location``,
        ``portrayal_<word>`` for every target word in the model's
        vocabulary, ``articles`` (raw texts) and ``preprocessed`` (list of
        sentences, each a list of words).
    """
    from gensim.models import Word2Vec

    palestinian_words = ["palestine", "palestinian", "hamas", "sinwar"]
    israeli_words = ["israel", "israeli", "idf", "netanyahu"]

    # positive categories: general (good etc), victim,
    positive_portrayal_words = ["positive", "good", "victim", "resilient", "justified", "defenders", "innocent", "rightful", "humane"]
    # Negative words come from the "Potential Portrayal Words" note in this
    # file.  The list used to be a copy of the positive one, which made the
    # positive and negative terms cancel out.
    negative_portrayal_words = ["negative", "bad", "aggressor", "attacker", "aggressive", "brutal", "oppressive", "ruthless", "terrorist"]

    preprocessed_newspapers = {}

    for newspaper in newspaper_list:
        # Skip a newspaper that was already handled earlier in this run.
        if newspaper in preprocessed_newspapers:
            continue

        dict_newspaper = {}
        preprocessed_newspapers[newspaper] = dict_newspaper

        article_list = create_article_list(extracted_file, newspaper)
        sentence_list = preprocess_newspaper(article_list)

        # Corpus statistics
        dict_newspaper["no_of_articles"] = no_of_articles(article_list)
        dict_newspaper["corpus_size_before_preprocess"] = corpus_size_before(article_list)
        dict_newspaper["corpus_size"] = corpus_size_after(sentence_list)
        dict_newspaper["no_of_unique_words"] = no_of_unique_words(sentence_list)
        dict_newspaper["no_of_sentences"] = no_of_sentences(sentence_list)

        # Target-word frequencies
        dict_newspaper["occurance_palestine"] = occurance_palestine(sentence_list)
        dict_newspaper["occurance_palestinian"] = occurance_palestinian(sentence_list)
        dict_newspaper["occurance_hamas"] = occurance_hamas(sentence_list)
        dict_newspaper["occurance_sinwar"] = occurance_sinwar(sentence_list)
        dict_newspaper["occurance_israel"] = occurance_israel(sentence_list)
        dict_newspaper["occurance_israeli"] = occurance_israeli(sentence_list)
        dict_newspaper["occurance_idf"] = occurance_idf(sentence_list)
        dict_newspaper["occurance_netanyahu"] = occurance_netanyahu(sentence_list)

        # Train the model and remember where everything was saved.
        model_location, vectors_txt_location, vectors_bin_location = train(newspaper, sentence_list)
        dict_newspaper["model_location"] = model_location
        dict_newspaper["vectors_txt_location"] = vectors_txt_location
        dict_newspaper["vectors_bin_location"] = vectors_bin_location

        # Load the model from the path ``train`` reported (the former
        # ``{newspaper/newspaper}`` expression divided two strings).
        model = Word2Vec.load(model_location)

        # Only words the model actually learned can be compared: words below
        # the training frequency cut-off, or whose surface form changed
        # during lemmatization (e.g. plurals), are not in the vocabulary and
        # a similarity lookup for them would fail.
        vocabulary = model.wv
        target_words = [word for word in palestinian_words + israeli_words if word in vocabulary]
        positive_words = [word for word in positive_portrayal_words if word in vocabulary]
        negative_words = [word for word in negative_portrayal_words if word in vocabulary]

        portrayal_scores = calculate_portrayal(model, target_words, positive_words, negative_words)
        for word, score in portrayal_scores.items():
            # Plain floats keep the result JSON-serialisable.
            dict_newspaper[f"portrayal_{word}"] = float(score)

        dict_newspaper["articles"] = article_list
        dict_newspaper["preprocessed"] = sentence_list

    return preprocessed_newspapers


# ### Potential Portrayal Words
# Positive: positive, good, victim, humane, heroic, brave, noble, resilient, justified, courageous, victorious, liberating, righteous, defenders, innocent
# Negative: negative, bad, aggressor, attacker, aggressive, brutal, oppressive, merciless, barbaric, ruthless, massacre,
# invaders, terrorist,
# terroristic, dictatorial, destructive, illegal, corrupt, authoritarian, regressive, settler
#
# Find word frequency for these words

### Save and load preprocessed newspapers

In [None]:
import json
import os

def save_newspaper_dict(newspaper_dict, file_path="preprocessed_newspaper_articles.json"):
    """Merge ``newspaper_dict`` into the JSON file of preprocessed newspapers.

    Newspapers already present in the file are kept as they are; only names
    not yet in the file are added.  The file is created when it does not
    exist yet.

    Args:
        newspaper_dict: Mapping of newspaper name to its JSON-serialisable
            data.
        file_path: JSON file to update; defaults to
            ``preprocessed_newspaper_articles.json`` in the working directory.
    """
    # Step 1: Load existing data if the file exists, otherwise start empty.
    if os.path.exists(file_path):
        with open(file_path, "r", encoding="utf-8") as json_file:
            data = json.load(json_file)
    else:
        data = {}

    # Step 2: Add the newspapers that are not stored yet.  ``.items()``
    # yields (name, data) pairs -- iterating the dict itself yields only the
    # keys -- and each entry is stored under its own name rather than the
    # literal string "key".
    for key, value in newspaper_dict.items():
        data.setdefault(key, value)

    # Step 3: Write the updated data back to the file.
    with open(file_path, "w", encoding="utf-8") as json_file:
        json.dump(data, json_file, indent=4)


In [None]:
# NOTE(review): `newspaper_dict` is not assigned in any cell visible here -- presumably it is the
# per-newspaper result dictionary built by the pipeline; confirm it is defined before running this cell.
save_newspaper_dict(newspaper_dict)

In [None]:
import json
# Read the saved newspapers back from disk and show them.
with open("preprocessed_newspaper_articles.json", "r") as json_file:
    loaded_newspaper_dict = json.load(json_file)
    print(loaded_newspaper_dict)

Find corpus size for cnn

### Train word2vec on cnn.com

In [None]:
from gensim.models import Word2Vec

# Prepare sentences for Word2Vec
# NOTE(review): this assumes the value stored under "cnn.com" is itself the
# list of tokenised sentences -- confirm against how the file was saved.
sentences = loaded_newspaper_dict["cnn.com"] # Each newspaper's corpus is one "document"
print(sentences)

# Build the vocabulary and train the Word2Vec model in one step.  Passing
# ``sentences`` makes the constructor train immediately, so the 20 epochs are
# requested here instead of through a second ``model.train`` call (which
# would train on the corpus twice).
model = Word2Vec(sentences=sentences, vector_size=300, window=5, min_count=10, sg=1, workers=4, negative=20, epochs=20)

In [None]:
# model.save("cnn_w2v.model")
# NOTE(review): the "Load the model" cell reads "cnn_w2v.model"; with the save above
# commented out, that file only exists if an earlier run wrote it.

# Save just the word vectors in a text format
model.wv.save_word2vec_format("cnn_w2v_vectors.txt", binary=False)

# To save in binary format:
model.wv.save_word2vec_format("cnn_w2v_vectors.bin", binary=True)


### Load the model

In [None]:
from gensim.models import KeyedVectors
from gensim.models import Word2Vec

# Load the model from a file
model = Word2Vec.load("cnn_w2v.model")

# Now you can use the model: nearest neighbours of "israeli"
print(model.wv.most_similar("israeli"))

# Load the word vectors.  The name must match the file written by
# save_word2vec_format ("cnn_w2v_vectors.txt", lower-case "w"); the former
# "cnn_W2v_vectors.txt" is not found on case-sensitive filesystems.
word_vectors = KeyedVectors.load_word2vec_format("cnn_w2v_vectors.txt", binary=False)

In [None]:
# Get the vector for the word "idf"
vector = model.wv["idf"]

# Find the words most similar to "bad"
similar_words = model.wv.most_similar("bad")
print(similar_words)

# Compare how close each side is to "victim": first "palestine" ...
similarity = model.wv.similarity("palestine", "victim")
print(f"Similarity between 'palestine' and 'victim': {similarity}")

# ... then "israel" (the `similarity` variable is reused)
similarity = model.wv.similarity("israel", "victim")
print(f"Similarity between 'israel' and 'victim': {similarity}")

### Potential Portrayal Words
Positive: positive, good, victim, humane, heroic, brave, noble, resilient, justified, courageous, victorious, liberating, righteous, defenders, innocent
Negative: negative, bad, aggressor, attacker, aggressive, brutal, oppressive, merciless, barbaric, ruthless, massacre,
invaders, terrorist,
terroristic, dictatorial, destructive, illegal, corrupt, authoritarian, regressive, settler

Find word frequency for these words


# MASTER FUNCTION

In [None]:
import json
# JSON is UTF-8 by specification; passing the encoding explicitly avoids
# decoding with the platform's locale default.
with open("data/news-data-extracted.json", "r", encoding="utf-8") as json_file:
    data = json.load(json_file)

newspaper = data["cnn.com"] # newspaper is a list of article dictionaries with keys such as url, date, authors, text etc.
newspaper_articles = []

# Keep only the text of each article.
for article in newspaper:
    newspaper_articles.append(article["text"])

print(newspaper_articles)

In [None]:
# Display the raw article dictionaries loaded for "cnn.com".
newspaper

In [None]:
# take a scraped newspaper in, which will be a key to the dictionary we download: example: data['cnn.com']
# get corpus size before preprocessing
# preprocess the newspaper
# get corpus size after preprocessing
# find word frequency for israel, palestine, idf, hamas, gaza, west bank
# save it to the preprocessed_newspaper_articles dictionary
# train word2vec on it
# measure portrayal for both sides
# all this metadata & results in a dict, and preprocessed corpus to preprocessed_newspaper_articles
#
# NOTE(review): this definition reuses the name ``master`` with a different
# signature than the earlier ``master(extracted_file, newspaper_list)`` and
# replaces it once this cell runs -- confirm that is intended.
def master(extracted_file, preprocessed_file, newspaper_name):
    """Load one newspaper's article texts (first step of the plan above).

    Only the loading step is implemented so far; the remaining steps listed
    above the function are still to be written, so nothing is returned yet.

    Args:
        extracted_file: Path to the JSON file of extracted news.
        preprocessed_file: Path of the preprocessed-newspapers file (not
            used yet).
        newspaper_name: Key of the newspaper inside ``extracted_file``.
    """
    import json

    with open(extracted_file, "r", encoding="utf-8") as json_file:
        data = json.load(json_file)

    # newspaper will be a list of dictionaries, each dictionary representing
    # an article with keys being url, date, authors, text etc.
    newspaper = data[f"{newspaper_name}"]

    # The texts have to be collected article by article; indexing the list
    # itself with 'text' raised TypeError.
    newspaper_articles = [article["text"] for article in newspaper]

    # TODO: implement the remaining steps of the plan using newspaper_articles.