What we need to do:
1) Go through each newspaper, build a corpus for it, and display the size of each corpus
2) Either a) find a Wikipedia dataset and train word2vec on it, to compare non-context-related words like "bad",
"victim", "good", "attacker" with the representation of each side of the conflict in the newspaper corpora, or b) find a
pretrained word2vec model to do the same
3) Find a way to classify each article as pro-Israel or pro-Palestine

### Access and preprocess our data

In [None]:
import json

# Open and read the JSON file. The encoding is given explicitly so the result
# does not depend on the platform's default text encoding.
with open('data/news-data-extracted.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Pick the first CNN article: "cnn.com" is the key to a value which is a list
# of dictionaries (articles); keep the first one and its "text" entry.
first_article_data = data["cnn.com"][0]
first_article = first_article_data["text"]

# Print the whole article dictionary
print(first_article_data)

In [None]:
# Show the first element of the first article's "text" value
# (presumably a string, in which case this is its first character; confirm).
first_article[0]

In [None]:
from gensim.utils import tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

def tokenize_article(article):
    """Split an article into sentences and each sentence into word tokens.

    Sentences come from NLTK's ``sent_tokenize`` (English) and words from
    gensim's ``tokenize``.  gensim's ``tokenize`` yields tokens lazily, so
    every sentence is materialised into a list here; otherwise each sentence
    could be iterated only once and could not be indexed, measured with
    ``len`` or printed.

    Args:
        article: Raw article text.

    Returns:
        A list of sentences, each one a list of word tokens.
    """
    return [
        list(tokenize(sentence))  # divide sentences into words
        for sentence in sent_tokenize(article, language="english")
    ]

def lowercase(tokenized_article):
    """Return a new tokenized article in which every word is lower-cased.

    Args:
        tokenized_article: Iterable of sentences, each an iterable of words.

    Returns:
        A new list of sentences (lists of lower-cased words); the input is
        not modified.
    """
    return [[token.lower() for token in tokens] for tokens in tokenized_article]

# English stop words, kept in a set for the membership tests below.
stop_words = set(stopwords.words("english"))


def remove_stopwords(tokenized_article):
    """Drop every stop word from each sentence of the tokenized article.

    The outer list is updated in place: each sentence is replaced by a newly
    built filtered list, and the same outer list object is returned.

    Args:
        tokenized_article: List of sentences, each an iterable of words.

    Returns:
        ``tokenized_article`` itself, with every sentence filtered.
    """
    for position, sentence in enumerate(tokenized_article):
        tokenized_article[position] = [
            token for token in sentence if token not in stop_words
        ]
    return tokenized_article

def lammetization(tokenized_article):
    """Lemmatize every word of the tokenized article with NLTK's WordNetLemmatizer.

    Args:
        tokenized_article: Iterable of sentences, each an iterable of words.

    Returns:
        A new list of sentences (lists of lemmatized words); the input is not
        modified.
    """
    lemmatize = WordNetLemmatizer().lemmatize
    return [[lemmatize(token) for token in tokens] for tokens in tokenized_article]


def remove_punctuation(tokenized_article):
    """Strip non-word characters from every token and drop tokens left empty.

    Args:
        tokenized_article: Iterable of sentences, each an iterable of words.

    Returns:
        A new list of sentences in which each word has had every run of
        characters outside ``\\w`` removed; words that become empty are
        omitted, sentences that become empty are kept as empty lists.
    """
    cleaned_article = []
    for tokens in tokenized_article:
        # Deleting the matched runs gives the same text as splitting on them
        # and joining the pieces back together.
        stripped = (re.sub(r"[^\w]+", "", token) for token in tokens)
        cleaned_article.append([token for token in stripped if token])
    return cleaned_article

def preprocess_article(article):
    """Run the full preprocessing pipeline on one article.

    The steps are applied in this order: sentence/word tokenization,
    lower-casing, stop-word removal, lemmatization, punctuation removal.

    Args:
        article: Raw article text.

    Returns:
        The result of ``remove_punctuation`` on the processed article: a list
        of sentences, each a list of words.
    """
    sentences = lowercase(tokenize_article(article))
    sentences = lammetization(remove_stopwords(sentences))
    return remove_punctuation(sentences)

In [None]:
# Display the stop-word set built earlier, then NLTK's English stop-word list
# and a set freshly built from that list, for comparison.
print(stop_words)
print(stopwords.words("english"))
print(set(stopwords.words("english")))

In [None]:
preprocessed = preprocess_article(first_article)
print(preprocessed)

# Sanity check: print every stop word that survived preprocessing.
# `preprocessed` is a list of sentences (each a list of words), so the words
# are flattened into a set first. Testing `word in preprocessed` directly
# would compare each stop word against whole sentences and never match.
remaining_words = {word for sentence in preprocessed for word in sentence}
for word in stopwords.words("english"):
    if word in remaining_words:
        print(word)

Now create a function that preprocesses a newspaper

In [None]:
# def create_article_list(extracted_file, newspaper_name):
#     """ Takes in the file of extracted news and the newspaper name
#     Outputs an artilce (text) list for the given newspaper
#     """
#     import json
#     with open(extracted_file, "r") as json_file:
#         data = json.load(json_file)
#
#     newspaper = data[newspaper_name] # newspaper will be a dictionary of articles with values being url, date, authors, text etc.
#     newspaper_articles = []
#
#     for article in newspaper:
#         if article != None:
#             newspaper_articles.append(f"{article["text"]}")
#
#     print(newspaper_articles)

def create_article_list(extracted_file, newspaper_name):
    """
    Return the article texts stored for one newspaper.

    Args:
        extracted_file: Path to the JSON file of extracted news. Its top-level
            object is read as a mapping from newspaper name to a list of
            article dictionaries.
        newspaper_name: Key of the newspaper to read, e.g. "cnn.com".

    Returns:
        A list holding the "text" value of every usable article. Entries that
        are empty, are not dictionaries, or whose "text" is not a string are
        skipped, because later steps call string methods on each text. The
        list is empty when the newspaper is not in the file.
    """
    import json

    # Load the JSON file; read it as UTF-8 explicitly rather than with the
    # platform's default encoding.
    with open(extracted_file, "r", encoding="utf-8") as json_file:
        data = json.load(json_file)

    # `or []` covers both a missing newspaper and one stored as null.
    newspaper = data.get(newspaper_name) or []

    newspaper_articles = [
        article["text"]
        for article in newspaper
        if isinstance(article, dict) and isinstance(article.get("text"), str)
    ]

    print(f"Extracted {len(newspaper_articles)} articles from {newspaper_name}.")
    return newspaper_articles



In [None]:
def preprocess_newspaper(article_list):
    """Preprocess every article of a newspaper into one flat list of sentences.

    Args:
        article_list: List of raw article texts.

    Returns:
        A single list containing the preprocessed sentences of all articles,
        in article order; each sentence is whatever ``preprocess_article``
        produces for it. An empty list is returned (and a notice printed)
        when ``article_list`` is empty or None.
    """
    if not article_list:  # nothing to preprocess
        print("No articles provided for preprocessing.")
        return []

    all_sentences = []
    for index, article in enumerate(article_list):
        # Sentences are appended flat: article boundaries are not kept.
        all_sentences.extend(preprocess_article(article))
        print(f"article {index} preprocessed")  # progress report

    return all_sentences

In [None]:
# Lets try with CNN
# Extract the CNN article texts, then preprocess them into one flat list of
# sentences; both results are printed in full.
article_list = create_article_list("data/news-data-extracted.json", "cnn.com")
print(article_list)
preprocessed = preprocess_newspaper(article_list)
print(preprocessed)

In [None]:
# create get_article_number, corpussize, and other helper functions

def no_of_articles(article_list):
    """Return the number of articles, i.e. the length of ``article_list``."""
    return len(article_list)

def corpus_size_before(article_list):
    """Return the corpus size before preprocessing.

    The size is the total number of whitespace-separated words over all raw
    article texts in ``article_list``.
    """
    return sum(len(text.split()) for text in article_list)


def corpus_size_after(preprocessed_article_list):
    """Return the corpus size after preprocessing.

    The size is the total number of words over all sentences in
    ``preprocessed_article_list`` (a list of sentences, each an iterable of
    words).
    """
    return sum(
        1
        for sentence in preprocessed_article_list
        for _ in sentence
    )


def no_of_unique_words(preprocessed_article_list):
    """Return the number of distinct words in the preprocessed corpus.

    Args:
        preprocessed_article_list: List of sentences, each an iterable of
            words (strings).

    Returns:
        How many different words occur across all sentences.
    """
    # A set makes each "seen before?" check a hash lookup; checking against a
    # growing list rescans it for every word of the corpus.
    unique_words = {
        word
        for sentence in preprocessed_article_list
        for word in sentence
    }
    return len(unique_words)

def no_of_sentences(preprocessed_article_list):
    """Return the number of sentences, i.e. the length of ``preprocessed_article_list``."""
    return len(preprocessed_article_list)

In [None]:
""" Occurance Counter """

# Counts the occurrences of any target word (e.g. "palestine")
def occurance(target_word, preprocessed_article_list):
    """Count how often ``target_word`` appears in the preprocessed corpus.

    Args:
        target_word: Word to look for; it is formatted into a string before
            being compared.
        preprocessed_article_list: List of sentences, each an iterable of
            words.

    Returns:
        The number of words, over all sentences, equal to the target string.
    """
    wanted = f"{target_word}"
    return sum(
        1
        for sentence in preprocessed_article_list
        for token in sentence
        if token == wanted
    )

### Training function

In [None]:
def train(newspaper_name, sentence_list):
    """Train a Word2Vec model on one newspaper's sentences and save it to disk.

    Everything is written into a directory named after the newspaper (created
    if needed): the full gensim model plus the word vectors in word2vec text
    and binary format.

    Args:
        newspaper_name: Name of the newspaper; used as the output directory
            and as the file-name prefix.
        sentence_list: List of sentences, each a list of words.

    Returns:
        A tuple ``(model_path, vectors_txt_path, vectors_bin_path)`` with the
        paths of the three files written.
    """
    from gensim.models import Word2Vec
    import os

    # Ensure the directory exists
    os.makedirs(newspaper_name, exist_ok=True)

    # Build each output path once; the same values are saved to and returned.
    model_path = os.path.join(newspaper_name, f"{newspaper_name}_w2v.model")
    vectors_txt_path = os.path.join(newspaper_name, f"{newspaper_name}_w2v_vectors.txt")
    vectors_bin_path = os.path.join(newspaper_name, f"{newspaper_name}_w2v_vectors.bin")

    # Passing `sentences` to the constructor builds the vocabulary and trains
    # the model immediately, so the epoch count is given here. Calling
    # model.train() afterwards would train a second time on top of that pass.
    model = Word2Vec(
        sentences=sentence_list,
        vector_size=300,
        window=5,
        min_count=10,
        sg=1,
        workers=4,
        negative=20,
        epochs=20,
    )

    # Save the full model, then just the word vectors in text and binary format
    model.save(model_path)
    model.wv.save_word2vec_format(vectors_txt_path, binary=False)
    model.wv.save_word2vec_format(vectors_bin_path, binary=True)

    return model_path, vectors_txt_path, vectors_bin_path


### Calculate portrayal

In [None]:
def _mean_similarity(model, word, reference_words):
    """Return the average similarity of `word` to `reference_words` (0.0 if there are none)."""
    if not reference_words:
        return 0.0
    # float() turns the model's similarity value into a built-in float so the
    # scores stay plain Python numbers.
    total = sum(float(model.wv.similarity(word, reference)) for reference in reference_words)
    return total / len(reference_words)


def _portrayal_scores(model, target_words, positive_words, negative_words):
    """Score each target word as mean positive similarity minus mean negative similarity."""
    vocabulary = model.wv.key_to_index
    scores = {}
    for word in target_words:
        if word in vocabulary:
            scores[word] = (
                _mean_similarity(model, word, positive_words)
                - _mean_similarity(model, word, negative_words)
            )
        else:
            # The model has no vector for this word, so no similarity can be
            # computed; keep the key with a neutral score.
            scores[word] = 0.0
    return scores


def calculate_portrayal(model, palestinian_words, israeli_words, positive_portrayal_words, negative_portrayal_words): # target_words and portrayal_words are lists
    """Score how positively each target word is portrayed by a trained model.

    A word's score is its average similarity to the positive portrayal words
    minus its average similarity to the negative portrayal words. Only
    portrayal words present in the model's vocabulary take part, and the
    averages are taken over those words only.

    Args:
        model: Trained model exposing ``model.wv.key_to_index`` and
            ``model.wv.similarity(word_a, word_b)``.
        palestinian_words: Target words for the Palestinian side.
        israeli_words: Target words for the Israeli side.
        positive_portrayal_words: Words standing for a positive portrayal.
        negative_portrayal_words: Words standing for a negative portrayal.

    Returns:
        A tuple ``(palestine_portrayal_scores, israel_portrayal_scores)`` of
        dictionaries mapping each target word to its score (a built-in
        float). A target word missing from the vocabulary gets 0.0.
    """
    # key_to_index is a mapping, so membership tests are direct lookups.
    vocabulary = model.wv.key_to_index

    # Each portrayal list is filtered against the vocabulary on its own:
    # negative words are checked as negative words, not against a positive one.
    known_positive = [word for word in positive_portrayal_words if word in vocabulary]
    known_negative = [word for word in negative_portrayal_words if word in vocabulary]

    palestine_portrayal_scores = _portrayal_scores(model, palestinian_words, known_positive, known_negative)
    israel_portrayal_scores = _portrayal_scores(model, israeli_words, known_positive, known_negative)

    return palestine_portrayal_scores, israel_portrayal_scores

Should I include "gaza"? If yes, add an occurrence count for it and add it to the target word list and the portrayal calculation.

Save newspaper dictionary

In [None]:
import json
import os

def save_newspaper_dict(newspaper_dict):
    """Merge newspapers into the JSON store "preprocessed_newspapers_dict.json".

    Newspapers already present in the file are left untouched; only keys that
    are not stored yet are added. The file is created when it does not exist.
    If the existing file cannot be decoded or does not hold a dictionary, its
    content is treated as empty (as ``load_preprocessed_newspapers`` does) and
    the file is rewritten.

    Args:
        newspaper_dict: Mapping from newspaper name to its JSON-serializable
            record.

    Raises:
        TypeError: If a record holds a value ``json`` cannot serialize. The
            existing file is left unchanged in that case.
    """
    # File path for the JSON file
    file_path = "preprocessed_newspapers_dict.json"

    # Load existing data, starting empty on the first run.
    try:
        with open(file_path, "r", encoding="utf-8") as json_file:
            data = json.load(json_file)
    except FileNotFoundError:
        data = {}
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON file {file_path}: {e}. Starting with an empty dictionary.")
        data = {}
    if not isinstance(data, dict):
        data = {}

    # Add only the newspapers that are not saved yet
    for key, value in newspaper_dict.items():
        data.setdefault(key, value)

    # Serialize before opening the file for writing: opening with "w"
    # truncates it, so a serialization error must surface first.
    serialized = json.dumps(data, indent=4)
    with open(file_path, "w", encoding="utf-8") as json_file:
        json_file.write(serialized)

In [None]:
import json
import os

def load_preprocessed_newspapers(json_file):
    """
    Read the dictionary of preprocessed newspapers from a JSON file.

    Returns the loaded dictionary on success. An empty dictionary is returned,
    with a message printed, when the file does not exist, cannot be decoded
    as JSON, or does not contain a dictionary at the top level.
    """
    if not os.path.exists(json_file):
        print(f"File {json_file} does not exist. Starting with an empty dictionary.")
        return {}

    try:
        with open(json_file, 'r') as handle:
            loaded = json.load(handle)
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON file {json_file}: {e}")
        return {}

    if isinstance(loaded, dict):
        print(f"Successfully loaded preprocessed newspapers from {json_file}.")
        return loaded

    print("Error: JSON data is not a dictionary. Returning an empty dictionary.")
    return {}


In [None]:
# Newspapers to process; each name is used as a key into the extracted-news
# JSON and as the output directory for that newspaper's model files.
newspaper_list = ["cnn.com", "WashingtonPost.com"]

def master(extracted_file, newspaper_list):
    """
    Process every newspaper that has not been processed yet.

    Previously processed newspapers are loaded from
    "preprocessed_newspapers_dict.json" and skipped. For each remaining
    newspaper a record is built with these keys, in this order:
      - "no_of_articles", "corpus_size_before_preprocess", "corpus_size",
        "no_of_unique_words", "no_of_sentences"
      - "occurance_<word>" for each target word (palestine, palestinian,
        hamas, sinwar, israel, israeli, idf, netanyahu)
      - "model_location", "vectors_txt_location", "vectors_bin_location":
        the files written by ``train``
      - "portrayal_palestine" / "portrayal_israel": per-word portrayal scores
      - "portrayal_palestine_score" / "portrayal_israel_score": the average
        of the per-word scores of each side
      - "palestine-israel_score": Palestinian average minus Israeli average
      - "articles": the raw article texts
      - "preprocessed": the preprocessed sentences (lists of words)
    The dictionary is saved to disk after each newspaper.

    Args:
        extracted_file: Path to the JSON file of extracted news.
        newspaper_list: Names of the newspapers to process.

    Returns:
        The dictionary of all newspaper records, loaded and newly created.
    """
    from gensim.models import Word2Vec

    preprocessed_newspapers = load_preprocessed_newspapers("preprocessed_newspapers_dict.json")

    # These word lists are the same for every newspaper, so they are defined
    # once, outside the loop.
    palestinian_words = ["palestine", "palestinian", "hamas", "sinwar"]
    israeli_words = ["israel", "israeli", "idf", "netanyahu"]

    # positive categories: general (good etc), victim,
    positive_portrayal_words = ["positive", "good", "victim", "resilient", "justified", "defend", "innocent", "rightful", "humane"]
    negative_portrayal_words = ["negative", "bad", "aggressor", "attacker", "brutal", "illegal", "terrorist", "barbaric", "massacre", "invade"]

    for newspaper in newspaper_list:
        # check if the newspaper is already preprocessed, if it is skip it
        if newspaper in preprocessed_newspapers:
            continue

        article_list = create_article_list(extracted_file, newspaper)
        sentence_list = preprocess_newspaper(article_list)

        dict_newspaper = {
            "no_of_articles": no_of_articles(article_list),
            "corpus_size_before_preprocess": corpus_size_before(article_list),
            "corpus_size": corpus_size_after(sentence_list),
            "no_of_unique_words": no_of_unique_words(sentence_list),
            "no_of_sentences": no_of_sentences(sentence_list),
        }
        for target_word in palestinian_words + israeli_words:
            dict_newspaper[f"occurance_{target_word}"] = occurance(target_word, sentence_list)

        # train a word2vec model and remember where its files were written
        model_location, vectors_txt_location, vectors_bin_location = train(newspaper, sentence_list)
        dict_newspaper["model_location"] = model_location
        dict_newspaper["vectors_txt_location"] = vectors_txt_location
        dict_newspaper["vectors_bin_location"] = vectors_bin_location

        # Load the model from the path train() reported, not a rebuilt one
        model = Word2Vec.load(model_location)

        palestine_scores, israel_scores = calculate_portrayal(model, palestinian_words, israeli_words, positive_portrayal_words, negative_portrayal_words)

        # json serializes only built-in number types, while the model's
        # similarity values are NumPy scalars; convert so the record can be saved.
        palestine_scores = {word: float(score) for word, score in palestine_scores.items()}
        israel_scores = {word: float(score) for word, score in israel_scores.items()}
        print(f"{newspaper}", palestine_scores, israel_scores)

        # Average over however many words were scored instead of a fixed 4.
        palestine_average = sum(palestine_scores.values()) / len(palestine_scores) if palestine_scores else 0
        israel_average = sum(israel_scores.values()) / len(israel_scores) if israel_scores else 0

        dict_newspaper["portrayal_palestine"] = palestine_scores
        dict_newspaper["portrayal_palestine_score"] = palestine_average
        dict_newspaper["portrayal_israel"] = israel_scores
        dict_newspaper["portrayal_israel_score"] = israel_average
        dict_newspaper["palestine-israel_score"] = palestine_average - israel_average
        print("palestinian are better portrayed by: ", dict_newspaper["palestine-israel_score"])

        dict_newspaper["articles"] = article_list
        dict_newspaper["preprocessed"] = sentence_list

        # Register the finished record and save after each newspaper, so
        # completed work is kept if a later newspaper fails.
        preprocessed_newspapers[newspaper] = dict_newspaper
        save_newspaper_dict(preprocessed_newspapers)

    return preprocessed_newspapers

In [None]:
# Run the whole pipeline for the newspapers in `newspaper_list`; newspapers
# already stored in the preprocessed JSON file are skipped by master().
processed_newspapers = master("data/news-data-extracted.json", newspaper_list)