In [None]:
import pandas as pd


# Load the combined posts CSV into a DataFrame (absolute path; presumably a Colab upload — confirm).
df = pd.read_csv('/content/combined_file.csv')


# Plain-text preview of the first five rows.
print(df.head())

                                               title  \
0  Looking for married Muslim men who have hijabi...   
1  Share your istikhara success stories. I need s...   
2                                              Fate?   
3            Good Thrift shop find. Highly reccomend   
4                     Wearing my kippah with tattoos   

                                                text score upvote_ratio  \
0  Don’t message me if you can’t live verify. Too...     1            1   
1  Salaam everyone. I’m a F currently going throu...     1            1   
2  In the Qur'an, I saw verses in these cases tha...     1            1   
3  Holocaust book about family of Jewish Hungaria...     1            1   
4  Shalom friends!\n\nI’m a Baal teshuva with man...     1            1   

  upvotes Unnamed: 5 Unnamed: 6  
0       1        NaN        NaN  
1       1        NaN        NaN  
2       1        NaN        NaN  
3       1        NaN        NaN  
4       1        NaN        NaN  


In [None]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(92815, 7)

In [None]:
# Rich table display of the first five rows.
df.head()

Unnamed: 0,title,text,score,upvote_ratio,upvotes,Unnamed: 5,Unnamed: 6
0,Looking for married Muslim men who have hijabi...,Don’t message me if you can’t live verify. Too...,1,1,1,,
1,Share your istikhara success stories. I need s...,Salaam everyone. I’m a F currently going throu...,1,1,1,,
2,Fate?,"In the Qur'an, I saw verses in these cases tha...",1,1,1,,
3,Good Thrift shop find. Highly reccomend,Holocaust book about family of Jewish Hungaria...,1,1,1,,
4,Wearing my kippah with tattoos,Shalom friends!\n\nI’m a Baal teshuva with man...,1,1,1,,


In [None]:
pip install spacy nltk wordcloud




# Task 2: Performing NER

In [None]:

import spacy
import pandas as pd
# Load spaCy's small English pipeline; extract_specific_entities reads this module-level object.
nlp = spacy.load("en_core_web_sm")
def extract_specific_entities(text):
    """Return the 'Hamas' / 'Israel' named entities that spaCy finds in ``text``.

    Args:
        text: Raw text to analyse. Non-string values are treated as having no
            entities.

    Returns:
        A list of ``(entity_text, entity_label)`` tuples for every entity whose
        text is exactly 'Hamas' or 'Israel', or ``None`` when there are none.
    """
    target_names = ('Hamas', 'Israel')

    # An entity's text is a slice of the input, so a text that contains neither
    # name can never produce a match. Skipping the spaCy pipeline for those rows
    # gives the same result and avoids parsing every row of the frame.
    if not isinstance(text, str) or not any(name in text for name in target_names):
        return None

    doc = nlp(text)
    entities = [(ent.text, ent.label_) for ent in doc.ents if ent.text in target_names]
    return entities if entities else None


# Work on a copy: a plain `df_subset = df` is only an alias, so the column
# writes below would silently modify the raw frame loaded from the CSV.
df_subset = df.copy()
# Replace non-string cells (e.g. NaN from empty CSV fields) with '' so the NER
# function always receives a str.
df_subset['text'] = df_subset['text'].apply(lambda value: value if isinstance(value, str) else '')
df_subset['entities'] = df_subset['text'].apply(extract_specific_entities)


# Keep only rows where at least one target entity was found. The .copy() makes
# df_filtered an independent frame, so adding columns to it later does not
# raise SettingWithCopyWarning.
df_filtered = df_subset[df_subset['entities'].notnull()].copy()


print(df_filtered[['text', 'entities']].head())


KeyboardInterrupt: 

In [None]:
# Rich display of the filtered rows next to the entities matched in each.
df_filtered[['text', 'entities']].head()

# Filtering the texts containing Israel and Hamas only after NER


# Extracting verbs and modifiers related to entities

**Printing the extracted entity context**

# **Correct code**

In [None]:
import spacy
import pandas as pd
from collections import defaultdict
from wordcloud import WordCloud
import matplotlib.pyplot as plt


# Load spaCy's small English pipeline; extract_entity_context below reads this module-level object.
nlp = spacy.load("en_core_web_sm")


def extract_entity_context(text):
    """Collect verbs and modifiers attached to 'Hamas' / 'Israel' mentions in ``text``.

    For every named entity whose text is exactly 'Hamas' or 'Israel':
      * verbs: the text of the head of each token in the document that equals
        the entity name, has dependency ``nsubj`` and whose head is a VERB;
      * modifiers: the text of each child of the entity's root token whose
        dependency is ``amod`` or ``advmod``.

    Args:
        text: Raw text to analyse.

    Returns:
        dict mapping entity name -> ``{'verbs': [...], 'modifiers': [...]}``.
        Each list is duplicate-free and sorted, so the output is reproducible
        between runs. An entity with neither verbs nor modifiers is omitted.
        Returns ``{}`` for non-string input or text containing neither name.
    """
    target_names = ('Hamas', 'Israel')

    # Entity text is a slice of the input, so without either name in the raw
    # text nothing can match; skip the spaCy pipeline entirely.
    if not isinstance(text, str) or not any(name in text for name in target_names):
        return {}

    doc = nlp(text)

    # Scan the tokens once, instead of once per entity mention, and remember
    # which verbs each target name is the subject of.
    subject_verbs = defaultdict(set)
    for token in doc:
        if token.dep_ == 'nsubj' and token.head.pos_ == 'VERB' and token.text in target_names:
            subject_verbs[token.text].add(token.head.text)

    entity_contexts = defaultdict(lambda: {'verbs': set(), 'modifiers': set()})
    for ent in doc.ents:
        entity = ent.text
        if entity not in target_names:
            continue
        # Only touch entity_contexts when there is something to add, so that
        # entities without any context stay out of the result.
        if entity in subject_verbs:
            entity_contexts[entity]['verbs'].update(subject_verbs[entity])
        for child in ent.root.children:
            if child.dep_ in ('amod', 'advmod'):
                entity_contexts[entity]['modifiers'].add(child.text)

    # sorted() rather than list(): set iteration order for strings varies
    # between interpreter runs.
    return {entity: {'verbs': sorted(context['verbs']), 'modifiers': sorted(context['modifiers'])}
            for entity, context in entity_contexts.items()}


# df_filtered was produced by boolean-mask filtering, so writing a column into
# it directly can raise SettingWithCopyWarning. assign() returns a new frame
# with the extra column instead.
df_filtered = df_filtered.assign(
    entity_contexts=df_filtered['text'].apply(extract_entity_context)
)


# Flatten the per-post contexts into one word list per entity
# (verbs first, then modifiers, in row order).
hamas_verbs_mods = []
israel_verbs_mods = []

for context in df_filtered['entity_contexts']:
    if 'Hamas' in context:
        hamas_verbs_mods.extend(context['Hamas']['verbs'] + context['Hamas']['modifiers'])
    if 'Israel' in context:
        israel_verbs_mods.extend(context['Israel']['verbs'] + context['Israel']['modifiers'])


# Space-joined strings are the input format the word-cloud generator expects.
hamas_text = ' '.join(hamas_verbs_mods)
israel_text = ' '.join(israel_verbs_mods)


def generate_word_cloud(text, title):
    """Render and show a word cloud built from ``text``.

    Args:
        text: Space-separated words to visualise.
        title: Title drawn above the figure.

    Returns:
        None. When ``text`` is empty or only whitespace a short notice is
        printed and no figure is produced.
    """
    # WordCloud.generate raises ValueError when it has no words to place, which
    # would abort the cell if no verbs/modifiers were collected for an entity.
    if not text or not text.strip():
        print(f"{title}: no words available, skipping word cloud")
        return

    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.title(title, fontsize=16)
    plt.axis('off')  # the axes carry no meaning for a word cloud image
    plt.show()

# Render one cloud per entity from the joined verb/modifier strings.
for cloud_text, cloud_title in (
    (hamas_text, "Hamas: Verbs and Modifiers"),
    (israel_text, "Israel: Verbs and Modifiers"),
):
    generate_word_cloud(cloud_text, cloud_title)

In [None]:
# Inspect the first 50 posts together with the verbs/modifiers extracted for each entity.
df_filtered[['text', 'entity_contexts']].head(50)


# **Valence Scores**

In [None]:
# (rows, columns) of the entity-filtered frame.
df_filtered.shape


In [None]:

def load_vad_lexicon(file_path):
    """Read a one-score-per-line lexicon file into a ``{term: score}`` dict.

    Each line is split on whitespace; the last field is parsed as the float
    score and the preceding fields, re-joined with single spaces, form the term
    (so multi-word terms are supported).

    Args:
        file_path: Path to the lexicon text file.

    Returns:
        dict mapping term -> float score. Blank lines and lines with fewer than
        two fields are skipped; lines whose last field is not a number (e.g. a
        header row) are reported and skipped.
    """
    vad_lexicon = {}
    # Explicit encoding so the result does not depend on the platform default.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            parts = line.split()
            if len(parts) < 2:
                # Blank line or a term without a score: nothing to record.
                continue
            try:
                score = float(parts[-1])
            except ValueError:
                print(f"Skipping line due to invalid score: {line}")
                continue
            vad_lexicon[" ".join(parts[:-1])] = score
    return vad_lexicon


# Term -> score mapping read from the valence lexicon file (absolute path; presumably a Colab upload — confirm).
valence_lexicon = load_vad_lexicon('/content/valence-NRC-VAD-Lexicon.txt')


def calculate_average_valence(entity_data, valence_lexicon):
    """Average the lexicon scores of an entity's verbs and modifiers.

    Args:
        entity_data: dict with list-valued keys ``'verbs'`` and ``'modifiers'``.
        valence_lexicon: dict mapping term -> float score.

    Returns:
        The mean score of all words found in the lexicon, or ``None`` when no
        word is found.
    """
    valence_scores = []

    for word in entity_data['verbs'] + entity_data['modifiers']:
        # Exact match first; otherwise retry in lowercase so that capitalised
        # tokens (e.g. a sentence-initial "Attack") are not silently dropped.
        # Presumably the lexicon terms are all lowercase — confirm against the file.
        if word in valence_lexicon:
            valence_scores.append(valence_lexicon[word])
        elif word.lower() in valence_lexicon:
            valence_scores.append(valence_lexicon[word.lower()])

    if not valence_scores:
        return None
    return sum(valence_scores) / len(valence_scores)


# Per-post average valence for each entity. assign() returns a new frame, which
# avoids writing a column into the boolean-mask slice that df_filtered came
# from (that write can raise SettingWithCopyWarning).
df_filtered = df_filtered.assign(
    valence_scores=df_filtered['entity_contexts'].apply(
        lambda x: {entity: calculate_average_valence(context, valence_lexicon)
                   for entity, context in x.items()}
    )
)


hamas_valences = []
israel_valences = []


# Only one column is needed, so iterate it directly instead of building a
# Series per row with iterrows().
for scores in df_filtered['valence_scores']:
    for entity, avg_valence in scores.items():
        if avg_valence is None:
            # None means no word of this entity was found in the lexicon.
            continue
        if entity == 'Hamas':
            hamas_valences.append(avg_valence)
        elif entity == 'Israel':
            israel_valences.append(avg_valence)


# Corpus-level figure: the mean of the per-post averages.
if hamas_valences:
    avg_hamas_valence = sum(hamas_valences) / len(hamas_valences)
    print(f"Hamas - Average Valence: {avg_hamas_valence:.12f}")
else:
    print("Hamas - No Valence Data Available")

if israel_valences:
    avg_israel_valence = sum(israel_valences) / len(israel_valences)
    print(f"Israel - Average Valence: {avg_israel_valence:.12f}")
else:
    print("Israel - No Valence Data Available")

In [None]:

def load_vad_lexicon(file_path):
    """Parse a lexicon text file into a dictionary of term -> float score.

    Every line is whitespace-split: the final field is the score, and all
    fields before it (joined by single spaces) make up the term. Lines with
    fewer than two fields are ignored; lines whose final field cannot be
    converted to float are reported on stdout and ignored.
    """
    scores_by_term = {}
    with open(file_path, 'r') as handle:
        for line in handle:
            fields = line.strip().split()
            if len(fields) >= 2:
                *term_tokens, raw_score = fields
                try:
                    scores_by_term[" ".join(term_tokens)] = float(raw_score)
                except ValueError:
                    print(f"Skipping line due to invalid score: {line}")
    return scores_by_term


# Term -> score mapping read from the dominance lexicon file (absolute path; presumably a Colab upload — confirm).
dominance_lexicon = load_vad_lexicon('/content/dominance-NRC-VAD-Lexicon.txt')


def calculate_average_dominance(entity_data, dominance_lexicon):
    """Average the lexicon scores of an entity's verbs and modifiers.

    Args:
        entity_data: dict with list-valued keys ``'verbs'`` and ``'modifiers'``.
        dominance_lexicon: dict mapping term -> float score.

    Returns:
        The mean score of all words found in the lexicon, or ``None`` when no
        word is found.
    """
    dominance_scores = []

    for word in entity_data['verbs'] + entity_data['modifiers']:
        # Exact match first; otherwise retry in lowercase so that capitalised
        # tokens (e.g. a sentence-initial "Attack") are not silently dropped.
        # Presumably the lexicon terms are all lowercase — confirm against the file.
        if word in dominance_lexicon:
            dominance_scores.append(dominance_lexicon[word])
        elif word.lower() in dominance_lexicon:
            dominance_scores.append(dominance_lexicon[word.lower()])

    if not dominance_scores:
        return None
    return sum(dominance_scores) / len(dominance_scores)


# Per-post average dominance for each entity. assign() returns a new frame,
# which avoids writing a column into the boolean-mask slice that df_filtered
# came from (that write can raise SettingWithCopyWarning).
df_filtered = df_filtered.assign(
    dominance_scores=df_filtered['entity_contexts'].apply(
        lambda x: {entity: calculate_average_dominance(context, dominance_lexicon)
                   for entity, context in x.items()}
    )
)


hamas_dominances = []
israel_dominances = []


# Only one column is needed, so iterate it directly instead of building a
# Series per row with iterrows().
for scores in df_filtered['dominance_scores']:
    for entity, avg_dominance in scores.items():
        if avg_dominance is None:
            # None means no word of this entity was found in the lexicon.
            continue
        if entity == 'Hamas':
            hamas_dominances.append(avg_dominance)
        elif entity == 'Israel':
            israel_dominances.append(avg_dominance)


# Corpus-level figure: the mean of the per-post averages.
if hamas_dominances:
    avg_hamas_dominance = sum(hamas_dominances) / len(hamas_dominances)
    print(f"Hamas - Average Dominance: {avg_hamas_dominance:.12f}")
else:
    print("Hamas - No Dominance Data Available")

if israel_dominances:
    avg_israel_dominance = sum(israel_dominances) / len(israel_dominances)
    print(f"Israel - Average Dominance: {avg_israel_dominance:.12f}")
else:
    print("Israel - No Dominance Data Available")

In [None]:
# Inspect the first 50 filtered rows, including the context and score columns added by the cells above.
df_filtered.head(50)