# Named Entity Recognition Analysis

# Importing Libraries

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import spacy
from spacy import displacy
import networkx as nx
import os
import matplotlib.pyplot as plt
import scipy
import re
from collections import Counter
from fuzzywuzzy.process import extractOne

ModuleNotFoundError: No module named 'spacy'

In [None]:
!pip install fuzzywuzzy

In [None]:
!pip install python-Levenshtein

In [None]:
# Download spaCy's small English model (en_core_web_sm)

!python -m spacy download en_core_web_sm

In [None]:
# Load spaCy's small English pipeline (en_core_web_sm), the model downloaded above
nlp = spacy.load("en_core_web_sm")

# Load the Twentieth-Century Text Data

In [None]:
# Read the whole source document into memory as a single UTF-8 string
file_path = "20th_century_events_cleaned.txt"
with open(file_path, mode="r", encoding="utf-8") as file:
    text_data = file.read()

In [None]:
# Print a preview of the raw text to sanity-check that the file loaded
print(text_data[:1000])  # First 1000 characters


# Text Wrangling and Cleaning

In [None]:
# Strip bracketed notes and special characters, then collapse whitespace
def clean_text(text):
    """Return *text* reduced to plain ASCII words, digits and basic punctuation.

    Steps, in order:
      1. Remove bracketed spans such as ``[edit]`` or ``[12]`` (single-line only,
         because ``.`` does not match a newline).
      2. Transliterate accented letters to their base letter ("é" -> "e") so
         they are kept instead of being dropped by the ASCII filter.
      3. Turn hyphens, dashes and slashes into spaces so the words or numbers
         they join are not fused together ("Austria-Hungary", "1914–1918").
      4. Delete every remaining character that is not an ASCII letter, a digit,
         one of ``. , ; ! ? ' "`` or whitespace.
      5. Collapse all whitespace runs (including newlines) to a single space
         and trim both ends.

    Args:
        text: The raw document text.

    Returns:
        The cleaned text as a single line.
    """
    # Function-scope import keeps this block self-contained.
    import unicodedata

    text = re.sub(r"\[.*?\]", "", text)  # Remove any [edit] or brackets content

    # NFKD splits an accented letter into base letter + combining mark; dropping
    # the marks leaves the plain ASCII letter behind.
    text = unicodedata.normalize("NFKD", text)
    text = "".join(ch for ch in text if not unicodedata.combining(ch))

    # Joining punctuation becomes a space; deleting it would merge neighbours.
    text = re.sub(r"[-\u2010-\u2015\u2212/]", " ", text)

    text = re.sub(r"[^a-zA-Z0-9.,;!?\'\"\s]", "", text)  # Remove unwanted characters

    # \s+ already covers newlines, so no separate newline replacement is needed.
    return re.sub(r"\s+", " ", text).strip()

In [None]:
# Apply text cleaning to the full raw document
cleaned_text = clean_text(text_data)


In [None]:
# Persist the cleaned text as UTF-8 so later runs can reuse it
cleaned_file_path = "20th_century_events_cleaned_normalized.txt"
with open(cleaned_file_path, mode="w", encoding="utf-8") as file:
    file.write(cleaned_text)

In [None]:
# Print a preview of the cleaned text (first 1000 characters)
print(cleaned_text[:1000])

#  Named Entity Recognition (NER) using spaCy

In [None]:
# Apply NLP processing: run the loaded spaCy pipeline over the whole cleaned text
# NOTE(review): spaCy refuses texts longer than nlp.max_length — confirm the
# document fits, or process it in chunks.
doc = nlp(cleaned_text)


In [None]:
# Extract named entities as (entity text, entity label) pairs, in document order
entities = [(ent.text, ent.label_) for ent in doc.ents]

In [None]:
# Display the first 20 (text, label) pairs as a quick check of the NER output
print(entities[:20])

# Extract and Standardize Country Entities


In [None]:
# Canonical country names used for filtering; entity text is standardized to
# exactly these spellings
countries = [
    "Germany",
    "Japan",
    "United States",
    "France",
    "Italy",
    "China",
    "Russia",
    "India",
    "United Kingdom",
    "Canada",
]


In [None]:
# Standardize country mentions using fuzzy matching
def match_country(entity, threshold=80):
    """Map an entity string to the closest name in ``countries``.

    Args:
        entity: Text to standardize, e.g. the ``.text`` of a spaCy entity.
        threshold: Score the best candidate must strictly exceed to be
            accepted. Defaults to 80, the value previously hard-coded here.

    Returns:
        The best-matching entry of ``countries``, or ``None`` when the input
        is blank, no candidate is returned, or the best score does not exceed
        ``threshold``.
    """
    # Blank input can never name a country; skip the fuzzy search entirely.
    if not entity or not entity.strip():
        return None

    match = extractOne(entity, countries)
    if not match:  # Ensure a valid match before accessing its elements
        return None

    # With a list of choices, extractOne yields a (choice, score) pair.
    best_match, score = match
    return best_match if score > threshold else None



In [None]:
# Filter and standardize country entities.
# match_country() runs a fuzzy search on every call, so it is evaluated once
# per entity and the result is reused as both the filter and the value.
# NOTE(review): entities of every label are matched, not only GPE — confirm
# that is intended.
country_mentions = [
    country
    for country in (match_country(ent.text) for ent in doc.ents)
    if country
]

In [None]:
# Count occurrences of each standardized country name
country_counts = Counter(country_mentions)

In [None]:
# Convert to DataFrame: one row per country with its mention count
# (rows follow the Counter's iteration order; they are not sorted by count)
df_countries = pd.DataFrame(country_counts.items(), columns=["Country", "Mentions"])


In [None]:
# Display the country/mention-count DataFrame
print(df_countries.head())  # Print first 5 rows

# Create a Relationships DataFrame


In [None]:
# Extract sentences containing country mentions.
# The check is a case-sensitive substring test against the canonical names,
# so e.g. "Indiana" also counts as containing "India".
sentences = [sent.text for sent in doc.sents if any(country in sent.text for country in countries)]

In [None]:
# Create relationships DataFrame input: one record per sentence that mentions
# at least two distinct countries
relationship_data = []
for sent in sentences:
    # The names in `countries` are already canonical, so a substring test is
    # enough; fuzzy-matching each name against its own list only returned the
    # name itself. dict.fromkeys() drops duplicates while keeping the order of
    # `countries`, so the "Countries" column is identical on every run (a set's
    # iteration order for strings can change between interpreter runs).
    present_countries = list(
        dict.fromkeys(country for country in countries if country in sent)
    )
    if len(present_countries) > 1:
        relationship_data.append({"Sentence": sent, "Countries": ", ".join(present_countries)})

In [None]:
# Pin the columns so the frame keeps its schema (and the exported CSV keeps its
# header row) even when no sentence mentioned two or more countries
df_relationships = pd.DataFrame(relationship_data, columns=["Sentence", "Countries"])

In [None]:
# Display the sentence/countries relationship DataFrame
print(df_relationships.head())  # Print first 5 rows

#  Save and Export the DataFrame

In [None]:
# Save to CSV in the working directory; index=False leaves the row index out of the file
df_relationships.to_csv("country_relationships.csv", index=False)