# Assignment 2 
### Kusal Bista

In [2]:
# Libraries for reading data
import random
import numpy as np
import pandas as pd 
import glob
import json
from tqdm import tqdm

# Libraries for pre-processing
import re
import nltk

from nltk import tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

# Libraries for information retrieval
import spacy
from spacy import displacy
from spacy.lang.en import English
import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Libraries for data analysis
import matplotlib.pyplot as plt
from tabulate import tabulate

In [2]:
# !pip install spacy
# !python -m spacy download en_core_web_sm
# !python -m spacy download en_core_web_md
# !pip install tabulate
# !pip install nltk

In [3]:
# Fetch the NLTK resources this notebook relies on; packages that are already
# present locally are reported as up-to-date rather than downloaded again.
#   'stopwords' -> read by stopwords.words('english') in pre_process
#   'wordnet'   -> data used by WordNetLemmatizer in pre_process
#   'punkt'     -> tokenizer data for the imported nltk tokenizers
#   'omw-1.4'   -> presumably required alongside 'wordnet' on newer NLTK versions; TODO confirm
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\a1881044\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\a1881044\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\a1881044\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\a1881044\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

### 1 Reading dataset and pre-processing

In [50]:
# Load the articles. latin-1 maps every possible byte to a character, so decoding never fails.
# NOTE(review): the preview shows '?' where punctuation/accents are expected, so the
# file may not really be latin-1 (or was already lossy) — confirm the source encoding.
news_dataset = pd.read_csv('news_dataset.csv', encoding='latin-1')

In [37]:
# Preview the first rows to check that the columns were parsed as expected
news_dataset.head()

Unnamed: 0,id,author,date,year,month,topic,article
0,17307,Marlise Simons,1/01/2017,2017,1,architecture,PARIS ? When the Islamic State was about to...
1,17292,Andy Newman,31/12/2016,2016,12,art,Angels are everywhere in the Mu?iz family?s ap...
2,17298,Emma G. Fitzsimmons,2/01/2017,2017,1,business,Finally. The Second Avenue subway opened in Ne...
3,17311,Carl Hulse,3/01/2017,2017,1,business,WASHINGTON ? It?s or time for Republica...
4,17339,Jim Rutenberg,5/01/2017,2017,1,business,"For Megyn Kelly, the shift from Fox News to NB..."


In [51]:
# Column dtypes and non-null counts.
# NOTE(review): 'year' and 'month' are reported as object rather than an integer dtype —
# worth checking those columns for non-numeric values.
news_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       1000 non-null   int64 
 1   author   994 non-null    object
 2   date     1000 non-null   object
 3   year     1000 non-null   object
 4   month    1000 non-null   object
 5   topic    1000 non-null   object
 6   article  1000 non-null   object
dtypes: int64(1), object(6)
memory usage: 54.8+ KB


### 1.2 Handling missing values

In [52]:
# Report how many entries are missing in each column before any cleaning
print("Missing value", news_dataset.isna().sum(), sep="\n")

Missing value
id         0
author     6
date       0
year       0
month      0
topic      0
article    0
dtype: int64


In [53]:
# Replace missing author names with an explicit 'No author' placeholder
news_dataset['author'] = news_dataset['author'].fillna('No author')
# The following cell re-counts missing values to confirm the fill worked

In [54]:
# Re-count the missing entries per column now that 'author' has been filled
print("After handling missing value", news_dataset.isna().sum(), sep="\n")

After handling missing value
id         0
author     0
date       0
year       0
month      0
topic      0
article    0
dtype: int64


In [55]:
# Keep only the first row for each distinct article text, then renumber rows from 0
news_dataset = (
    news_dataset
    .drop_duplicates(subset=['article'], keep='first')
    .reset_index(drop=True)
)

In [56]:
# Preview the frame again after filling authors and dropping duplicate articles
news_dataset.head()

Unnamed: 0,id,author,date,year,month,topic,article
0,17307,Marlise Simons,1/01/2017,2017,1,architecture,PARIS ? When the Islamic State was about to...
1,17292,Andy Newman,31/12/2016,2016,12,art,Angels are everywhere in the Mu?iz family?s ap...
2,17298,Emma G. Fitzsimmons,2/01/2017,2017,1,business,Finally. The Second Avenue subway opened in Ne...
3,17311,Carl Hulse,3/01/2017,2017,1,business,WASHINGTON ? It?s or time for Republica...
4,17339,Jim Rutenberg,5/01/2017,2017,1,business,"For Megyn Kelly, the shift from Fox News to NB..."


### 1.3 Data pre-processing

In [57]:
def pre_process(data):
    """
    Clean, stop-word-filter and lemmatize a collection of article texts.

    For every text the function, in this order:
      1. drops all non-ASCII characters,
      2. collapses each run of whitespace into a single space,
      3. repairs stray '?' characters: a '?' preceded by whitespace is replaced
         (together with that whitespace) by one space, a '?' between two word
         characters becomes an apostrophe, ',?' becomes ',', and runs of '?'
         collapse into a single '?',
      4. removes ' [...]' spans (see the note on the pattern below),
      5. splits on whitespace, drops tokens whose lower-cased form is a stop
         word and lemmatizes the remaining tokens with WordNetLemmatizer.

    Tokens keep any punctuation attached to them (the split is on whitespace only).

    Args:
        data (iterable): texts to process, e.g. a pandas Series of articles.
            None / NaN items are treated as empty text; any other non-string
            item is converted with str().

    Returns:
        list[str]: one processed string per input item, in input order.
    """
    # Stop-word matching below is done on the lower-cased token, so the extra
    # entries must be lower case too (capitalised entries could never match).
    stop_words = set(stopwords.words('english'))
    stop_words.update(["this", "the"])

    # Raw string: keeps the regex escapes (\[ and \d) away from Python's own
    # string-escape handling, which warns about unknown escapes.
    # NOTE(review): the lookahead (?=.*\d) only requires a digit somewhere after
    # the '[' in the text, not necessarily inside the brackets — confirm intent.
    bracket_pattern = re.compile(r" \[(?=.*\d).*?\]")

    lemmatizer = WordNetLemmatizer()

    result = []
    for text in data:
        # Missing values would otherwise crash the character loop below.
        # NaN is detected as a float that is not equal to itself.
        if text is None or (isinstance(text, float) and text != text):
            text = ""
        elif not isinstance(text, str):
            text = str(text)

        # Remove non-ASCII characters
        text = ''.join(char for char in text if ord(char) < 128)

        # Remove multiple spaces
        text = re.sub(r'\s+', ' ', text)

        # Remove question mark problems (see step 3 in the docstring)
        text = re.sub(r'\s\?', ' ', text)
        text = re.sub(r"\b\?\b", "'", text)
        text = re.sub(r",\?", ",", text)
        text = re.sub(r"\?+", "?", text)
        text = text.strip()

        # Drop the bracketed spans, then filter stop words and lemmatize
        text = bracket_pattern.sub("", text)
        kept_tokens = [
            lemmatizer.lemmatize(word)
            for word in text.split()
            if word.lower() not in stop_words
        ]

        result.append(" ".join(kept_tokens))

    return result

In [58]:
# Store the cleaned text next to the original; the 'article' column is left untouched
news_dataset['processed_article'] = pre_process(news_dataset['article'])

In [59]:
# Compare 'article' with the new 'processed_article' column on the first rows
news_dataset.head()

Unnamed: 0,id,author,date,year,month,topic,article,processed_article
0,17307,Marlise Simons,1/01/2017,2017,1,architecture,PARIS ? When the Islamic State was about to...,PARIS Islamic State driven ancient city Palmyr...
1,17292,Andy Newman,31/12/2016,2016,12,art,Angels are everywhere in the Mu?iz family?s ap...,Angels everywhere Mu'iz family's apartment Bro...
2,17298,Emma G. Fitzsimmons,2/01/2017,2017,1,business,Finally. The Second Avenue subway opened in Ne...,Finally. Second Avenue subway opened New York ...
3,17311,Carl Hulse,3/01/2017,2017,1,business,WASHINGTON ? It?s or time for Republica...,WASHINGTON time Republicans. tumultuous decade...
4,17339,Jim Rutenberg,5/01/2017,2017,1,business,"For Megyn Kelly, the shift from Fox News to NB...","Megyn Kelly, shift Fox News NBC host daily day..."


In [14]:
class NamedEntityRecognition:
    """
    Thin wrapper around a spaCy pipeline that collects named entities per document id.

    Attributes are set in __init__:
      - ner_dict: maps a document id to a list of {"text", "label"} entity dicts;
        it is filled (and accumulated across calls) by get_ner.
      - nlp: the spaCy "en_core_web_sm" pipeline used for entity extraction.
    """

    def __init__(self):
        self.ner_dict = {}
        self.nlp = spacy.load("en_core_web_sm")

    def example(self, article):
        """
        Display named entities in the given document using displacy.

        Args:
        article (str): Text to run through the pipeline and render.
        """
        doc = self.nlp(article)
        displacy.render(doc, style="ent", jupyter=True)

    def get_ner(self, data):
        """
        Extract named entities from the given data and store them in a dictionary.

        Rows are read by position, so the DataFrame may carry any index
        (it does not have to be 0..n-1).

        The first time an id is seen, every entity of its document is stored,
        repeated mentions included. If the same id appears again, only
        (text, label) pairs not already stored for that id are appended.

        Args:
        data (DataFrame): DataFrame containing 'id' and 'processed_article' columns.

        Returns:
        dict: A dictionary containing named entities for each document
              (the same object as self.ner_dict).
        """
        ids = data['id']
        articles = data['processed_article']

        for row in range(len(data)):
            # .iloc = positional access; plain [row] would look up the index *label*
            doc_id = ids.iloc[row]
            doc = self.nlp(str(articles.iloc[row]))
            entities = [{"text": ent.text.strip(), "label": ent.label_} for ent in doc.ents]

            if doc_id not in self.ner_dict:
                self.ner_dict[doc_id] = entities
            else:
                # Snapshot of what is already stored; entities new to this id are appended
                seen = {(entity['text'], entity['label']) for entity in self.ner_dict[doc_id]}
                self.ner_dict[doc_id].extend(
                    entity for entity in entities
                    if (entity['text'], entity['label']) not in seen
                )
        return self.ner_dict

In [15]:
# Initializing NamedEntityRecognition object (this loads the spaCy "en_core_web_sm" model)
ner = NamedEntityRecognition()

# Process example document and display named entities.
# Only the first 800 characters of the article with index label 1 are rendered.
ner.example(news_dataset['processed_article'][1][:800])

In [16]:
# Run entity extraction over every processed article; the result maps article id -> list of entities
article_ner_dict = ner.get_ner(news_dataset)

In [17]:
# Inspect the extracted entities.
# NOTE(review): this displays the whole dictionary (one entry per article) — consider showing a small slice instead.
article_ner_dict

{17307: [{'text': 'PARIS Islamic State', 'label': 'ORG'},
  {'text': 'March', 'label': 'DATE'},
  {'text': 'Yves Ubelmann', 'label': 'PERSON'},
  {'text': 'Syria', 'label': 'GPE'},
  {'text': 'Ubelmann', 'label': 'PERSON'},
  {'text': '36', 'label': 'DATE'},
  {'text': 'Syria', 'label': 'GPE'},
  {'text': 'Paris', 'label': 'GPE'},
  {'text': 'Islamists', 'label': 'NORP'},
  {'text': 'Houmam Saad', 'label': 'PERSON'},
  {'text': 'Syrian', 'label': 'NORP'},
  {'text': 'four day', 'label': 'DATE'},
  {'text': 'four', 'label': 'CARDINAL'},
  {'text': 'six', 'label': 'CARDINAL'},
  {'text': 'Ubelmann', 'label': 'PERSON'},
  {'text': 'Iconem', 'label': 'GPE'},
  {'text': 'Ubelmann', 'label': 'PERSON'},
  {'text': 'today', 'label': 'DATE'},
  {'text': 'Syria', 'label': 'GPE'},
  {'text': 'Iraq', 'label': 'GPE'},
  {'text': 'Islamic', 'label': 'NORP'},
  {'text': 'Ubelmann', 'label': 'PERSON'},
  {'text': 'Grand Palais Paris', 'label': 'FAC'},
  {'text': 'Jan. 9', 'label': 'DATE'},
  {'text': 

## A. Tasks as specified for your team structure

**One heading for each task.**

## B. References

## C. Appendix