# Assignment 2 
### Kusal Bista

In [2]:
# Libraries for reading data
import random
import numpy as np
import pandas as pd 
import glob
import json
from tqdm import tqdm

# Libraries for pre-processing
import re
import nltk

from nltk import tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

# Libraries for information retrieval
import spacy
from spacy import displacy
from spacy.lang.en import English
import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Libraries for data analysis
import matplotlib.pyplot as plt
from tabulate import tabulate

In [3]:
# !pip install spacy
# !python -m spacy download en_core_web_sm
# !python -m spacy download en_core_web_md
# !pip install tabulate
# !pip install nltk

In [4]:
# Fetch the NLTK data packages this notebook relies on. `stopwords` backs
# stopwords.words('english') and `wordnet` backs WordNetLemmatizer, both used
# in pre_process. `punkt` is presumably for word_tokenize and `omw-1.4` is an
# additional WordNet data package -- TODO confirm both are still required.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\a1881044\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\a1881044\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\a1881044\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\a1881044\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

## 1 Reading dataset and pre-processing

### 1.1 Reading the dataset

In [5]:
# Load the news articles, decoding the file as latin-1. The article previews
# shown below contain literal '?' where punctuation/accents were lost
# (e.g. "Mu?iz", "family?s"); pre_process repairs those later.
news_dataset = pd.read_csv('news_dataset.csv', encoding='latin-1')

In [6]:
# Preview the first rows to sanity-check the loaded columns.
news_dataset.head()

Unnamed: 0,id,author,date,year,month,topic,article
0,17307,Marlise Simons,1/01/2017,2017,1,architecture,PARIS ? When the Islamic State was about to...
1,17292,Andy Newman,31/12/2016,2016,12,art,Angels are everywhere in the Mu?iz family?s ap...
2,17298,Emma G. Fitzsimmons,2/01/2017,2017,1,business,Finally. The Second Avenue subway opened in Ne...
3,17311,Carl Hulse,3/01/2017,2017,1,business,WASHINGTON ? It?s or time for Republica...
4,17339,Jim Rutenberg,5/01/2017,2017,1,business,"For Megyn Kelly, the shift from Fox News to NB..."


In [8]:
# Column dtypes and non-null counts.
# NOTE(review): `year` and `month` are reported as object rather than integer
# in the output below -- presumably some rows hold non-numeric values; confirm
# before using these columns numerically.
news_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       1000 non-null   int64 
 1   author   994 non-null    object
 2   date     1000 non-null   object
 3   year     1000 non-null   object
 4   month    1000 non-null   object
 5   topic    1000 non-null   object
 6   article  1000 non-null   object
dtypes: int64(1), object(6)
memory usage: 54.8+ KB


### 1.2 Handling missing values

In [9]:
# Heading followed by the per-column count of null entries.
print("Missing value", news_dataset.isna().sum(), sep="\n")

Missing value
id         0
author     6
date       0
year       0
month      0
topic      0
article    0
dtype: int64


In [10]:
# Handling missing value
news_dataset['author'] = news_dataset['author'].fillna('No author')
# checking missing value after handling missing value

In [11]:
# Same per-column null count as before, now taken after the author fill.
print("After handling missing value", news_dataset.isna().sum(), sep="\n")

After handling missing value
id         0
author     0
date       0
year       0
month      0
topic      0
article    0
dtype: int64


### 1.3 Data pre-processing

In [12]:
def pre_process(data, stop_words=None, lemmatizer=None):
    """Clean, stop-word-filter and lemmatize a collection of raw texts.

    Each text is processed independently:

    1. Non-ASCII characters are dropped and every whitespace run is collapsed
       to a single space.
    2. Stray ``?`` characters are repaired: a ``?`` preceded by whitespace is
       removed, a ``?`` between two word characters becomes an apostrophe
       (``family?s`` -> ``family's``), ``,?`` becomes ``,`` and runs of ``?``
       collapse to a single ``?``.
    3. Space-prefixed square-bracket groups that contain a digit
       (e.g. `` [12]``) are removed; brackets without a digit are kept.
    4. The text is split on whitespace, tokens whose lower-cased form is a
       stop word are dropped, and the remaining tokens are lemmatized and
       re-joined with single spaces.

    Tokens are split on whitespace only, so punctuation stays attached to the
    word it follows (``"Finally."`` is looked up and lemmatized as-is).

    Parameters
    ----------
    data : iterable of str
        Raw texts, e.g. a pandas Series of articles. Entries that are not
        strings (such as None/NaN from a missing cell) yield an empty string
        so the result stays aligned with the input.
    stop_words : iterable of str, optional
        Words to drop; matching is case-insensitive. Defaults to NLTK's
        English stop-word list.
    lemmatizer : object, optional
        Any object exposing ``lemmatize(word) -> str``. Defaults to a new
        ``WordNetLemmatizer``.

    Returns
    -------
    list of str
        One processed string per input entry, in input order.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    # Tokens are lower-cased before the membership test, so normalise the stop
    # words the same way; mixed-case entries would otherwise never match.
    stop_words = {word.lower() for word in stop_words}

    if lemmatizer is None:
        lemmatizer = WordNetLemmatizer()

    # Patterns are loop-invariant, so build them once.
    whitespace_run = re.compile(r"\s+")
    spaced_qmark = re.compile(r"\s\?")
    inner_qmark = re.compile(r"\b\?\b")
    comma_qmark = re.compile(r",\?")
    qmark_run = re.compile(r"\?+")
    # The digit must sit *inside* the brackets: [^\]]* cannot run past the
    # closing bracket, unlike an unbounded (?=.*\d) lookahead which is
    # satisfied by any digit later in the text.
    bracketed_number = re.compile(r" \[[^\]]*\d[^\]]*\]")

    result = []
    for text in data:
        if not isinstance(text, str):
            # Missing cell: keep the output the same length as the input.
            result.append("")
            continue

        # Remove non-ASCII characters.
        text = text.encode("ascii", errors="ignore").decode("ascii")

        # Collapse whitespace, then repair the '?' artefacts. The order
        # matters: the whitespace-prefixed '?' must go before runs collapse.
        text = whitespace_run.sub(" ", text)
        text = spaced_qmark.sub(" ", text)
        text = inner_qmark.sub("'", text)
        text = comma_qmark.sub(",", text)
        text = qmark_run.sub("?", text)
        text = bracketed_number.sub("", text.strip())

        # split() with no argument also absorbs any doubled spaces left by
        # the substitutions above.
        tokens = [
            lemmatizer.lemmatize(token)
            for token in text.split()
            if token.lower() not in stop_words
        ]
        result.append(" ".join(tokens))

    return result

In [13]:
# Store the cleaned text next to the raw text. pre_process returns one string
# per input article, in order, so the list lines up with the frame's rows.
news_dataset['processed_article'] = pre_process(news_dataset['article'])

In [14]:
# Compare `article` with the new `processed_article` column on the first rows.
news_dataset.head()

Unnamed: 0,id,author,date,year,month,topic,article,processed_article
0,17307,Marlise Simons,1/01/2017,2017,1,architecture,PARIS ? When the Islamic State was about to...,PARIS Islamic State driven ancient city Palmyr...
1,17292,Andy Newman,31/12/2016,2016,12,art,Angels are everywhere in the Mu?iz family?s ap...,Angels everywhere Mu'iz family's apartment Bro...
2,17298,Emma G. Fitzsimmons,2/01/2017,2017,1,business,Finally. The Second Avenue subway opened in Ne...,Finally. Second Avenue subway opened New York ...
3,17311,Carl Hulse,3/01/2017,2017,1,business,WASHINGTON ? It?s or time for Republica...,WASHINGTON time Republicans. tumultuous decade...
4,17339,Jim Rutenberg,5/01/2017,2017,1,business,"For Megyn Kelly, the shift from Fox News to NB...","Megyn Kelly, shift Fox News NBC host daily day..."


In [1]:
import json
from pprint import pprint
import nltk
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import _pickle as pkl