In [3]:
import nltk
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import RegexpTokenizer
import json
import langid


In [5]:
# 1. Data Collection --------------------------------------------------------------------------------

# Read the whole JSON document into memory; the file is opened as UTF-8.
with open('Science_Technology_News.json', encoding='utf-8') as f:
    data = json.load(f)

# Main dataframe: json_normalize flattens the loaded records into columns.
df = pd.json_normalize(data)

# Show the main dataframe structure. DataFrame.info() writes its report to
# stdout itself and returns None, so wrapping it in print() only appended a
# stray "None" line after the report.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5437 entries, 0 to 5436
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   title             5437 non-null   object
 1   link              5437 non-null   object
 2   keywords          2502 non-null   object
 3   creator           2610 non-null   object
 4   video_url         0 non-null      object
 5   description       4936 non-null   object
 6   content           1627 non-null   object
 7   pubDate           5437 non-null   object
 8   full_description  2670 non-null   object
 9   image_url         2169 non-null   object
 10  source_id         5437 non-null   object
dtypes: object(11)
memory usage: 467.4+ KB
None


In [6]:
# 2. Information Extraction -----------------------------------------------------------------

# Narrow the frame to the four fields of interest and take an independent
# copy, so that later edits never write back into the frame built above.
df = df.loc[:, ['title', 'pubDate', 'creator', 'content']].copy()

# Filtering out non-English articles ------------------------------------------
def englishfilter(text):
    """Return the language code that langid assigns to ``text``.

    Parameters
    ----------
    text : object
        Article body. Anything that is not a non-blank string (None, NaN,
        "", whitespace only, or any other type) is treated as having no
        detectable language.

    Returns
    -------
    str
        The first element of ``langid.classify(text)``, or ``"other"`` when
        there is no text to classify.
    """
    # Blank strings give the classifier nothing to work with, and non-string
    # values (e.g. None/NaN for a missing field) cannot be classified at all,
    # so neither is allowed to count as English.
    if not isinstance(text, str) or not text.strip():
        return "other"
    return langid.classify(text)[0]
# Classify every article body once and keep only the rows labelled 'en'.
# The labels live in a local boolean mask instead of a temporary 'language'
# column, and the filtered frame is an explicit copy: modifying a filtered
# slice in place (as the previous inplace drop did) is the pattern pandas
# reports with SettingWithCopyWarning.
is_english = df['content'].apply(englishfilter) == 'en'
df = df[is_english].copy()
#-----------------------------------------------------------------------------

# Preprocessing
# The English stopword list is a separately installed NLTK corpus. NLTK raises
# LookupError when it is missing, so fetch it on demand to keep the notebook
# runnable on a fresh environment.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords', quiet=True)
    stop_words = set(stopwords.words('english'))
stemmer = SnowballStemmer('english')
tokenizer = RegexpTokenizer(r'\w+')  # tokens are runs of word characters

def preprocess(text):
    """Normalise an article body into a space-joined string of stems.

    The text is lower-cased, split with the module-level ``tokenizer``,
    stripped of tokens found in ``stop_words`` and stemmed with ``stemmer``.

    Parameters
    ----------
    text : object
        Raw article text. Empty strings and non-string values (None, NaN,
        numbers) produce an empty result instead of raising.

    Returns
    -------
    str
        Stemmed tokens joined by single spaces, or ``""`` when there is
        nothing to process.
    """
    # NaN is a truthy float, so a bare truthiness test would let it reach
    # .lower() and fail with AttributeError; require a real non-empty string.
    if not isinstance(text, str) or not text:
        return ""
    tokens = tokenizer.tokenize(text.lower())
    # Filter and stem in a single pass over the tokens.
    return " ".join(stemmer.stem(token) for token in tokens if token not in stop_words)

# Apply preprocessing to the content field. assign() builds a new frame
# instead of writing a column into df, which at this point is the result of a
# boolean filter; setting a column on such a slice is what pandas reports
# with SettingWithCopyWarning.
df = df.assign(processed_content=df['content'].apply(preprocess))


In [8]:
# Show the main dataframe structure after preprocessing. DataFrame.info()
# prints its report itself and returns None, so print() around it only added
# a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5437 entries, 0 to 5436
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   title              5437 non-null   object
 1   pubDate            5437 non-null   object
 2   creator            2610 non-null   object
 3   content            1627 non-null   object
 4   processed_content  5437 non-null   object
dtypes: object(5)
memory usage: 212.5+ KB
None


In [9]:
# Preview the first five rows of the stemmed text column
print(df.loc[:, 'processed_content'].head(5))

0                                                     
1    uno smartphon pieghevol come xiaomi mix fold a...
2                                                     
3    mától magyar erst bank kártyáiv elérhető mobil...
4                                                     
Name: processed_content, dtype: object
