# EDA for the All news data set

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [None]:
import os
import sys

# Add ../src to Python path.
# NOTE: '__file__' is quoted, i.e. a plain string rather than the module
# attribute, so os.path.dirname() returns '' and '../src' is resolved
# against the current working directory.
project_root = os.path.abspath(os.path.join(os.path.dirname('__file__'), '../src'))
# Prepend so that modules under ../src are searched before other sys.path entries
sys.path.insert(0, project_root)

# Import the parse config function to parse the .toml file
from utils.config_tool import parse_config

In [None]:
# Parse the project configuration, then read the example news CSV it points to
config_file = "../config/predict_stock_w_news.toml"
config = parse_config(config_file)

# Path layout: <local_data_path>/data_raw/<all_news_eg1_file>
df = pd.read_csv(
    os.path.join(
        config['info']['local_data_path'],
        'data_raw',
        config['news_ingestion']['input']["all_news_eg1_file"],
    )
)

df.head()

In [None]:
# Count the number of news articles per year
df['year'].value_counts()

In [None]:
# Plot the number of news articles per year.
# The counts are computed once and reused for both the x positions and the bar heights.
year_counts = df['year'].value_counts()

fig, ax = plt.subplots(figsize=(8, 8))
ax.bar(year_counts.index, year_counts.values)
ax.set(title='Number of articles per year', xlabel='Year', ylabel='Number of articles')

plt.show()

### In the first 10,000 rows of the file, the articles are almost evenly distributed over the years

#### Now looking at the specific time count:

In [None]:
# Convert the date column to datetime
df['date'] = pd.to_datetime(df['date'])
df['date'].head()

In [None]:
# Plot the number of articles for each distinct value of the date column.
# The counts are computed once and reused for both axes.
date_counts = df['date'].value_counts()

fig, ax = plt.subplots(figsize=(20, 8))
ax.bar(date_counts.index, date_counts.values)
ax.set(title='Number of articles per date', xlabel='Date', ylabel='Number of articles')

plt.show()

### Note that there are still labels

In [None]:
# Count the number of news articles per section
df['section'].value_counts()

In [None]:
# Horizontal bar chart of the number of articles per section.
# The counts are computed once and reused for the labels and the bar lengths.
section_counts = df['section'].value_counts()

fig, ax = plt.subplots(figsize=(40, 40))
sns.barplot(y=section_counts.index, x=section_counts.values, ax=ax)
ax.set(title='Number of articles per section', xlabel='Number of articles', ylabel='Section')

plt.yticks(rotation = 0, fontsize = 10)

plt.show()

### But there are still some NaN labels. Note that NaN is not of string type; it is of float type

In [None]:
# Look at the type of the NaN values in the section column.
# Pick an entry that is actually missing instead of assuming row 0 is one.
missing_sections = df.loc[df['section'].isna(), 'section']
if missing_sections.empty:
    print("No missing values in 'section'")
else:
    nan_value = missing_sections.iloc[0]
    print(nan_value, type(nan_value))


In [None]:
# Find the number of NaN values in the section column.
# isna().sum() counts the missing entries directly; filtering the frame and
# calling count() reports non-null counts per column instead (0 for 'section').
df['section'].isna().sum()

### Let's look at the publishers

In [None]:
# Describe the publisher column
print(df['publication'].value_counts())
print(df['publication'].value_counts().sum())
print(df['publication'].isna().sum())

In [None]:
# Pie chart of the share of articles per publication.
# The counts are computed once and reused for the wedge sizes and the labels.
publication_counts = df['publication'].value_counts()

fig, ax = plt.subplots(figsize=(20, 20))
ax.pie(publication_counts.values, autopct='%2.0f%%',
       labels=publication_counts.index)
ax.set_title('Share of articles per publication')

plt.show()

### This distribution differs from that of the whole data set, as given in the description of the whole data set.

### Preprocessing pipeline test

### 1: Drop the index, author, link (url) and year/month/day columns

In [None]:
columns_to_drop = ['Unnamed: 0', 'author', 'year', 'month', 'day', 'url']

# errors='ignore' keeps this cell re-runnable: on a second run the columns are
# already gone and a plain drop would raise KeyError.
df = df.drop(columns=columns_to_drop, errors='ignore')
df.head()

### 2: Select the publisher

In [None]:
## This is just an example, I don't need to do it now

publisher = {'Vox'}
print(type(publisher))
df[df['publication'].isin(publisher)]

### 3: Remove the null

In [None]:
# Find the null values in the dataframe
df.isna().sum()

So there are no null values in title and article. It is OK for author and section to be null.

In [None]:
df.isnull().sum()

In [None]:
#df[df['author'].isna()]

These also seem to be reasonable data.

In [None]:
df[df['title'] == '']

In [None]:
#df[df['article'] == '']

In [None]:
df.dropna(subset=['article'], inplace=True)
df.isnull().sum()

For now, there are no empty articles.

### 4: Add a word-count column:

In [None]:
# Word count of the articles
df['word_count'] = df['article'].apply(lambda x: len(x.split()))
df['word_count'].describe()

In [None]:
# sns.displot is a figure-level function: it creates its own figure, so a
# preceding plt.figure() only leaves an empty extra figure behind.
# The figure size is therefore set through `height` (inches, aspect defaults to 1).
sns.displot(df['word_count'], height=8)

plt.show()

In [None]:
# Show the articles with at most 10 words.
# Reuses the word_count column instead of re-splitting every article.
print(df.loc[df['word_count'] <= 10, 'article'])

Note that the data contains some advertisements and one-word news items, which are not well suited to the clustering task.

### 5: Get the first 100 words:

In [None]:
def get_first_words(article_text, n_words=100):
    """Return the first ``n_words`` whitespace-separated words of a text.

    Args:
        article_text: Text to truncate.
        n_words: Maximum number of words to keep. Defaults to 100, the value
            that was previously hard-coded.

    Returns:
        The kept words joined by single spaces. A text with fewer than
        ``n_words`` words is returned in full, with whitespace runs collapsed.

    Raises:
        ValueError: If ``n_words`` is negative.
    """
    if n_words < 0:
        raise ValueError("n_words must be non-negative")
    # split() without a separator splits on any whitespace run and drops empty strings
    return ' '.join(article_text.split()[:n_words])

In [None]:
df['summary'] = df['article'].apply(get_first_words)
df.head()

In [None]:
# Use positional access (.iloc): after dropna the index label 0 is not
# guaranteed to exist, so label-based [0] could raise KeyError.
artical_0 = df['article'].iloc[0]
print(artical_0)
# Word count of the first summary
print(len(df['summary'].iloc[0].split()))

### 6: Text Tokenization

In [None]:
import nltk
from nltk import word_tokenize

# Download 'punkt' if you haven't already
nltk.download('punkt')

print(nltk.data.path)


In [None]:
import os

import nltk
from nltk import word_tokenize

# Keep NLTK data under the current user's home directory instead of a
# hard-coded absolute path that only exists on one machine.
nltk_data_dir = os.path.join(os.path.expanduser('~'), 'nltk_data')

# Register the directory only once so re-running the cell does not grow the search path
if nltk_data_dir not in nltk.data.path:
    nltk.data.path.append(nltk_data_dir)

# Download 'punkt' and 'punkt_tab' if you haven't already
nltk.download('punkt', download_dir=nltk_data_dir)
nltk.download('punkt_tab', download_dir=nltk_data_dir)

# Verify the NLTK data path
print(nltk.data.path)

# Verify the 'punkt' tokenizer files
print(os.listdir(os.path.join(nltk_data_dir, 'tokenizers', 'punkt')))

# Load the 'punkt' tokenizer explicitly
nltk.data.load('tokenizers/punkt/english.pickle')



In [None]:
# Compare the word_tokenizer with the split method.
# .iloc[0] selects the first row by position; label-based [0] fails if row 0 was dropped.
first_summary = df['summary'].iloc[0]

text = word_tokenize(first_summary)
print(len(text), text)

whitespace_tokens = first_summary.split()
print(len(whitespace_tokens), whitespace_tokens)

.split() and word_tokenize give basically the same result, but word_tokenize is better. The only problem is that word_tokenize keeps punctuation marks as tokens, so we had better remove them.

In [None]:
def get_tokenized_words_with_no_punctuation(text):
    """Tokenize ``text`` and return its alphanumeric tokens in lower case.

    The text is split with ``word_tokenize`` (English). Only tokens for which
    ``str.isalnum()`` is true are kept, so punctuation tokens and any token
    containing a non-alphanumeric character are dropped.
    """
    kept_tokens = []
    for token in word_tokenize(text, language="english"):
        if token.isalnum():
            kept_tokens.append(token.lower())
    return kept_tokens

In [None]:
# Tokenize the first summary once (positional .iloc[0]) and print its length and tokens
first_summary_tokens = get_tokenized_words_with_no_punctuation(df['summary'].iloc[0])
print(len(first_summary_tokens), first_summary_tokens)

### 7: Deleting the stop words

In [None]:
import nltk
from nltk.corpus import stopwords

# The stopwords corpus is a separate NLTK download; without it
# stopwords.words() raises LookupError on a fresh environment.
nltk.download('stopwords')

# A set gives constant-time membership tests when filtering tokens
stop_words = set(stopwords.words('english'))
 

In [None]:
def remove_stop_words(words):
    """Return the items of ``words`` that are not in the global ``stop_words`` set.

    Order and duplicates of the remaining words are preserved.
    """
    remaining = []
    for candidate in words:
        if candidate in stop_words:
            continue
        remaining.append(candidate)
    return remaining

In [None]:
# Tokenize and filter the first summary once (positional .iloc[0]), then print length and tokens
first_summary_content_words = remove_stop_words(
    get_tokenized_words_with_no_punctuation(df['summary'].iloc[0])
)
print(len(first_summary_content_words), first_summary_content_words)

### 8: Lemmatization

In [None]:
import nltk
from nltk.stem import WordNetLemmatizer

# WordNetLemmatizer looks words up in the WordNet corpus, which is a separate
# NLTK download; without it lemmatize() raises LookupError on a fresh environment.
nltk.download('wordnet')

In [None]:
def lemmatize_words(article_text):
    """Tokenize ``article_text`` and return the lemma of each token as a list.

    Tokens come from ``get_tokenized_words_with_no_punctuation``; each one is
    passed to ``WordNetLemmatizer.lemmatize`` with its default arguments.

    NOTE(review): in this notebook stop words are removed only after this
    step; lemmatizing first may alter some stop words so that they no longer
    match the stop-word list. TODO confirm and consider filtering first.
    """
    tokens = get_tokenized_words_with_no_punctuation(article_text)
    wordnet_lemmatizer = WordNetLemmatizer()
    return list(map(wordnet_lemmatizer.lemmatize, tokens))

In [None]:
df['tokenized_words'] = df['article'].apply(lemmatize_words)

In [None]:
df.head()

In [None]:
df['tokenized_words'] = df['tokenized_words'].apply(remove_stop_words)

In [None]:
df.head()

In [None]:
# The raw text is no longer needed once it has been tokenized.
# errors='ignore' keeps the cell re-runnable after the column is already gone.
df = df.drop(columns=['article'], errors='ignore')
df.head()

In [None]:
df['tokenized_words'].apply(len).describe()

### 9: NER transformation

In [None]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [None]:
# Use spacy to extract named entities
def extract_ner_features(text):
    """Count the named entities of selected types that spaCy finds in ``text``.

    The text is processed with the global ``nlp`` pipeline. Returns a
    ``pd.Series`` indexed by PERSON, ORG, GPE, EVENT and PRODUCT (in that
    order) whose values are the number of entities with each label; labels
    outside this list are ignored.
    """
    tracked_labels = ("PERSON", "ORG", "GPE", "EVENT", "PRODUCT")
    # Start every tracked label at zero so absent labels still appear in the result
    tallies = dict.fromkeys(tracked_labels, 0)
    for entity in nlp(text).ents:
        label = entity.label_
        if label in tallies:
            tallies[label] += 1
    return pd.Series(tallies)


In [None]:
# Join the extracted features to the dataframe

# Each row's token list is re-joined into one space-separated string and passed
# to extract_ner_features; A holds the per-row entity counts.
# NOTE(review): these tokens were lowercased and stripped of punctuation and
# stop words earlier in the notebook, which presumably degrades entity
# recognition — consider running NER on less processed text; TODO confirm.
A = df['tokenized_words'].apply(lambda x: extract_ner_features(' '.join(x)))
print(A.head())


In [None]:
# A is already a DataFrame (Series.apply with a Series-returning function
# expands the result into columns), so it can be joined on the index directly;
# the extra A.apply(pd.Series) was a redundant pass over every column.
df_new = df.join(A)

In [None]:
df_new.head()