# The Text: Introductory

### Installing

In [None]:
% pip install dateparser feedparser==6.0.8 requests==2.26.0 bs4==0.0.1 numpy==1.21.4 pandas==1.3.5 networkx==2.6.3 matplotlib==3.5.1 nltk==3.6.5 scikit-learn==1.0.1 tensorflow==2.7.0 spacy==3.2.1 langdetect deep_translator
import nltk; nltk.download('stopwords'); nltk.download('punkt')
!python -m spacy download en_core_web_lg
!python -m spacy download en_core_web_sm

### Connect Drive

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

### Settings

In [None]:
# Prefix prepended to dataset paths (see read_sentiment_data). With the empty string the paths
# are resolved relative to the kernel's working directory.
current_directory = ''

### Importing

In [None]:
from dateparser import parse as parse_date
import feedparser
import urllib
import requests
import re
import copy
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import matplotlib as mpl
from nltk import corpus, tokenize
from sklearn import feature_extraction, metrics, model_selection, preprocessing, cluster
import tensorflow as tf
import spacy
from string import punctuation
from langdetect import detect
from deep_translator import GoogleTranslator

In [None]:
def log_progress(sequence, every=None, size=None, name='Items'):
    """Yield the items of ``sequence`` while updating an ipywidgets progress bar.

    :param sequence: iterable to walk through; if it has no ``len()`` and
        ``size`` is not given, the total is treated as unknown
    :param int every: refresh the widgets on the first item and then every
        ``every``-th item; derived from ``size`` when omitted, mandatory when
        the total is unknown
    :param int size: total number of items, when known by the caller
    :param str name: label shown in front of the counter
    :return: generator yielding the records of ``sequence`` unchanged
    """
    from ipywidgets import IntProgress, HTML, VBox
    from IPython.display import display

    unknown_total = False
    if size is None:
        try:
            size = len(sequence)
        except TypeError:
            unknown_total = True

    if size is None:
        assert every is not None, 'sequence is iterator, set every'
    elif every is None:
        # Small inputs refresh on every item, larger ones roughly every 0.5%.
        every = 1 if size <= 200 else int(size / 200)

    if unknown_total:
        # No total available: show a full bar in "info" style instead of a ratio.
        bar = IntProgress(min=0, max=1, value=1)
        bar.bar_style = 'info'
    else:
        bar = IntProgress(min=0, max=size, value=0)
    caption = HTML()
    display(VBox(children=[caption, bar]))

    count = 0
    try:
        for count, item in enumerate(sequence, 1):
            refresh_now = count == 1 or count % every == 0
            if refresh_now and unknown_total:
                caption.value = '{name}: {index} / ?'.format(name=name, index=count)
            elif refresh_now:
                bar.value = count
                caption.value = u'{name}: {index} / {size}'.format(name=name, index=count, size=size)
            yield item
    except:
        # Any failure (including the consumer abandoning the generator) turns the bar red.
        bar.bar_style = 'danger'
        raise
    else:
        bar.bar_style = 'success'
        bar.value = count
        caption.value = "{name}: {index}".format(name=name, index=str(count or '?'))

### Scraper Class

In [None]:
class Scraper:
    """Small client for the Google News RSS feeds (``https://news.google.com/rss``).

    The public methods (``get_news``, ``get_news_by_topics``, ``search``) all
    return a dictionary of parallel lists with the keys ``titles``,
    ``publishers``, ``published_times``, ``summaries`` and ``links``.
    """

    def __init__(self, language='en', country='MY'):
        """
            :param str language: news language
            :param str country: two-letter country code, example: 'MY', 'US'
        """
        self.lang = language.lower()
        self.country = country.upper()
        self.BASE_URL = 'https://news.google.com/rss'

    def __news_parser(self, text):
        """
            Collect the ``<li>`` items of an entry summary as sub-articles.

            :param str text: HTML summary of a feed entry
            :return: list of dicts with 'url', 'title' and 'publisher'; the raw
                     ``text`` is returned unchanged if it cannot be processed
        """
        try:
            bs4_html = BeautifulSoup(text, 'html.parser')
            sub_articles = []
            for li in bs4_html.find_all('li'):
                try:
                    sub_articles.append({'url': li.a['href'],
                                         'title': li.a.text,
                                         'publisher': li.font.text})
                except (AttributeError, KeyError, TypeError):
                    # Item without a link or without a publisher tag: skip it.
                    continue
            return sub_articles
        except Exception:
            # Best effort: fall back to the unparsed summary.
            return text

    def __ceid(self):
        """Build the locale query string (ceid / hl / gl) for this scraper."""
        return '?ceid={}:{}&hl={}&gl={}'.format(self.country, self.lang, self.lang, self.country)

    def __add_sub_articles(self, entries):
        """Attach a 'sub_articles' value to every entry (``None`` when it has no summary)."""
        for entry in entries:
            if 'summary' in entry.keys():
                entry['sub_articles'] = self.__news_parser(entry['summary'])
            else:
                entry['sub_articles'] = None

        return entries

    def __parse_feed(self, feed_url, proxies=None):
        """
            Download and parse one feed.

            :param str feed_url: full feed URL
            :param dict proxies: optional proxies mapping handed to ``requests``
            :return dict: the 'feed' and 'entries' parts of the parsed feed
            :raises Exception: when the request ends up on the 'unsupported' page
        """
        # One request only, so that the proxied response is the one actually parsed.
        r = requests.get(feed_url, proxies=proxies or None)

        if 'https://news.google.com/rss/unsupported' in r.url:
            raise Exception('This feed is not available')

        d = feedparser.parse(r.text)

        # Without proxies, let feedparser fetch the URL itself as a second attempt.
        if not proxies and len(d['entries']) == 0:
            d = feedparser.parse(feed_url)

        return dict((k, d[k]) for k in ('feed', 'entries'))

    def __search_helper(self, query):
        """URL-encode a search query."""
        return urllib.parse.quote_plus(query)

    def __from_to_helper(self, validate=None):
        """
            Normalise a free-form date to 'YYYY-MM-DD'.

            :raises Exception: when the date cannot be parsed
        """
        try:
            return str(parse_date(validate).strftime('%Y-%m-%d'))
        except Exception as err:
            raise Exception('Could not parse your date') from err

    def __extract_summary(self, text: str):
        """
            Pull the anchor texts out of an entry summary.

            :param str text: HTML summary of a feed entry (may be None)
            :return str: anchor texts joined with '. '; '' when there is none.
                         With several anchors the last one is left out, as in
                         the original implementation.
        """
        if not text:
            return ''

        marker = 'target="_blank">'
        parts = text.split(marker)

        if len(parts) > 2:
            result = [part.split('</a>')[0] for part in parts[:-1] if '</a>' in part]
        elif len(parts) == 2 and '</a>' in text:
            result = [parts[1].split('</a>')[0]]
        else:
            result = []

        # Always a string, whatever the number of anchors found.
        return '. '.join(result)

    def __clean_news(self, r, n: int, show: bool):
        """
            Reduce a parsed feed to parallel lists of the fields of interest.

            :param dict r: parsed feed holding an 'entries' list
            :param int n: maximum number of entries to keep
            :param bool show: print every kept entry
            :return dict: 'titles', 'publishers', 'published_times', 'summaries', 'links'
        """
        entries = r.get('entries') or []
        # min() also covers n == len(entries), which previously yielded no entry at all.
        required = max(0, min(n, len(entries)))

        titles, publishers, published_times, summaries, links = [], [], [], [], []

        for entry in entries[:required]:
            # The title is split on its last '-': left part is the headline, right part the publisher.
            title_parts = (entry.get('title') or '').rsplit('-', 1)
            title = title_parts[0].strip()
            publisher = title_parts[-1].strip()
            published_time = entry.get('published')
            summary = self.__extract_summary(entry.get('summary'))
            link = entry.get('link')

            titles.append(title)
            publishers.append(publisher)
            published_times.append(published_time)
            summaries.append(summary)
            links.append(link)

            if show:
                print('Title: ', title)
                print('Publisher: ', publisher)
                print('Published Time: ', published_time)
                print('Summary: ', summary)
                print('Link: ', link)
                print('\n')

        return {'titles': titles,
                'publishers': publishers,
                'published_times': published_times,
                'summaries': summaries,
                'links': links}

    def get_news(self, nums: int, show: bool, proxies=None):
        """
            :param int nums: number of news to retrieve
            :param bool show: print searched results
            :param dict proxies: optional proxies mapping handed to ``requests``
            :return dict d: dictionary of curated results

        """
        d = self.__parse_feed(self.BASE_URL + self.__ceid(), proxies=proxies)
        d['entries'] = self.__add_sub_articles(d['entries'])
        return self.__clean_news(d, nums, show)

    def get_news_by_topics(self, topic: str, nums: int, show: bool, proxies=None):
        """
            :param str topic: news topic to query
            :param int nums: number of news to retrieve
            :param bool show: print searched results
            :param dict proxies: optional proxies mapping handed to ``requests``
            :return dict d: dictionary of curated results
            :raises Exception: when the feed has no entries ('unsupported topic')

        """
        section_topics = ['WORLD', 'NATION', 'BUSINESS', 'TECHNOLOGY', 'ENTERTAINMENT', 'SCIENCE', 'SPORTS', 'HEALTH']
        if topic.upper() in section_topics:
            path = '/headlines/section/topic/{}'.format(topic.upper())
        else:
            # Anything else is used verbatim as a topic identifier.
            path = '/topics/{}'.format(topic)

        d = self.__parse_feed(self.BASE_URL + path + self.__ceid(), proxies=proxies)
        d['entries'] = self.__add_sub_articles(d['entries'])
        if len(d['entries']) == 0:
            raise Exception('unsupported topic')
        return self.__clean_news(d, nums, show)

    def search(self, query: str, nums: int, show: bool, helper=True, when=None, from_=None, to_=None, proxies=None):
        """
            :param str query: news title to query
            :param int nums: number of news to retrieve
            :param bool show: print searched results
            :param bool helper: URL-encode the query before sending it
            :param str when: results in an article published in last _, example: '30m', '1h', '7d'
            :param str from_: only articles after this date (ignored when `when` is given)
            :param str to_: only articles before this date (ignored when `when` is given)
            :param dict proxies: optional proxies mapping handed to ``requests``
            :return dict d: dictionary of curated results

        """
        if when:
            query += ' when:' + when
        else:
            if from_:
                query += ' after:' + self.__from_to_helper(validate=from_)
            if to_:
                query += ' before:' + self.__from_to_helper(validate=to_)

        if helper:
            query = self.__search_helper(query)

        # The URL already opens its query string with '?q=', so the locale part is appended with '&'.
        search_ceid = self.__ceid().replace('?', '&')

        d = self.__parse_feed(self.BASE_URL + '/search?q={}'.format(query) + search_ceid, proxies=proxies)
        d['entries'] = self.__add_sub_articles(d['entries'])
        return self.__clean_news(d, nums, show)

In [None]:
# Smoke test of the three public entry points; each call performs a live request and,
# because show=True, prints every retrieved entry.
scraper = Scraper(country='MY')
# Top headlines of the country edition.
news = scraper.get_news(nums=20, show=True)
# Headlines of one of the predefined sections.
news_topics = scraper.get_news_by_topics(topic='science', nums=20, show=True)
# Free-text search.
news_searched = scraper.search(query='5G', nums=20, show=True)

### Levenshtein Distance 

In [None]:
def levenshtein_dis(seq1, seq2):
    """Levenshtein (edit) distance between two sequences.

    Fills a (len(seq1) + 1) x (len(seq2) + 1) table in which cell [r, c] is the
    distance between the first r items of ``seq1`` and the first c items of
    ``seq2``; insertions, deletions and substitutions each cost 1.

    :param seq1: first sequence (e.g. a string)
    :param seq2: second sequence
    :return: the distance as a NumPy float (the table is a float array)
    """
    rows, cols = len(seq1) + 1, len(seq2) + 1
    table = np.zeros((rows, cols))
    # Distance to/from the empty prefix is just the prefix length.
    table[:, 0] = np.arange(rows)
    table[0, :] = np.arange(cols)

    for r in range(1, rows):
        for c in range(1, cols):
            substitution = 0 if seq1[r - 1] == seq2[c - 1] else 1
            table[r, c] = min(table[r - 1, c] + 1,                  # deletion
                              table[r - 1, c - 1] + substitution,   # match / substitution
                              table[r, c - 1] + 1)                  # insertion

    return table[rows - 1, cols - 1]

### Cosine Similarity and TFIDF

In [None]:
def cosimilarity_tfidf(text1, text2):
    """Cosine similarity between the TF-IDF vectors of two texts.

    The vectorizer is fitted on just these two texts, so the vocabulary and
    weights are specific to the pair.

    :param str text1: first document
    :param str text2: second document
    :return: the similarity score (first element of the 1x1 result)
    """
    tfidf = feature_extraction.text.TfidfVectorizer().fit_transform([text1, text2])
    score_matrix = metrics.pairwise.cosine_similarity(tfidf[0], tfidf[1])
    return score_matrix.flatten()[0]

### News Similarity from Single Country

In [None]:
%%time

def single_news_network(country: str, search_keywords: str, nums: int, when: str, formula: int):
    """Draw a graph linking the headlines found for one keyword search in one country.

    Two headlines are connected when the selected measure between them is
    greater than zero.

    :param str country: two-letter country code handed to Scraper
    :param str search_keywords: search query
    :param int nums: maximum number of headlines to retrieve
    :param str when: recency filter, example: '12h'
    :param int formula: 1 for TF-IDF cosine similarity, 2 for Levenshtein distance
    :raises ValueError: when ``formula`` is neither 1 nor 2
    """
    measures = {1: cosimilarity_tfidf, 2: levenshtein_dis}
    if formula not in measures:
        raise ValueError('formula must be 1 (TF-IDF cosine similarity) or 2 (Levenshtein distance)')
    measure = measures[formula]

    scraper = Scraper(country=country)
    searched_news = scraper.search(query=search_keywords, nums=nums, show=False, when=when)
    # Each headline is one node, so repeated headlines are collapsed (order kept).
    titles = list(dict.fromkeys(searched_news.get('titles')))

    # Both measures are symmetric: evaluating each unordered pair once is enough.
    edges = [(first, second)
             for position, first in enumerate(titles)
             for second in titles[position + 1:]
             if measure(first, second) > 0]

    G = nx.Graph()
    G.add_edges_from(edges)

    plt.figure(figsize=(100,50), dpi=50)
    plt.title('News Similarity under the same keywords: {}'.format(search_keywords), fontsize=75)
    nx.draw(G,
            with_labels=True,
            node_color='orange',
            node_size=400,
            edge_color='grey',
            style='dashed',
            linewidths=1,
            font_size=30)

# Example run: Malaysian headlines about 'blockchain' from the last 12 hours.
single_news_network(country='MY', 
                    search_keywords='blockchain', 
                    nums=20,
                    when='12h',
                    formula=2) # 1 for cosine_tfidf, 2 for lev_dis

### News Similarity from Multiple Countries

In [None]:
%%time

def multi_news_networks(country: list, search_keywords: str, nums: int, when: str):
    """Draw a country-to-headline graph for one keyword searched in several countries.

    Every country is linked to the headlines its search returned, so a headline
    reported in several countries becomes a shared node.

    :param list country: two-letter country codes
    :param str search_keywords: search query
    :param int nums: maximum number of headlines per country
    :param str when: recency filter, example: '12h'
    """
    # One (country, title) row per retrieved headline; building the rows directly
    # avoids the None padding a ragged table would introduce as bogus nodes.
    rows = []
    for code in country:
        searched_news = Scraper(country=code).search(query=search_keywords, nums=nums, show=False, when=when)
        rows.extend((code, title) for title in searched_news.get('titles'))
    titles_df = pd.DataFrame(rows, columns=['country', 'title'])

    plt.figure(figsize=(50, 50), dpi=150)
    plt.title('Multicountry News Similarity: The significant', fontsize=50)

    G = nx.from_pandas_edgelist(titles_df, source='country', target='title', create_using=nx.Graph())

    # Node type: the country code for country nodes, 't' for headline nodes.
    node_types = pd.Categorical([node if node in country else 't' for node in G.nodes()])

    # The module is imported as `mpl`; the bare name `matplotlib` is not bound in this notebook.
    cmap = mpl.colors.ListedColormap(['yellow', 'C0', 'green', 'red', 'darkorange', 'thistle'])

    node_sizes = [4000 if entry != 't' else 300 for entry in node_types]

    nx.draw(G,
            with_labels=True,
            node_size=node_sizes,
            node_color=node_types.codes,
            cmap=cmap,
            edge_color='grey',
            style='dashed',
            linewidths=1,
            font_size=20)
    
# Example run: 'blockchain' headlines of the last 12 hours in five country editions.
multi_news_networks(country=['MY', 'US', 'GB', 'SG', 'IN'], search_keywords='blockchain', nums=20, when='12h')

### Train Sentiment LSTM Model for News 

In [None]:
def read_sentiment_data():
    """Load the sentiment dataset from ``datasets/sentiment.csv`` under ``current_directory``.

    The file is read without a header row, using ISO-8859-1 encoding, and its
    columns are named 'sentiment' and 'text'.

    :return: the labelled DataFrame
    """
    # any sentiment dataset will do
    csv_path = current_directory + 'datasets/sentiment.csv'
    frame = pd.read_csv(csv_path, header=None, encoding='ISO-8859-1')
    frame.columns = ['sentiment', 'text']
    return frame

# Labelled dataset consumed by sentiment_lstm_model below.
sentiment_data = read_sentiment_data()

In [None]:
def clean_the_text(text: str):
    """Normalise a raw text for the sentiment model.

    Steps: lower-case, strip HTML tags, replace URLs by the placeholder token
    ``URL``, turn every remaining non-alphanumeric character into a space,
    tokenise with NLTK and drop English stop words.

    :param str text: raw text (any object is converted with ``str``)
    :return str: the remaining tokens joined by single spaces
    """
    text = str(text).lower()
    text = re.sub('<.*?>+', '', text)
    # URLs have to be replaced while they are still intact, i.e. before punctuation
    # is blanked out; otherwise only a bare 'https' fragment could ever match.
    text = re.sub(r'http\S+', r'<URL>', text)
    # Also strips the angle brackets of the placeholder, leaving the token 'URL'.
    text = re.sub(r"[^a-zA-Z0-9]", " ", text)

    stop_words = set(corpus.stopwords.words('english'))
    word_token = [w for w in tokenize.word_tokenize(text) if w not in stop_words]

    return ' '.join(word_token)

# Vocabulary cap handed to the Keras Tokenizer (num_words) in tokenize_pad_sequences.
MAX_WORDS = 5000
# Length every token sequence is padded to (maxlen); also the Embedding input_length in sentiment_lstm_model.
MAX_LEN = 50

def tokenize_pad_sequences(text: str):
    """Fit a Keras tokenizer on ``text`` and return the padded integer sequences.

    :param text: the texts to encode. Despite the ``str`` annotation, the caller
        in sentiment_lstm_model passes a whole column of texts.
    :return: tuple ``(X, tokenizer)`` with the post-padded sequences of length
        MAX_LEN and the fitted tokenizer (vocabulary capped at MAX_WORDS)
    """
    fitted_tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=MAX_WORDS, lower=True, split=' ')
    fitted_tokenizer.fit_on_texts(text)
    padded = tf.keras.preprocessing.sequence.pad_sequences(
        fitted_tokenizer.texts_to_sequences(text),
        padding='post',
        maxlen=MAX_LEN,
    )
    return padded, fitted_tokenizer

In [None]:
%%time

def _plot_training_history(history):
    """Plot training/validation accuracy and loss per epoch from a Keras History object."""
    fig, ax = plt.subplots(1, 2, figsize=(10,5), dpi=100)
    ax[0].plot(history.history['acc'], marker='o', label='Accuracy')
    ax[0].plot(history.history['val_acc'], marker='v', label='Validation Accuracy')
    ax[0].set_title('Accuracies')
    ax[0].set_xlabel('Epochs')
    ax[0].set_ylabel('Accuracy')
    ax[0].legend()

    ax[1].plot(history.history['loss'], marker='o', label='Loss')
    ax[1].plot(history.history['val_loss'], marker='v', label='Validation Loss')
    ax[1].set_title('Losses')
    ax[1].set_xlabel('Epochs')
    ax[1].set_ylabel('Loss')
    ax[1].legend()

    plt.show()


def sentiment_lstm_model(data, plot=True):
    """Train the Conv1D + bidirectional LSTM sentiment classifier.

    :param data: DataFrame with a 'sentiment' (label) and a 'text' column; it is
        copied, not modified
    :param bool plot: show the accuracy/loss curves after training
    :return: tuple ``(model, tokenizer)`` — the trained Keras model and the
        tokenizer fitted on the cleaned texts
    """
    sentiment_data = data.copy()

    sentiment_data['text'] = sentiment_data['text'].apply(clean_the_text)
    X, tokenizer = tokenize_pad_sequences(text=sentiment_data['text'])
    y = pd.get_dummies(sentiment_data['sentiment'])
    # NOTE(review): the 10% hold-out is used both for early stopping and as the only evaluation set.
    trainX, testX, trainy, testy = model_selection.train_test_split(X, y, test_size=.1, shuffle=True, random_state=0)

    EMBEDDING_SIZE = 32
    EPOCHS = 50
    BATCH_SIZE = 64
    CALLBACK = tf.keras.callbacks.EarlyStopping(monitor='val_acc',
                                                patience=15,
                                                min_delta=0.01,
                                                restore_best_weights=True)
    # One output unit per label present in the data instead of a hard-coded 3.
    n_classes = y.shape[1]

    model = tf.keras.models.Sequential()
    # The embedding table is sized from the tokenizer's vocabulary cap so the two cannot drift apart.
    model.add(tf.keras.layers.Embedding(MAX_WORDS, EMBEDDING_SIZE, input_length=MAX_LEN))
    model.add(tf.keras.layers.Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
    model.add(tf.keras.layers.MaxPooling1D(pool_size=2))
    model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)))
    model.add(tf.keras.layers.Dropout(.4))
    model.add(tf.keras.layers.Dense(n_classes, activation='softmax'))

    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
    history = model.fit(trainX, trainy,
                        validation_data=(testX, testy),
                        batch_size=BATCH_SIZE,
                        epochs=EPOCHS,
                        verbose=1,
                        callbacks=[CALLBACK])

    if plot:
        _plot_training_history(history)

    return model, tokenizer

# Train once; the model and the fitted tokenizer are reused by the prediction cells below.
slstm, tokenizer = sentiment_lstm_model(data=sentiment_data)

### Predict News Sentiment

In [None]:
def sentiment_predict(text, model, tokenizer):
    """Print the predicted sentiment of the first text in ``text``.

    :param text: list of texts (a single string is wrapped into a list)
    :param model: trained model exposing ``predict``
    :param tokenizer: tokenizer fitted during training
    """
    # NOTE(review): training texts go through clean_the_text, prediction inputs do not — confirm this is intended.
    # Index i is read as class i of the model output; assumes three classes in this order — TODO confirm.
    sentiment_state = ['Negative', 'Neutral', 'Positive']
    if isinstance(text, str):
        # A bare string would otherwise be tokenised character by character.
        text = [text]
    textX = tokenizer.texts_to_sequences(text)
    # Same padding length as in tokenize_pad_sequences (module-level MAX_LEN).
    textX = tf.keras.preprocessing.sequence.pad_sequences(textX, padding='post', maxlen=MAX_LEN)
    # Use the model handed in by the caller rather than the global `slstm`.
    texty = model.predict(textX).argmax(axis=1)
    print('Sentiment State: ', sentiment_state[texty[0]])
    
# Quick check of the trained model on a single hand-written sentence.
sentiment_predict(text=['this is a testing sentences'], model=slstm, tokenizer=tokenizer)

In [None]:
def check_news_sentiment(topic: str, nums: int, model, tokenizer, country: str = 'MY'):
    """Print the predicted sentiment of the current headlines of a topic.

    :param str topic: topic handed to Scraper.get_news_by_topics
    :param int nums: maximum number of headlines to score
    :param model: trained sentiment model
    :param tokenizer: tokenizer fitted during training
    :param str country: two-letter country code of the news edition (default 'MY')
    """
    scraper = Scraper(country=country)
    news_topics = scraper.get_news_by_topics(topic=topic, nums=nums, show=False)

    for title in news_topics.get('titles'):
        sentiment_predict(text=[title], model=model, tokenizer=tokenizer)

In [None]:
# Score the ten latest business headlines with the trained model.
check_news_sentiment(topic='business', nums=10, model=slstm, tokenizer=tokenizer)

### News Keyword Extraction

In [None]:
def get_news_keywords(country: str, query: str, nums: int):
    """Reduce each searched headline to its proper nouns, adjectives and nouns.

    Headlines are lower-cased, tagged with the spaCy 'en_core_web_sm' pipeline,
    and tokens that are stop words or punctuation are discarded.

    :param str country: two-letter country code handed to Scraper
    :param str query: search query
    :param int nums: maximum number of headlines to retrieve
    :return list: one space-joined keyword string per headline
    """
    found = Scraper(country=country).search(query=query, nums=nums, show=False)

    spacy_nlp = spacy.load('en_core_web_sm')
    wanted_pos = ['PROPN', 'ADJ', 'NOUN']

    def _keywords(title):
        kept = []
        for token in spacy_nlp(title.lower()):
            ignorable = token.text in spacy_nlp.Defaults.stop_words or token.text in punctuation
            if not ignorable and token.pos_ in wanted_pos:
                kept.append(token.text)
        return ' '.join(kept)

    return [_keywords(title) for title in found.get('titles')]

In [None]:
# Example: keywords of up to 20 Malaysian headlines about 'blockchain'.
get_news_keywords(country='MY', query='blockchain', nums=20)

### Clustering News from Various Countries

In [None]:
# Country editions whose 'world' headlines are scraped by preprocessing_news and
# later used as the columns melted in news_kmeans_clustering.
news_from_country = ['MY', 'US', 'GB', 'IN', 'CN', 'DE', 'FR', 'TW', 'HK', 'AU', 'KR', 'JP', 
                    'CA', 'SG', 'ID', 'NZ', 'IE', 'IL', 'PK', 'ZA', 'CH', 'IT']

In [None]:
def clean_detect_translate(text: str, source_language: str=None, trg_language: str=None):
    """Strip punctuation from ``text`` and translate it to ``trg_language`` if needed.

    :param str text: raw text
    :param str source_language: language code of ``text``; detected with
        langdetect when omitted
    :param str trg_language: language code to translate into
    :return: the cleaned text when it is empty or already in the target
        language, its translation otherwise, or ``None`` when the detected
        language is not in the supported mapping
    """
    text = re.sub(r'[^\w\s]', '', text).strip()

    if not text:
        # Nothing left to detect or translate.
        return text

    if source_language is None:
        # Map langdetect codes to the spelling expected by the translator.
        lang_dict = {'en': 'en', 'zh-cn': 'zh-CN', 'zh-tw': 'zh-TW', 'de': 'de', 'fr': 'fr', 'ko': 'ko', 'ja': 'ja', 'id': 'id'}
        source_language = lang_dict.get(detect(text))

        if source_language is None:
            return None

    if source_language == trg_language:
        return text

    # A single translation request per text.
    translator = GoogleTranslator(source=source_language, target=trg_language)
    return translator.translate(text)

In [None]:
def detect_cjk(text: str):
    """Return True when ``text`` contains no Chinese, Korean or Japanese characters.

    Note the polarity: despite the name, a detected CJK character yields False,
    which is what the caller relies on to keep only non-CJK headlines.

    :param text: text to inspect (any object is converted with ``str``)
    :return bool: False if a character from U+4E00-U+9FFF, U+AC00-U+D7A3 or
        U+3040-U+30FF is present, True otherwise
    """
    # The text must be searched as-is: blanking out non-ASCII characters first
    # would remove exactly the characters this check is looking for.
    text = str(text)
    return re.search("[\u4e00-\u9FFF\uac00-\ud7a3\u3040-\u30ff]", text) is None

In [None]:
%%time

def preprocessing_news(country_list: list):
    """Scrape, translate and collect the 'world' headlines of several countries.

    For every country up to 50 headlines are fetched, cleaned and translated to
    English; headlines that yield no text or still contain CJK characters are
    dropped.

    :param list country_list: two-letter country codes
    :return: DataFrame with one column per country holding its English
        headlines (shorter columns are padded with missing values)
    """
    # Feed language requested for each country edition.
    country_lang_dict = {'MY': 'en',
                         'US': 'en',
                         'GB': 'en',
                         'IN': 'en',
                         'CN': 'CN',
                         'DE': 'de',
                         'FR': 'fr',
                         'TW': 'TW',
                         'HK': 'TW',
                         'AU': 'en',
                         'KR': 'ko',
                         'JP': 'ja',
                         'CA': 'en',
                         'SG': 'en',
                         'ID': 'en',
                         'NZ': 'en',
                         'IE': 'en',
                         'IL': 'en',
                         'PK': 'en',
                         'ZA': 'en',
                         'CH': 'de',
                         'IT': 'it',}
    # Translator source-language codes where they differ from the feed language above.
    translate_lang_overrides = {'HK': 'zh-TW', 'TW': 'zh-TW', 'CN': 'zh-CN'}

    news_dict = dict()

    for c in log_progress(country_list, every=1):
        # Countries missing from the mapping use the Scraper default language, and
        # their headlines go through language auto-detection (source_language=None).
        language_use = country_lang_dict.get(c, 'en')
        scraper = Scraper(language=language_use, country=c)
        news = scraper.get_news_by_topics(topic='world', nums=50, show=False)

        lang = translate_lang_overrides.get(c, country_lang_dict.get(c))

        extracted_news = list()
        for title in news.get('titles'):
            cleaned_text = clean_detect_translate(title, source_language=lang, trg_language='en')
            # Skip headlines without usable text (None / empty) as well as untranslated CJK text.
            if cleaned_text and detect_cjk(text=cleaned_text):
                extracted_news.append(cleaned_text)

        if len(extracted_news) > 0:
            print('Successfully scrape {} news'.format(c))
        else:
            print('Failed to scrape {} news'.format(c))
        news_dict[c] = extracted_news
        print(c)
        print(extracted_news)
        print('\n')

    # Assemble all columns in one concat instead of growing the frame inside a loop.
    frames = [pd.DataFrame({c: news_dict.get(c)}) for c in country_list]
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, axis=1)
        
# Scrapes every country in news_from_country (live requests and translation calls).
news_data = preprocessing_news(country_list=news_from_country)

In [None]:
def news_kmeans_clustering(news=None, countries=None, n_clusters=5):
    """Cluster the headlines with KMeans on TF-IDF features and draw a cluster-country graph.

    Each headline is assigned to a cluster; a country is then linked to every
    cluster that contains at least one of its headlines.

    :param news: DataFrame with one column of headlines per country; defaults to
        the module-level ``news_data``
    :param list countries: columns of ``news`` to use; defaults to the
        module-level ``news_from_country``
    :param int n_clusters: number of KMeans clusters (default 5)
    :raises ValueError: when there is no headline to cluster
    """
    if news is None:
        news = news_data
    if countries is None:
        countries = news_from_country

    melted_news_data = (pd.melt(news, value_vars=countries)
                          .rename(columns={'variable': 'Country', 'value': 'News'})
                          .dropna())
    if melted_news_data.empty:
        raise ValueError('no news to cluster')

    vectorizer = feature_extraction.text.TfidfVectorizer(stop_words='english')
    textX = vectorizer.fit_transform(melted_news_data['News'].values)

    # NOTE(review): the default of 5 clusters was presumably chosen with an elbow plot
    # (KMeans inertia for k = 1..10) — TODO confirm.
    km = cluster.KMeans(n_clusters=n_clusters, random_state=0).fit(textX)
    melted_news_data = melted_news_data.assign(Cluster=km.labels_)

    G = nx.from_pandas_edgelist(melted_news_data, source='Cluster', target='Country', create_using=nx.Graph())

    # Colour the nodes explicitly so the drawing matches the legend below:
    # countries in red, cluster ids in blue.
    country_nodes = set(melted_news_data['Country'])
    node_colors = ['red' if node in country_nodes else 'blue' for node in G.nodes()]

    fig, ax = plt.subplots(figsize=(30,30), dpi=100)
    ax.set_title('Countries-News KMeans Clustering', fontsize=30)
    nx.draw(G,
            ax=ax,
            with_labels=True,
            node_size=4000,
            node_color=node_colors,
            edge_color='grey',
            style='dashed',
            linewidths=1,
            font_size=40,
            font_color='white')

    legend_elements = [mpl.lines.Line2D([0], [0], marker='o', color='w', label='Country', markerfacecolor='red', markersize=30),
                       mpl.lines.Line2D([0], [0], marker='o', color='w', label='Cluster', markerfacecolor='blue', markersize=30)]

    ax.legend(handles=legend_elements, loc='best', fontsize='xx-large', labelspacing=1.5, borderpad=1.5)

# Cluster the headlines collected in news_data and draw the resulting graph.
news_kmeans_clustering()