In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re, string, unicodedata

import nltk
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer

import contractions
import inflect
import time
import datetime

In [2]:
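# One-time setup: uncomment and run once to download the NLTK data used below.
# nltk.download('punkt')      # tokenizer models used by nltk.word_tokenize
# nltk.download('wordnet')    # WordNet data used by WordNetLemmatizer
# nltk.download('stopwords')  # stop-word lists used by nltk.corpus.stopwords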

In [3]:
# Thanks to KDnuggets for the text-preprocessing helpers below.

def strip_html(text):
    """Parse `text` with BeautifulSoup's "html.parser" and return its text content."""
    return BeautifulSoup(text, "html.parser").get_text()

def remove_between_square_brackets(text):
    """Remove every ``[...]`` span (brackets included) from `text`.

    A span runs from a ``[`` to the first following ``]``, so spans do not
    nest: ``"[[x]]"`` becomes ``"]"``.

    Args:
        text: The string to clean.

    Returns:
        `text` with each bracketed span replaced by the empty string.
    """
    # Raw string: in a plain literal `\[` and `\]` are invalid escape sequences,
    # which Python warns about at compile time.
    return re.sub(r'\[[^]]*\]', '', text)

def denoise_text(text):
    """Strip HTML from `text`, then drop any square-bracketed spans, and return the result."""
    return remove_between_square_brackets(strip_html(text))

def replace_contractions(text):
    """Return `text` after passing it through ``contractions.fix``."""
    expanded = contractions.fix(text)
    return expanded

def remove_non_ascii(words):
    """Return a new list with the non-ASCII characters stripped from each token.

    Each token is NFKD-normalized first, so an accented letter is split into
    its base letter plus combining marks and only the marks are dropped.
    Tokens that end up empty are kept as empty strings.
    """
    def _ascii_only(token):
        decomposed = unicodedata.normalize('NFKD', token)
        return decomposed.encode('ascii', 'ignore').decode('utf-8', 'ignore')

    return [_ascii_only(token) for token in words]

def to_lowercase(words):
    """Return a new list holding the lowercase form of every token in `words`."""
    return [token.lower() for token in words]

def remove_punctuation(words):
    """Strip punctuation from each token and drop tokens that become empty.

    Every character that is neither a word character nor whitespace is removed.
    """
    stripped = (re.sub(r'[^\w\s]', '', token) for token in words)
    return [token for token in stripped if token != '']

def replace_numbers(words):
    """Return a new list in which all-digit tokens are replaced by ``inflect``'s number_to_words output.

    Tokens for which ``str.isdigit()`` is false are passed through unchanged.
    """
    engine = inflect.engine()
    return [
        engine.number_to_words(token) if token.isdigit() else token
        for token in words
    ]

def remove_stopwords(words):
    """Remove English stop words from a list of tokenized words.

    Args:
        words: Iterable of string tokens. Matching is exact, so tokens should
            already be in the same case as the stop-word list.

    Returns:
        A new list with the tokens not in NLTK's English stop-word list,
        in their original order.
    """
    # Load the stop-word list once instead of once per token, and keep it in a
    # set so each membership test is a hash lookup rather than a list scan.
    english_stopwords = set(stopwords.words('english'))
    return [word for word in words if word not in english_stopwords]

def stem_words(words):
    """Return a list with the Lancaster stem of every token in `words`."""
    lancaster = LancasterStemmer()
    return [lancaster.stem(token) for token in words]

def lemmatize_verbs(words):
    """Return a list with each token in `words` lemmatized as a verb (``pos='v'``) via WordNet."""
    wordnet = WordNetLemmatizer()
    return [wordnet.lemmatize(token, pos='v') for token in words]

def normalize(words):
    """Run the token list through the cleaning steps, in order, and return the result.

    Steps: strip non-ASCII characters, lowercase, remove punctuation,
    spell out all-digit tokens, remove stop words.
    """
    pipeline = (
        remove_non_ascii,
        to_lowercase,
        remove_punctuation,
        replace_numbers,
        remove_stopwords,
    )
    for step in pipeline:
        words = step(words)
    return words

def stem_and_lemmatize(words):
    """Return a ``(stems, lemmas)`` pair, both computed from the same input token list."""
    return stem_words(words), lemmatize_verbs(words)

# Links containing any of these substrings are skipped without being fetched.
_SKIPPED_LINK_MARKERS = (
    'facebook.com', 'nymag.com',
    'commondreams.org', 'rollingstone',
    'wired.com', 'archive',
    'miamiherald.com', 'usnews.com',
    'kansascity.com', 'kansas.com',
    'dni.gov', 'fredmeyer.com',
    'sacbee.com', '.jpg',
    '.gif', '.png',
    'imgur.com', 'seattletimes.com',
    'thenewstribune.com', 'bluzz.org',
)


def article_info(link, timeout=30):
    """Fetch a page and return the verb-lemmatized tokens of its text.

    The page is downloaded, stripped of HTML and square-bracketed spans, has
    its contractions replaced, and is tokenized. Tokens are lowercased and
    kept only when their length is between 3 and 11 characters inclusive.

    Args:
        link: URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so one
            unresponsive host cannot stall the whole run.

    Returns:
        A list of lemma strings, or ``''`` when the link matches a skipped
        marker or when fetching/processing fails for any reason.
    """
    if any(marker in link for marker in _SKIPPED_LINK_MARKERS):
        return ''
    try:
        page = requests.get(link, timeout=timeout)
        sample = replace_contractions(denoise_text(page.text))
        words = [x.lower() for x in nltk.word_tokenize(sample) if 2 < len(x) < 12]
        # Only the lemmas are returned, so the stemming pass is skipped.
        return lemmatize_verbs(words)
    except Exception:
        # Best-effort scrape: any failure yields an empty result. Catching
        # Exception (not a bare except) lets Ctrl-C still interrupt the run.
        return ''

In [4]:
# For each topic: load <topic>.csv, tidy the metadata columns, scrape every
# linked article, and write the result to <topic>_cleaned.csv.
for topic in ('politics', 'science', 'sports', 'weather', 'worldnews'):
    # Rows with any missing value are dropped before further processing.
    df = pd.read_csv(topic + ".csv").dropna()

    df['title'] = df['title'].map(lambda title: title.lower())
    # Truncate timestamps to day resolution.
    df['timestamp'] = pd.to_datetime(df['timestamp']).dt.floor('d')

    # One HTTP request per row.
    df['cleaned_article'] = df['url'].map(article_info)

    df.to_csv(topic + "_cleaned" + ".csv", index=False)