In [None]:
!pip install dask[complete]

## Importing the necessary libraries

In [None]:
import dask.dataframe as dd
from dask import bag
from dask import array
from dask.diagnostics import ProgressBar

In [None]:
import numpy 
from nltk.tokenize import word_tokenize
import re
import nltk
import string
from string import punctuation
from nltk.corpus import stopwords
from spacy.lang.en.stop_words import STOP_WORDS
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn

In [None]:
import warnings
# Install a global "ignore" filter so that no warnings are shown in later cells.
# NOTE(review): this also hides deprecation and library warnings that could
# point at real problems in the cells below — consider narrowing the filter.
warnings.filterwarnings('ignore')


In [None]:
# Load the news CSV as a Dask DataFrame. The index-like and date-part columns
# are read as strings ('object') so mixed or missing values do not break dtype
# inference across partitions.
# `on_bad_lines='skip'` replaces `error_bad_lines=False`, which pandas
# deprecated in 1.3 and removed in 2.0; malformed rows are still skipped.
df = dd.read_csv(
    '/content/drive/MyDrive/all_news.csv',
    on_bad_lines='skip',
    engine='python',
    encoding='utf-8',
    dtype={'Unnamed: 0':'object','Unnamed: 0.1':'object','day':'object','year':'object','month':'object'},
)

In [None]:
df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,date,year,month,day,author,title,article,url,section,publication
0,0,0,2016-12-09 18:31:00,2016,12.0,9,Lee Drutman,We should take concerns about the health of li...,"This post is part of Polyarchy, an independent...",https://www.vox.com/polyarchy/2016/12/9/138983...,,Vox
1,1,1,2016-10-07 21:26:46,2016,10.0,7,Scott Davis,Colts GM Ryan Grigson says Andrew Luck's contr...,The Indianapolis Colts made Andrew Luck the h...,https://www.businessinsider.com/colts-gm-ryan-...,,Business Insider
2,2,2,2018-01-26 00:00:00,2018,1.0,26,,Trump denies report he ordered Mueller fired,"DAVOS, Switzerland (Reuters) - U.S. President ...",https://www.reuters.com/article/us-davos-meeti...,Davos,Reuters
3,3,3,2019-06-27 00:00:00,2019,6.0,27,,France's Sarkozy reveals his 'Passions' but in...,PARIS (Reuters) - Former French president Nico...,https://www.reuters.com/article/france-politic...,World News,Reuters
4,4,4,2016-01-27 00:00:00,2016,1.0,27,,Paris Hilton: Woman In Black For Uncle Monty's...,Paris Hilton arrived at LAX Wednesday dressed ...,https://www.tmz.com/2016/01/27/paris-hilton-mo...,,TMZ


In [None]:
# Mount Google Drive at /content/drive so files under MyDrive are reachable.
# NOTE(review): the read_csv cell earlier in this notebook reads from
# /content/drive/MyDrive/, so on a fresh kernel this cell has to run first.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### dropping unnecessary columns

In [None]:
# Discard the index-like, date, and source-metadata columns; every column not
# listed here is kept.
unused_columns = [
    'Unnamed: 0', 'Unnamed: 0.1',
    'date', 'year', 'month', 'day',
    'url', 'author', 'section', 'publication',
]
df_dropped = df.drop(columns=unused_columns)

### Fill NaN cells with the value 0

In [None]:
# Replace every missing cell with the integer 0 so that later string
# operations never encounter NaN.
data = df_dropped.fillna(0, axis=1)

In [None]:
data.head()

Unnamed: 0,title,article
0,We should take concerns about the health of li...,"This post is part of Polyarchy, an independent..."
1,Colts GM Ryan Grigson says Andrew Luck's contr...,The Indianapolis Colts made Andrew Luck the h...
2,Trump denies report he ordered Mueller fired,"DAVOS, Switzerland (Reuters) - U.S. President ..."
3,France's Sarkozy reveals his 'Passions' but in...,PARIS (Reuters) - Former French president Nico...
4,Paris Hilton: Woman In Black For Uncle Monty's...,Paris Hilton arrived at LAX Wednesday dressed ...


### combining the title and article column text

In [None]:
# Concatenate all remaining columns into one space-separated `combined` text
# column and keep only that column.
# Done with vectorised string concatenation instead of a row-wise Python
# `apply`, so Dask needs no `meta` inference. The result is built as a new
# frame, which keeps the cell safe to re-run: the original version dropped
# the freshly created `combined` column on a second run, because `cols`
# then contained it.
cols = data.columns
combined = data[cols[0]].astype(str)
for col in cols[1:]:
    combined = combined + ' ' + data[col].astype(str)
data = combined.to_frame(name='combined')

In [None]:
data.head()

Unnamed: 0,combined
0,We should take concerns about the health of li...
1,Colts GM Ryan Grigson says Andrew Luck's contr...
2,Trump denies report he ordered Mueller fired D...
3,France's Sarkozy reveals his 'Passions' but in...
4,Paris Hilton: Woman In Black For Uncle Monty's...


In [None]:
print(data.head().loc[0, 'combined'][:1000])

We should take concerns about the health of liberal democracy seriously This post is part of Polyarchy, an independent blog produced by the political reform program at New America, a Washington think tank devoted to developing new ideas and new voices. Imagine you are an otherwise healthy 30-something who starts feeling weird. You are sometimes short of breath. You get migraines. Your feet start to swell a little. But otherwise, everything seems fine. You go to the doctor. The doctor runs some tests. She tells you, It's probably nothing, but these could be signs of a coming heart attack. You push for more certainty, but the doctor tells you she's not sure. The human body is a complex system. You're young and otherwise pretty healthy. There could be plenty of other explanations for what you're feeling. But it is a little worrying. So just to be on the safe side, maybe you should reduce the stress in your life and eat a healthier diet. What would you do? If you're a sensible person, you'

In [None]:
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# NLTK's English stop-word list, used by text_preproc.
stop_words = stopwords.words("english")
# ASCII punctuation plus newline and the typographic dash/quote characters.
# Built from `string.punctuation` rather than from the `punctuation` name
# itself, so re-running this cell does not keep appending the extra
# characters to an already extended string.
punctuation = string.punctuation + '\n' + '—' + '“' + ',' + '”' + '‘' + '-' + '’'


### text preprocessing

In [None]:
def text_preproc(x):
    """Normalise one raw document into lower-case, space-separated word tokens.

    Steps, in order:
      1. lower-case the text;
      2. map curly apostrophes to ``'`` so possessives and contractions written
         with typographic quotes are handled like ASCII ones, then drop every
         remaining non-ASCII character;
      3. remove URLs, ``@mentions`` and ``#hashtags``;
      4. remove an apostrophe together with the letters that follow it
         (``trump's`` -> ``trump``);
      5. replace every run of non-letters (digits, punctuation, whitespace)
         with a single space;
      6. drop tokens of three characters or fewer and tokens found in the
         module-level ``stop_words`` list.

    Stop words are filtered *after* punctuation has been stripped, so words
    such as "that," or "there." are removed as well, and the result never
    contains leading, trailing or repeated spaces.

    Parameters
    ----------
    x : str or None
        Raw text. ``None`` is treated as empty text; other non-string values
        are converted with ``str``.

    Returns
    -------
    str
        The cleaned text, tokens joined by single spaces.
    """
    if x is None:
        return ''
    x = str(x).lower()
    # Normalise typographic apostrophes before the ASCII filter would drop them.
    x = x.replace('’', "'").replace('‘', "'")
    x = x.encode('ascii', 'ignore').decode()
    # Only strip real URLs; a bare "http" prefix inside a word is left alone.
    x = re.sub(r'https?://\S+', ' ', x)
    x = re.sub(r'@\S+', ' ', x)  # mentions
    x = re.sub(r'#\S+', ' ', x)  # hashtags
    x = re.sub(r"'\w+", '', x)   # apostrophe plus the following letters
    x = re.sub(r'[^a-z]+', ' ', x)
    # The length test comes first so short tokens never reach the list lookup.
    kept = [word for word in x.split() if len(word) > 3 and word not in stop_words]
    return ' '.join(kept)

### apply text_preproc func on the data

In [None]:
# Clean every document. `meta` is given as a (name, dtype) pair so Dask knows
# the result is an object Series named 'combined' without guessing.
data['combined'] = data['combined'].apply(text_preproc, meta=('combined', 'object'))


In [None]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
data['combined'].head()

0    take concerns health liberal democracy serious...
1    colts  ryan grigson says andrew luck contract ...
2    trump denies report ordered mueller fired davo...
3    france sarkozy reveals insists come back cards...
4    paris hilton woman black uncle monty funeral p...
Name: combined, dtype: object

In [None]:

!pip install contractions

In [None]:
from contractions import contractions_dict

In [None]:
def contracted_word_expansion(token):
    """Return the entry of ``contractions_dict`` for `token`, or `token` itself
    when it is not a key of that mapping."""
    return contractions_dict.get(token, token)


In [None]:
def contractions_expansion(words):
    """Expand every contraction in a sequence of tokens.

    Parameters
    ----------
    words : iterable of str
        Tokens of a single document.

    Returns
    -------
    list of str
        One entry per input token, each passed through
        ``contracted_word_expansion``.
    """
    # Operate on the argument; the previous version ignored it and mapped over
    # the global `data['combined']` column instead.
    return [contracted_word_expansion(word) for word in words]


In [None]:
# Expand contractions token by token and keep the column as plain text.
# The previous call passed `str` as the function and the lambda only as a
# second positional argument, so the expansion never ran (and the lambda
# referenced an undefined name `words`).
data['combined'] = data['combined'].apply(
    lambda text: ' '.join(contracted_word_expansion(token) for token in text.split()),
    meta=('combined', 'object'),
)

In [None]:
data['combined'].head()

0    take concerns health liberal democracy serious...
1    colts  ryan grigson says andrew luck contract ...
2    trump denies report ordered mueller fired davo...
3    france sarkozy reveals insists come back cards...
4    paris hilton woman black uncle monty funeral p...
Name: combined, dtype: object

In [None]:
# NOTE(review): 'punkt' is already downloaded by an earlier cell; the log below
# reports the package as up to date, so this call looks redundant.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### stopwords removal

In [None]:
# De-duplicated merge of the NLTK and spaCy English stop-word lists.
en_stop_words = list(set(stopwords.words('english')) | set(STOP_WORDS))


In [None]:
def is_stopword(token):
    """Filter predicate: True when `token` should be KEPT.

    Despite the name, the result is False for stop words. It is also False
    when the token contains a single-character word, a non-ASCII character,
    an underscore or any non-word character.
    """
    if token in en_stop_words:
        return False
    noise = re.search(r'\b\w\b|[^\u0000-\u007f]+|_+|\W+', token)
    return noise is None


In [None]:
def stopwords_removal(x):
    """Drop stop words and noisy tokens from a sequence of tokens.

    Parameters
    ----------
    x : iterable of str
        Tokens of a single document.

    Returns
    -------
    list of str
        The tokens for which ``is_stopword`` returns True (i.e. the tokens
        to keep), in their original order.
    """
    # Filter the argument; the previous version ignored it and iterated over
    # the global `data['combined']` column instead.
    return [token for token in x if is_stopword(token)]
  

In [None]:
# Remove stop words token by token and keep the column as plain text.
# The previous call passed `str` as the function and the lambda only as a
# second positional argument, so no token was ever filtered.
# `is_stopword` returns True for the tokens that should be kept.
data['combined'] = data['combined'].apply(
    lambda text: ' '.join(token for token in text.split() if is_stopword(token)),
    meta=('combined', 'object'),
)


### POS tags (part-of-speech tags, i.e. verb, adjective, noun, adverb)


In [None]:
def get_wnet_pos_tag(treebank_tag):
    """Convert a ``(token, treebank_tag)`` pair into ``(token, wordnet_pos)``.

    Parameters
    ----------
    treebank_tag : tuple
        ``(token, tag)`` where ``tag`` is a Penn Treebank tag string such as
        ``'JJ'``, ``'VBD'``, ``'NNS'`` or ``'RB'``.

    Returns
    -------
    tuple
        ``(token, pos)`` with ``pos`` one of ``'a'`` (adjective), ``'v'``
        (verb), ``'n'`` (noun) or ``'r'`` (adverb). Any other tag falls back
        to noun.
    """
    # First letter of the Treebank tag -> WordNet POS code. The codes are the
    # values of NLTK's wordnet.ADJ / VERB / NOUN / ADV constants, spelled out
    # because the name `wordnet` is not bound in this notebook (the corpus is
    # imported as `wn`), which made the previous version raise NameError.
    prefix_to_wordnet = {'J': 'a', 'V': 'v', 'N': 'n', 'R': 'r'}
    token, tag = treebank_tag[0], treebank_tag[1]
    # The fallback is returned explicitly; the previous else-branch built the
    # tuple but never returned it, so the function yielded None.
    return (token, prefix_to_wordnet.get(tag[:1], 'n'))

In [None]:

def get_pos_tag(list_of_tokens):
    """Tag the tokens with ``pos_tag`` and lazily map each resulting pair
    through ``get_wnet_pos_tag``.

    Returns a ``map`` iterator, not a list.
    """
    tagged_pairs = pos_tag(list_of_tokens)
    return map(get_wnet_pos_tag, tagged_pairs)

In [None]:
# NOTE(review): `apply` receives `str` as the function and the lambda only as a
# second positional argument, so get_pos_tag does not appear to be called here;
# the preview printed further down is identical to the one before this stage.
# Presumably the intent was apply(lambda x: get_pos_tag(x), ...) on token
# lists — confirm, and adjust the following cells together with this one.
data['combined'] = data['combined'].apply(str,lambda x: get_pos_tag(x),meta='combined')

In [None]:
# NOTE(review): same call shape as the previous cell — `str` is the applied
# function and the lambda is only an extra positional argument, so the
# list(...) conversion presumably never runs. Confirm the intent.
data['combined'] = data['combined'].apply(str,lambda x: list(x))

In [None]:
lemmatizer = WordNetLemmatizer()

In [None]:
def token_lemmatization(token_pos_tuple):
    """Lemmatise one ``(token, pos)`` pair with the module-level ``lemmatizer``.

    A ``None`` pair produces the empty string.
    """
    if token_pos_tuple is not None:
        word, pos = token_pos_tuple[0], token_pos_tuple[1]
        return lemmatizer.lemmatize(word=word, pos=pos)
    return ""

In [None]:
def lemmatization(list_of_tokens):
    """Lazily lemmatise a sequence of ``(token, pos)`` pairs.

    Parameters
    ----------
    list_of_tokens : iterable
        ``(token, pos)`` pairs (or ``None`` entries) as accepted by
        ``token_lemmatization``. Any iterable works, including lazy ones.

    Returns
    -------
    iterator of str
        One lemma per input pair. Empty input yields an empty iterator; the
        previous version returned ``None`` in that case and called ``len()``,
        which fails on lazy iterators.
    """
    return (token_lemmatization(pair) for pair in list_of_tokens)

In [None]:
# NOTE(review): `str` is the function handed to `apply`; the lambda is only a
# second positional argument, so lemmatization does not appear to be called.
# The preview printed below is identical to the pre-lemmatization one.
# Confirm the intent and fix together with the POS-tagging cells.
data['combined'] = data['combined'].apply(str,lambda x: lemmatization(x))

In [None]:
# NOTE(review): same call shape as the previous cell — only `str` is applied,
# so the list(...) conversion presumably never runs. Confirm the intent.
data['combined'] = data['combined'].apply(str,lambda x: list(x))

In [None]:
data['combined'].head()

0    take concerns health liberal democracy serious...
1    colts  ryan grigson says andrew luck contract ...
2    trump denies report ordered mueller fired davo...
3    france sarkozy reveals insists come back cards...
4    paris hilton woman black uncle monty funeral p...
Name: combined, dtype: object

In [None]:
from collections import Counter

# Count how often each token occurs across the whole `combined` column.
# Counter is a dict subclass, so `wordfreq[token]` and `wordfreq.get` keep
# working for the cells that follow.
wordfreq = Counter()
for sentence in data.combined:
    wordfreq.update(nltk.word_tokenize(sentence))


In [None]:
import heapq

# The 200 tokens with the highest counts, most frequent first.
most_freq = heapq.nlargest(200, wordfreq.keys(), key=lambda token: wordfreq[token])


In [None]:
most_freq

['said',
 'trump',
 'would',
 'people',
 'year',
 'also',
 'like',
 'time',
 'first',
 'president',
 'could',
 'last',
 'percent',
 'years',
 'company',
 'million',
 'even',
 'told',
 'says',
 'back',
 'state',
 'house',
 'make',
 'many',
 'reuters',
 'news',
 'since',
 'according',
 'government',
 'still',
 'world',
 'made',
 'states',
 'going',
 'think',
 'much',
 'week',
 'well',
 'that',
 'work',
 'three',
 'take',
 'know',
 'around',
 'billion',
 'want',
 'long',
 'right',
 'white',
 'including',
 'china',
 'next',
 'united',
 'former',
 'market',
 'group',
 'business',
 'york',
 'public',
 'american',
 'good',
 'part',
 'high',
 'report',
 'another',
 'really',
 'month',
 'deal',
 'country',
 'reporting',
 'washington',
 'home',
 'need',
 'life',
 'national',
 'show',
 'women',
 'health',
 'campaign',
 'data',
 'companies',
 'called',
 'there',
 'hill',
 'best',
 'help',
 'times',
 'tuesday',
 'support',
 'trade',
 'something',
 'city',
 'second',
 'court',
 'come',
 'dont',
 'po