In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = (SparkSession.builder.appName("Converting articles into BoW Vectors").getOrCreate())

In [3]:
spark

In [4]:
data=spark.read.csv("all_news.csv",header=True,inferSchema=True)

In [5]:
print(data)

DataFrame[_c0: string, Unnamed: 0: string, date: string, year: string, month: string, day: string, author: string, title: string, article: string, url: string, section: string, publication: string]


In [7]:
from pyspark.sql.functions import udf, col, lower, regexp_replace,concat,lit,split,explode
from pyspark.ml.feature import Tokenizer, StopWordsRemover
import nltk
import string
import re
from nltk.stem.snowball import SnowballStemmer 
from pyspark.sql.functions import udf
import pyspark.sql.types as T
import pyspark.sql.functions as F
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from spacy.lang.en.stop_words import STOP_WORDS
from itertools import filterfalse
from nltk.corpus import wordnet as wn

In [8]:
data=data.select('title','article')


In [9]:
data.show()

+--------------------+--------------------+
|               title|             article|
+--------------------+--------------------+
|We should take co...|"This post is par...|
|Colts GM Ryan Gri...| The Indianapolis...|
|                null|                null|
|Trump denies repo...|DAVOS, Switzerlan...|
|France's Sarkozy ...|PARIS (Reuters) -...|
|Paris Hilton: Wom...|"Paris Hilton arr...|
|ECB's Coeure: If ...|BERLIN, June 17 (...|
|                null|                null|
|Venezuela detains...|CARACAS (Reuters)...|
|You Can Trick You...|"If only every da...|
|How to watch the ...|Google I/O, the c...|
|China is dismissi...|China is dismissi...|
|“Elizabeth Warren...|Elizabeth Warren ...|
|Hudson's Bay's ch...|(Reuters) - The s...|
|Joakim Noah's Vic...|Joakim Noah's ﻿mo...|
|Jermaine Jackson ...|"Jermaine Jackson...|
|UK PM May presses...|LONDON (Reuters) ...|
|Nancy Pelosi says...|"Nancy Pelosi is ...|
|The government of...|The nonpartisan d...|
|Mark Zuckerberg’s...|The threat

In [10]:
new_data = data.withColumn('text', F.concat(F.col('title'), F.col('article'))).drop(*data.columns[:2])

In [11]:
new_data.show()

+--------------------+
|                text|
+--------------------+
|We should take co...|
|Colts GM Ryan Gri...|
|                null|
|Trump denies repo...|
|France's Sarkozy ...|
|Paris Hilton: Wom...|
|ECB's Coeure: If ...|
|                null|
|Venezuela detains...|
|You Can Trick You...|
|How to watch the ...|
|China is dismissi...|
|“Elizabeth Warren...|
|Hudson's Bay's ch...|
|Joakim Noah's Vic...|
|Jermaine Jackson ...|
|UK PM May presses...|
|Nancy Pelosi says...|
|The government of...|
|Mark Zuckerberg’s...|
+--------------------+
only showing top 20 rows



In [12]:
non_null_data = new_data.dropna()

In [None]:
non_null_data.count()

In [None]:
non_null_data.show()

In [None]:
normalized_data = non_null_data.select(lower(col("text")).alias("normalized tokens"))

In [None]:
normalized_data.show(10)

In [19]:
def text_preproc(x):
    x = x.lower() #lower the text character
    x = x.replace("'","").replace('"', '')
    x = x.encode('ascii', 'ignore').decode() #remove unicode characters
    x = re.sub(r'https*\S+', ' ', x) #remove urls
    x = re.sub(r'@\S+', ' ', x) # remove mentions
    x = re.sub(r'#\S+', ' ', x) # remove hashtags
    x = re.sub(r'\'\w+', '', x) # remove ticks and the next chracter
    x = re.sub('[%s]' % re.escape(string.punctuation), ' ', x)# remove punctuations
    x = re.sub('[^a-zA-Z]',' ',x) #remove non alphabetic characters
    x = re.sub(r'\w*\d+\w*', '', x) # remove numbers
    x = re.sub(r'\s{2,}', ' ', x) # replace the overspaces
    x = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', ' ',x) #remove url
    x = re.sub('[\n]',' ',x) #remove newline character
   
    return x

In [20]:
data_clean_udf =F.udf(text_preproc, T.StringType())

In [None]:
clean_text=normalized_data.select(data_clean_udf(col("normalized_tokens")).alias("clean_text"))

In [None]:
clean_text.show(truncate=100)

In [None]:
tokenizer = Tokenizer(inputCol='clean_text', outputCol='words_token')
data_words_token = tokenizer.transform(clean_text).select('words_token')

In [None]:
data_words_token.show()

In [None]:
remover = StopWordsRemover(inputCol='words_token', outputCol='words_clean')
text = remover.transform(data_words_token).select('words_clean')

In [None]:
text.show(truncate=100)

In [None]:
snow_stemmer = SnowballStemmer(language='english')

In [None]:
stemmer_udf = udf(lambda tokens: [stemmer.stem(token) for token in tokens], ArrayType(StringType()))
data_stemmed = text.withColumn("words_stemmed", stemmer_udf("words_clean")).select('words_stemmed')


In [None]:
filter_length_udf = udf(lambda row: [x for x in row if len(x) >= 3], ArrayType(StringType()))
data_final_words = data_stemmed.withColumn('words', filter_length_udf(col('words_stemmed')))


In [17]:
from pyspark.sql.functions import explode


In [None]:
tokens = data_final_words.select(explode(col("words_stemmed")).alias("tokens"))

In [35]:
token_groups = tokens.groupby(col("tokens"))

In [None]:
token_groups

In [None]:
token_counts = token_groups.count()

In [21]:
token_counts.show()

In [22]:
token_counts.orderBy("count",ascending=False).show(100)
