### Extract all files in train.zip into train folder

In [0]:
# One-time setup, left disabled: unpack train.zip into ./train.
# A later cell reads the article text files from train/train_articles/.
#import zipfile
#with zipfile.ZipFile("train.zip",'r') as zip_ref:
#    zip_ref.extractall("train")

### Create Spark Session

In [0]:
from pyspark.sql import SparkSession

# Obtain the session used by every cell below, registered under the app name 'FakeNews'.
spark = (
    SparkSession.builder
    .appName('FakeNews')
    .getOrCreate()
)

In [0]:
# Load the claims (one JSON record per claim) into a DataFrame.
df_org = spark.read.format('json').load('Data/train.json')

In [0]:
# Inspect the column names and types of the raw claims data.
df_org.printSchema()

root
 |-- claim: string (nullable = true)
 |-- claimant: string (nullable = true)
 |-- date: string (nullable = true)
 |-- id: long (nullable = true)
 |-- label: long (nullable = true)
 |-- related_articles: array (nullable = true)
 |    |-- element: long (containsNull = true)



In [0]:
# Preview the first 20 rows, with long cell values truncated.
df_org.show(n=20, truncate=True)

+--------------------+-----------------+----------+---+-----+--------------------+
|               claim|         claimant|      date| id|label|    related_articles|
+--------------------+-----------------+----------+---+-----+--------------------+
|A line from Georg...|                 |2017-07-17|  0|    0|[122094, 122580, ...|
|Maine legislature...|                 |2018-03-17|  1|    2|[106868, 127320, ...|
|A 17-year-old gir...|                 |2018-07-18|  4|    1|[132130, 132132, ...|
|In 1988 author Ro...|                 |2019-02-04|  5|    2|[123254, 123418, ...|
|When it comes to ...|  Hillary Clinton|2016-03-22|  6|    2|[41099, 89899, 72...|
|Rhode Island is "...|Leonidas Raptakis|2014-02-11|  7|    2|[8284, 3768, 2009...|
|The poorest count...|         Jim Webb|2014-11-19|  8|    1|      [70709, 70708]|
|Koch Industries p...|                 |2013-07-18|  9|    0|[120591, 120592, ...|
|"Minnesota, Michi...|        Robin Vos|2013-08-22| 11|    1|[69547, 80095, 79...|
|"FB

In [0]:
# Print the first five rows untruncated, each followed by two blank lines.
for record in df_org.take(5):
    print(record, end='\n\n\n')

Row(claim="A line from George Orwell's novel 1984 predicts the power of smartphones.", claimant='', date='2017-07-17', id=0, label=0, related_articles=[122094, 122580, 130685, 134765])


Row(claim='Maine legislature candidate Leslie Gibson insulted Parkland shooting survivor and activist Emma Gonzalez via Twitter.', claimant='', date='2018-03-17', id=1, label=2, related_articles=[106868, 127320, 128060])


Row(claim='A 17-year-old girl named Alyssa Carson is being trained by NASA to become an astronaut.', claimant='', date='2018-07-18', id=4, label=1, related_articles=[132130, 132132, 149722])


Row(claim='In 1988 author Roald Dahl penned an open letter urging parents to have their children vaccinated against measles.', claimant='', date='2019-02-04', id=5, label=2, related_articles=[123254, 123418, 127464])


Row(claim='When it comes to fighting terrorism, "Another thing we know that does not work, based on lots of empirical evidence, is torture."', claimant='Hillary Clinton', date='2

### Create a new column with the number of related articles (not actually needed)

In [0]:
from pyspark.sql.functions import col,udf,size
from pyspark.sql.types import IntegerType,StringType
# Count the related articles per claim with Spark's built-in size() instead of
# a Python UDF: it runs inside the JVM (no per-row Python round trip) and does
# not raise when related_articles is null, which the schema allows.
df_org = df_org.withColumn("articleCount",size(col("related_articles")))

In [0]:
# Preview the first 20 rows, now including the articleCount column.
df_org.show(n=20, truncate=True)

+--------------------+-----------------+----------+---+-----+--------------------+------------+
|               claim|         claimant|      date| id|label|    related_articles|articleCount|
+--------------------+-----------------+----------+---+-----+--------------------+------------+
|A line from Georg...|                 |2017-07-17|  0|    0|[122094, 122580, ...|           4|
|Maine legislature...|                 |2018-03-17|  1|    2|[106868, 127320, ...|           3|
|A 17-year-old gir...|                 |2018-07-18|  4|    1|[132130, 132132, ...|           3|
|In 1988 author Ro...|                 |2019-02-04|  5|    2|[123254, 123418, ...|           3|
|When it comes to ...|  Hillary Clinton|2016-03-22|  6|    2|[41099, 89899, 72...|           6|
|Rhode Island is "...|Leonidas Raptakis|2014-02-11|  7|    2|[8284, 3768, 2009...|           6|
|The poorest count...|         Jim Webb|2014-11-19|  8|    1|      [70709, 70708]|           2|
|Koch Industries p...|                 |

### Combine Data

In [0]:
@udf(returnType=StringType())
def getArticleContent(articles):
    """Concatenate the lower-cased text of every related article.

    Args:
        articles: sequence of article ids. Each id ``n`` is read from
            ``train/train_articles/<n>.txt``. May be ``None`` when the row's
            ``related_articles`` value is null.

    Returns:
        The article texts in the given order, each lower-cased and followed by
        a single space, or ``None`` when ``articles`` is ``None``.

    Raises:
        OSError: if an article file cannot be opened.
    """
    if articles is None:
        # The column is nullable; iterating None would fail the whole Spark task.
        return None
    pieces = []
    for val in articles:
        filePath = "train/train_articles/"+str(val)+".txt"
        # 'with' closes each handle as soon as it is read; this runs once per
        # row, so unclosed handles would accumulate in long-lived workers.
        with open(filePath,'r') as article_file:
            pieces.append(article_file.read().lower())
        # Keep the single-space separator after every article.
        pieces.append(" ")
    # Join once at the end rather than growing a string on every iteration.
    return "".join(pieces)
df_combined = df_org.withColumn("Article_Content",getArticleContent(col("related_articles")))

In [0]:
# Preview the first 20 rows with the new Article_Content column.
df_combined.show(n=20, truncate=True)

+--------------------+-----------------+----------+---+-----+--------------------+------------+--------------------+
|               claim|         claimant|      date| id|label|    related_articles|articleCount|     Article_Content|
+--------------------+-----------------+----------+---+-----+--------------------+------------+--------------------+
|A line from Georg...|                 |2017-07-17|  0|    0|[122094, 122580, ...|           4|1984: george orwe...|
|Maine legislature...|                 |2018-03-17|  1|    2|[106868, 127320, ...|           3|republican who cr...|
|A 17-year-old gir...|                 |2018-07-18|  4|    1|[132130, 132132, ...|           3|first person to c...|
|In 1988 author Ro...|                 |2019-02-04|  5|    2|[123254, 123418, ...|           3|how dangerous is ...|
|When it comes to ...|  Hillary Clinton|2016-03-22|  6|    2|[41099, 89899, 72...|           6|remarks on counte...|
|Rhode Island is "...|Leonidas Raptakis|2014-02-11|  7|    2|[82

In [0]:
# Combined article text of the first row
df_combined.first()['Article_Content']

'1984: george orwell predicted 2017 almost 70 years ago\napril, 1984. 13:00. comrade 6079, winston smith, thinks a thought, starts a diary, and falls in love. but big brother is always watching. set in a world where an invasive government keeps a malevolently watchful eye on its citizens, this radical and much-lauded staging explores surveillance, identity and why orwell?s vision of the future is as relevant now as ever. now with a brand new australian cast, we bring adapter-directors robert icke and duncan macmillan?s thrilling vision back to australia for a national tour.\n\nthere was no crystal ball, no religious apparitions or voices from the dead when george orwell wrote nineteen eighty-four, accurately depicted our present almost 70 years ago.\n\nit’s no coincidence that orwell’s 1949 novel had climbed back into multiple bestseller lists this year — people have realised its scarily accurate foretelling.\n\nin 2013 its relevance — still strong and worryingly reliable — led to the 

### Remove Special Characters from Article_Content

In [0]:
from pyspark.sql.functions import regexp_replace
# Delete everything except ASCII letters, digits and whitespace.
# No '+' inside the brackets: within a character class it is a literal plus
# sign (not a quantifier) and would exempt '+' characters from removal.
df_combined = df_combined.withColumn("Article_Content",regexp_replace(col("Article_Content"),r"[^a-zA-Z0-9\s]",""))
df_combined.show()

+--------------------+-----------------+----------+---+-----+--------------------+------------+--------------------+
|               claim|         claimant|      date| id|label|    related_articles|articleCount|     Article_Content|
+--------------------+-----------------+----------+---+-----+--------------------+------------+--------------------+
|A line from Georg...|                 |2017-07-17|  0|    0|[122094, 122580, ...|           4|1984 george orwel...|
|Maine legislature...|                 |2018-03-17|  1|    2|[106868, 127320, ...|           3|republican who cr...|
|A 17-year-old gir...|                 |2018-07-18|  4|    1|[132130, 132132, ...|           3|first person to c...|
|In 1988 author Ro...|                 |2019-02-04|  5|    2|[123254, 123418, ...|           3|how dangerous is ...|
|When it comes to ...|  Hillary Clinton|2016-03-22|  6|    2|[41099, 89899, 72...|           6|remarks on counte...|
|Rhode Island is "...|Leonidas Raptakis|2014-02-11|  7|    2|[82

In [0]:
# First row's article text after special-character removal
df_combined.first()['Article_Content']

'1984 george orwell predicted 2017 almost 70 years ago\napril 1984 1300 comrade 6079 winston smith thinks a thought starts a diary and falls in love but big brother is always watching set in a world where an invasive government keeps a malevolently watchful eye on its citizens this radical and muchlauded staging explores surveillance identity and why orwells vision of the future is as relevant now as ever now with a brand new australian cast we bring adapterdirectors robert icke and duncan macmillans thrilling vision back to australia for a national tour\n\nthere was no crystal ball no religious apparitions or voices from the dead when george orwell wrote nineteen eightyfour accurately depicted our present almost 70 years ago\n\nits no coincidence that orwells 1949 novel had climbed back into multiple bestseller lists this year  people have realised its scarily accurate foretelling\n\nin 2013 its relevance  still strong and worryingly reliable  led to the west end smash hit adaptation 

### Replace line breaks and tabs (\n, \t) with a space

In [0]:
# Collapse every run of carriage returns, newlines and tabs into one space.
# Spark uses Java regex, which has no /pattern/flags literal syntax: written
# that way the slashes and the 'i' are matched literally, so the tab branch
# would only match tabs followed by "/i" and tabs would never be replaced.
df_combined = df_combined.withColumn("Article_Content",regexp_replace(col("Article_Content"),r"[\r\n\t]+"," "))

In [0]:
# Preview the first 20 rows after the line-break replacement.
df_combined.show(n=20, truncate=True)

+--------------------+-----------------+----------+---+-----+--------------------+------------+--------------------+
|               claim|         claimant|      date| id|label|    related_articles|articleCount|     Article_Content|
+--------------------+-----------------+----------+---+-----+--------------------+------------+--------------------+
|A line from Georg...|                 |2017-07-17|  0|    0|[122094, 122580, ...|           4|1984 george orwel...|
|Maine legislature...|                 |2018-03-17|  1|    2|[106868, 127320, ...|           3|republican who cr...|
|A 17-year-old gir...|                 |2018-07-18|  4|    1|[132130, 132132, ...|           3|first person to c...|
|In 1988 author Ro...|                 |2019-02-04|  5|    2|[123254, 123418, ...|           3|how dangerous is ...|
|When it comes to ...|  Hillary Clinton|2016-03-22|  6|    2|[41099, 89899, 72...|           6|remarks on counte...|
|Rhode Island is "...|Leonidas Raptakis|2014-02-11|  7|    2|[82

In [0]:
# First row's article text after the line-break replacement
df_combined.first()['Article_Content']

'1984 george orwell predicted 2017 almost 70 years ago april 1984 1300 comrade 6079 winston smith thinks a thought starts a diary and falls in love but big brother is always watching set in a world where an invasive government keeps a malevolently watchful eye on its citizens this radical and muchlauded staging explores surveillance identity and why orwells vision of the future is as relevant now as ever now with a brand new australian cast we bring adapterdirectors robert icke and duncan macmillans thrilling vision back to australia for a national tour there was no crystal ball no religious apparitions or voices from the dead when george orwell wrote nineteen eightyfour accurately depicted our present almost 70 years ago its no coincidence that orwells 1949 novel had climbed back into multiple bestseller lists this year  people have realised its scarily accurate foretelling in 2013 its relevance  still strong and worryingly reliable  led to the west end smash hit adaptation of the boo

### Repeat the same cleaning for the Claim column

In [0]:
# Delete everything except ASCII letters, digits and whitespace from the claim.
# No '+' inside the brackets: within a character class it is a literal plus
# sign (not a quantifier) and would exempt '+' characters from removal.
df_combined = df_combined.withColumn("Claim",regexp_replace(col("Claim"),r"[^a-zA-Z0-9\s]",""))

In [0]:
# Collapse every run of carriage returns, newlines and tabs into one space.
# Spark uses Java regex, which has no /pattern/flags literal syntax: written
# that way the slashes and the 'i' are matched literally, so the tab branch
# would only match tabs followed by "/i" and tabs would never be replaced.
df_combined = df_combined.withColumn("Claim",regexp_replace(col("Claim"),r"[\r\n\t]+"," "))

In [0]:
# Preview the first 20 rows after cleaning the Claim column.
df_combined.show(n=20, truncate=True)

+--------------------+-----------------+----------+---+-----+--------------------+------------+--------------------+
|               Claim|         claimant|      date| id|label|    related_articles|articleCount|     Article_Content|
+--------------------+-----------------+----------+---+-----+--------------------+------------+--------------------+
|A line from Georg...|                 |2017-07-17|  0|    0|[122094, 122580, ...|           4|1984 george orwel...|
|Maine legislature...|                 |2018-03-17|  1|    2|[106868, 127320, ...|           3|republican who cr...|
|A 17yearold girl ...|                 |2018-07-18|  4|    1|[132130, 132132, ...|           3|first person to c...|
|In 1988 author Ro...|                 |2019-02-04|  5|    2|[123254, 123418, ...|           3|how dangerous is ...|
|When it comes to ...|  Hillary Clinton|2016-03-22|  6|    2|[41099, 89899, 72...|           6|remarks on counte...|
|Rhode Island is a...|Leonidas Raptakis|2014-02-11|  7|    2|[82

In [0]:
# First row's claim text after cleaning
df_combined.first()['Claim']

'A line from George Orwells novel 1984 predicts the power of smartphones'

### Convert Claim to lower case

In [0]:
from pyspark.sql.functions import lower

# Lower-case the claim text; the article text was already lower-cased when it was read.
df_combined = df_combined.withColumn(
    "Claim",
    lower(df_combined["Claim"]),
)
df_combined.show(n=20, truncate=True)

+--------------------+-----------------+----------+---+-----+--------------------+------------+--------------------+
|               Claim|         claimant|      date| id|label|    related_articles|articleCount|     Article_Content|
+--------------------+-----------------+----------+---+-----+--------------------+------------+--------------------+
|a line from georg...|                 |2017-07-17|  0|    0|[122094, 122580, ...|           4|1984 george orwel...|
|maine legislature...|                 |2018-03-17|  1|    2|[106868, 127320, ...|           3|republican who cr...|
|a 17yearold girl ...|                 |2018-07-18|  4|    1|[132130, 132132, ...|           3|first person to c...|
|in 1988 author ro...|                 |2019-02-04|  5|    2|[123254, 123418, ...|           3|how dangerous is ...|
|when it comes to ...|  Hillary Clinton|2016-03-22|  6|    2|[41099, 89899, 72...|           6|remarks on counte...|
|rhode island is a...|Leonidas Raptakis|2014-02-11|  7|    2|[82

### Save the cleaned data to CSV

In [0]:
# Inspect the columns available before selecting the ones to keep.
df_combined.printSchema()

root
 |-- Claim: string (nullable = true)
 |-- claimant: string (nullable = true)
 |-- date: string (nullable = true)
 |-- id: long (nullable = true)
 |-- label: long (nullable = true)
 |-- related_articles: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- articleCount: integer (nullable = true)
 |-- Article_Content: string (nullable = true)



In [0]:
# Keep only the text features and the label.
df_cleaned = df_combined.select(
    "Claim",
    "Claimant",
    "Article_Content",
    "label",
)

In [0]:
# Confirm the schema of the reduced DataFrame.
df_cleaned.printSchema()

root
 |-- Claim: string (nullable = true)
 |-- Claimant: string (nullable = true)
 |-- Article_Content: string (nullable = true)
 |-- label: long (nullable = true)



In [0]:
# Preview the first 20 rows of the cleaned data.
df_cleaned.show(n=20, truncate=True)

+--------------------+-----------------+--------------------+-----+
|               Claim|         Claimant|     Article_Content|label|
+--------------------+-----------------+--------------------+-----+
|a line from georg...|                 |1984 george orwel...|    0|
|maine legislature...|                 |republican who cr...|    2|
|a 17yearold girl ...|                 |first person to c...|    1|
|in 1988 author ro...|                 |how dangerous is ...|    2|
|when it comes to ...|  Hillary Clinton|remarks on counte...|    2|
|rhode island is a...|Leonidas Raptakis|lis  code of virg...|    2|
|the poorest count...|         Jim Webb|counties in appal...|    1|
|koch industries p...|                 |update confrontin...|    0|
|minnesota michiga...|        Robin Vos|robin vos discuss...|    1|
|fbi uniform crime...|     Nick Schroer|fbi over four tim...|    1|
| pelosi sinks to ...|  Western Journal|pelosi sinks to n...|    0|
|socialist teacher...|                 |r wolfe 

In [0]:
# Persist the cleaned rows as CSV part files under the CleanedNews.csv directory.
df_cleaned.write.format("csv").save("CleanedNews.csv")

### Read cleaned data.

In [0]:
# Read the whole output directory rather than one hard-coded part file.
# The part file name embeds a UUID that changes on every write, and a single
# part file holds only one partition of the data, so pointing at
# 'part-00000-...' breaks on re-run and can silently drop rows.
df = spark.read.csv('CleanedNews.csv',inferSchema=True)
df.show()

+--------------------+-----------------+--------------------+---+
|                 _c0|              _c1|                 _c2|_c3|
+--------------------+-----------------+--------------------+---+
|a line from georg...|             null|1984 george orwel...|  0|
|maine legislature...|             null|republican who cr...|  2|
|a 17yearold girl ...|             null|first person to c...|  1|
|in 1988 author ro...|             null|how dangerous is ...|  2|
|when it comes to ...|  Hillary Clinton|remarks on counte...|  2|
|rhode island is a...|Leonidas Raptakis|lis  code of virg...|  2|
|the poorest count...|         Jim Webb|counties in appal...|  1|
|koch industries p...|             null|update confrontin...|  0|
|minnesota michiga...|        Robin Vos|robin vos discuss...|  1|
|fbi uniform crime...|     Nick Schroer|fbi over four tim...|  1|
|pelosi sinks to n...|  Western Journal|pelosi sinks to n...|  0|
|socialist teacher...|             null|r wolfe on twitte...|  1|
|says that

In [0]:
# Check the inferred column types of the re-loaded data (columns are unnamed: _c0.._c3).
df.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: integer (nullable = true)

