In [50]:
from pyspark.sql.functions import lit
from pyspark.sql.functions import when, col, count
from pyspark.sql import SparkSession
import os
from pyspark.sql import functions as F

In [51]:


spark = SparkSession.builder \
    .appName("CSV to DataFrame") \
    .getOrCreate()

df_fake = spark.read.csv("fake.csv", header=True, inferSchema=True)
df_fake_or_real = spark.read.csv(
    "fake_or_real_news.csv",
    header=True,  # Assumes the first line of the file contains headers
    inferSchema=True,  # Automatically infers column types
    quote='"',  # Specifies the quote character
    escape='"',  # Specifies the escape character
    multiLine=True  # Allows for parsing of multiline fields
)
df_news_articles = spark.read.csv(
    "news_articles.csv",
    header=True,  # Assumes the first line of the file contains headers
    inferSchema=True,  # Automatically infers column types
    quote='"',  # Specifies the quote character
    escape='"',  # Specifies the escape character
    multiLine=True  # Allows for parsing of multiline fields
)
df_WELFake_Dataset = spark.read.csv(
    "WELFake_Dataset.csv",
    header=True,  # Assumes the first line of the file contains headers
    inferSchema=True,  # Automatically infers column types
    quote='"',  # Specifies the quote character
    escape='"',  # Specifies the escape character
    multiLine=True  # Allows for parsing of multiline fields
)
print("CSV FAKE")
df_fake.show()
print("fake_or_real_news")
df_fake_or_real.show()
print("news_articles")
df_news_articles.show()
print("WELFake_Dataset")
df_WELFake_Dataset.show()



CSV FAKE
+--------------------+--------------------+-------+-----------------+
|               title|                text|subject|             date|
+--------------------+--------------------+-------+-----------------+
| Donald Trump Sen...|Donald Trump just...|   News|December 31, 2017|
| Drunk Bragging T...|House Intelligenc...|   News|December 31, 2017|
| Sheriff David Cl...|On Friday, it was...|   News|December 30, 2017|
| Trump Is So Obse...|On Christmas day,...|   News|December 29, 2017|
| Pope Francis Jus...|Pope Francis used...|   News|December 25, 2017|
| Racist Alabama C...|The number of cas...|   News|December 25, 2017|
| Fresh Off The Go...|Donald Trump spen...|   News|December 23, 2017|
| Trump Said Some ...|In the wake of ye...|   News|December 23, 2017|
| Former CIA Direc...|Many people have ...|   News|December 22, 2017|
| WATCH: Brand-New...|Just when you mig...|   News|December 21, 2017|
| Papa John’s Foun...|A centerpiece of ...|   News|December 21, 2017|
| WATCH: Pa

In [52]:
df_WELFake_Dataset.where(col("label").like("%beans%")).show()

df_WELFake_Dataset.select('label').distinct().show()

+--------------------+--------------------+------+
|               title|                text| label|
+--------------------+--------------------+------+
| be it public or ...| to land. Be it beef| beans|
| be it public or ...| to land. Be it beef| beans|
+--------------------+--------------------+------+

+--------------------+
|               label|
+--------------------+
| даже невзирая на...|
| насколько мы пон...|
| I saw his hands ...|
| you will find th...|
|               beans|
| makeshift rafts ...|
| abortion……. I’ve...|
|       to never hear|
| to take over con...|
| и для моих колле...|
| fluoride water a...|
| than followers o...|
| and they are sti...|
|     pastor and wife|
| and execute a co...|
| just as Wednesda...|
| чтобы включить к...|
|     опять обогатить|
| the acting chair...|
|             jewelry|
+--------------------+
only showing top 20 rows



## Estandarización de esquemas: Titulo, texto, label

In [59]:
#### 1 = Fake
#### 0 = Real

df_fake_est = df_fake.select("title", "text")
df_fake_est = df_fake_est.withColumn("label", lit('FAKE'))
df_fake_est = df_fake_est.withColumn("Label_bin", lit(1))

df_fake_or_real_est = df_fake_or_real.select("title", "text",  "label")
df_fake_or_real_est = df_fake_or_real_est.withColumn(
    "Label_bin",
    when(col("label") == "FAKE", 1).otherwise(0)
)

df_news_articles_est = df_news_articles.select("title", "text",  "label").where(col("label") != "NULL")
df_news_articles_est = df_news_articles_est.withColumn(
    "Label_bin",
    when(col("label") == "Fake", 1).otherwise(0)
)

df_WELFake_Dataset_est = df_WELFake_Dataset.select("title", "text",  "label").where((col("label") == "Fake") | (col("label") == "Real"))
df_WELFake_Dataset_est = df_WELFake_Dataset_est.withColumn(
    "Label_bin",
    when(col("label") == "Fake", 1).otherwise(0)
)

print("WELFake_Dataset")
print(df_WELFake_Dataset_est.dropDuplicates(['title','text']).count())


WELFake_Dataset
63580


In [54]:
combined_df = df_fake_or_real_est.union(df_news_articles_est.union(df_WELFake_Dataset_est.union(df_fake_est)))

## Verificación de duplicados y labels consistentes

In [55]:
print("Distinct labels")
diff_labels = combined_df.select('Label_bin').distinct().show()

combined_df = combined_df.dropDuplicates(['title','text'])


Distinct labels


+---------+
|Label_bin|
+---------+
|        1|
|        0|
+---------+



In [56]:
combined_df = combined_df.select('title','text',col('Label_bin').alias("label"))


In [57]:
print("DF_combinado")
print(combined_df.count())
print("CSV FAKE")
print(df_fake.count())
print("fake_or_real_news")
print(df_fake_or_real.count())
print("news_articles")
print(df_news_articles.count())
print("WELFake_Dataset")
print(df_WELFake_Dataset.count())

DF_combinado
66568
CSV FAKE
23489
fake_or_real_news
6335
news_articles
2096
WELFake_Dataset
78227


## Guardar como parquet

In [58]:
combined_df.write.parquet(r"D:\Universidad\Maestria ciencias de los datos y analitica\Mineria en grandes volumenes de informacion\Proyecto final mineria\Base_news")

AnalysisException: [PATH_ALREADY_EXISTS] Path file:/D:/Universidad/Maestria ciencias de los datos y analitica/Mineria en grandes volumenes de informacion/Proyecto final mineria/Base_news already exists. Set mode as "overwrite" to overwrite the existing path.