In [94]:
from pyspark.sql.functions import lit
from pyspark.sql.functions import when, col, count
from pyspark.sql import SparkSession
import os
from pyspark.sql import functions as F

In [95]:
# dir_path = os.path.dirname(os.path.realpath(__file__))

spark = SparkSession.builder \
    .appName("CSV to DataFrame") \
    .getOrCreate()

df_fake = spark.read.csv(r"Bases_csv\fake.csv", header=True, inferSchema=True)
df_true = spark.read.csv(r"Bases_csv\true.csv", header=True, inferSchema=True)

df_fake_or_real = spark.read.csv(
    r"Bases_csv\fake_or_real_news.csv",
    header=True,  # Assumes the first line of the file contains headers
    inferSchema=True,  # Automatically infers column types
    quote='"',  # Specifies the quote character
    escape='"',  # Specifies the escape character
    multiLine=True  # Allows for parsing of multiline fields
)
df_news_articles = spark.read.csv(
    r"Bases_csv\news_articles.csv",
    header=True,  # Assumes the first line of the file contains headers
    inferSchema=True,  # Automatically infers column types
    quote='"',  # Specifies the quote character
    escape='"',  # Specifies the escape character
    multiLine=True  # Allows for parsing of multiline fields
)
df_WELFake_Dataset = spark.read.csv(
    r"Bases_csv\WELFake_Dataset.csv",
    header=True,  # Assumes the first line of the file contains headers
    inferSchema=True,  # Automatically infers column types
    quote='"',  # Specifies the quote character
    escape='"',  # Specifies the escape character
    multiLine=True  # Allows for parsing of multiline fields
)
print("CSV FAKE")
df_fake.show()
print("CSV TRUE")
df_true.show()
print("fake_or_real_news")
df_fake_or_real.show()
print("news_articles")
df_news_articles.show()
print("WELFake_Dataset")
df_WELFake_Dataset.show()



CSV FAKE
+--------------------+--------------------+-------+-----------------+
|               title|                text|subject|             date|
+--------------------+--------------------+-------+-----------------+
| Donald Trump Sen...|Donald Trump just...|   News|December 31, 2017|
| Drunk Bragging T...|House Intelligenc...|   News|December 31, 2017|
| Sheriff David Cl...|On Friday, it was...|   News|December 30, 2017|
| Trump Is So Obse...|On Christmas day,...|   News|December 29, 2017|
| Pope Francis Jus...|Pope Francis used...|   News|December 25, 2017|
| Racist Alabama C...|The number of cas...|   News|December 25, 2017|
| Fresh Off The Go...|Donald Trump spen...|   News|December 23, 2017|
| Trump Said Some ...|In the wake of ye...|   News|December 23, 2017|
| Former CIA Direc...|Many people have ...|   News|December 22, 2017|
| WATCH: Brand-New...|Just when you mig...|   News|December 21, 2017|
| Papa John’s Foun...|A centerpiece of ...|   News|December 21, 2017|
| WATCH: Pa

## Estandarización de esquemas: Titulo, texto, label

In [96]:
#### 1 = Fake
#### 0 = Real

df_fake_est = df_fake.select("title", "text")
df_fake_est = df_fake_est.withColumn("label", lit('FAKE'))
df_fake_est = df_fake_est.withColumn("Label_bin", lit(1))

df_true_est = df_true.select("title", "text")
df_true_est = df_true_est.withColumn("label", lit('TRUE'))
df_true_est = df_true_est.withColumn("Label_bin", lit(0))

df_fake_or_real_est = df_fake_or_real.select("title", "text",  "label")
df_fake_or_real_est = df_fake_or_real_est.withColumn(
    "Label_bin",
    when(col("label") == "FAKE", 1).otherwise(0)
)

df_news_articles_est = df_news_articles.select("title", "text",  "label").where(col("label") != "NULL")
df_news_articles_est = df_news_articles_est.withColumn(
    "Label_bin",
    when(col("label") == "Fake", 1).otherwise(0)
)

df_WELFake_Dataset_est = df_WELFake_Dataset.select("title", "text", "label", col('label').alias("Label_bin"))



In [97]:
combined_df = df_fake_or_real_est.union(df_news_articles_est.union(df_WELFake_Dataset_est.union(df_fake_est.union(df_true_est))))
combined_df_til_text_label= combined_df.select('title','text',col('Label_bin').alias("label"))

## Verificación de duplicados y labels consistentes

In [98]:
print("Distinct labels")
diff_labels = combined_df_til_text_label.select('label').distinct().show()
# Step 1: Count total number of rows
total_rows = combined_df_til_text_label.count()
print(f"Total rows {total_rows}")
# Step 2: Count number of distinct rows
distinct_rows = combined_df_til_text_label.distinct().count()
print(f"Distinct rows {distinct_rows}")
# Step 3: Calculate number of duplicate rows
duplicate_rows = total_rows - distinct_rows

# Print the number of duplicate rows
print(f"Number of duplicate rows: {duplicate_rows}")

combined_df_distinct = combined_df_til_text_label.dropDuplicates()


Distinct labels


+-----+
|label|
+-----+
|    1|
|    0|
+-----+

Total rows 125470
Distinct rows 66793
Number of duplicate rows: 58677


In [99]:
print("DF_combinado")
print(combined_df_distinct.count())
print("CSV FAKE")
print(df_fake.count())
print("CSV TRUE")
print(df_true.count())
print("fake_or_real_news")
print(df_fake_or_real.count())
print("news_articles")
print(df_news_articles.count())
print("WELFake_Dataset")
print(df_WELFake_Dataset.count())

DF_combinado


66793
CSV FAKE
23489
CSV TRUE
21417
fake_or_real_news
6335
news_articles
2096
WELFake_Dataset
72134


## Guardar como parquet

In [100]:
combined_df_distinct.write.parquet(r"C:\Users\Ales9914\Desktop\Prueba_venv\Unificacion_bases\Base_news_corregido")