In [None]:
import pandas as pd
import mysql.connector
from dotenv import load_dotenv
import os

# 1. Connection settings
# Fixed settings for the MySQL instance this cell connects to.
database = "true_and_fake_news_detection_db"
host = "localhost"
port = 3306

# Credentials are read from the environment; values found in a .env file
# take precedence over variables that are already set (override=True).
load_dotenv(override=True)
username = os.environ.get("DB_USERNAME")
password = os.environ.get("DB_PASSWORD")

# 2. Read the cleaned, feature-enriched CSV into a DataFrame
csv_path = "./datasets/true_and_fake_news_clean_with_features.csv"
df = pd.read_csv(csv_path)
print("CSV loaded with shape:", df.shape)

# 3. Preprocess the DataFrame
# All pandas work happens before the database connection is opened, so a
# preprocessing failure can never leave a connection dangling.

# Replace missing-value markers with None
df = df.replace({pd.NA: None, pd.NaT: None})
df = df.where(pd.notnull(df), None)

# Strip surrounding whitespace, then parse dates written like
# "December 31, 2017". Values that do not match the format are coerced to
# NaT (errors="coerce") and end up as NULL in the database.
df["date"] = df["date"].str.strip()
df["date"] = pd.to_datetime(df["date"], format="%B %d, %Y", errors="coerce").dt.date

# 4. Convert DataFrame rows to a list of tuples
# One column list drives both the row tuples and the INSERT statement, so
# the two can never drift out of order.
insert_columns = [
    "title", "text", "subject", "date", "label",
    "clean_text", "label_str", "word_count", "char_count",
    "avg_word_length", "sentence_count", "polarity",
]
# Any remaining NaN/NaT (e.g. unparseable dates) is mapped to None here.
values = [
    tuple(None if pd.isna(x) else x for x in row)
    for row in df[insert_columns].to_numpy()
]

# 5. Prepare insert statement
# NOTE(review): INSERT IGNORE only skips rows that collide with a unique key.
# The table created below has no unique key other than the auto-increment id,
# so re-running this cell would append every row again, assuming the table
# was created by this script. IGNORE also downgrades data-conversion errors
# to warnings. Confirm whether a unique key or a plain INSERT is intended.
column_list = ", ".join(insert_columns)
placeholders = ", ".join(["%s"] * len(insert_columns))
sql = f"INSERT IGNORE INTO news_articles ({column_list}) VALUES ({placeholders})"

# 6. Connect to MySQL
conn = mysql.connector.connect(
    host=host,
    user=username,
    password=password,
    database=database,
    port=port
)
try:
    cursor = conn.cursor()
    try:
        # 7. Create table if not exists
        cursor.execute("""
CREATE TABLE IF NOT EXISTS news_articles (
    id INT AUTO_INCREMENT PRIMARY KEY,
    title TEXT,
    text LONGTEXT,
    subject VARCHAR(255),
    date DATE,
    label INT,
    clean_text LONGTEXT,
    label_str VARCHAR(10),
    word_count INT,
    char_count INT,
    avg_word_length FLOAT,
    sentence_count INT,
    polarity FLOAT
)
""")

        # 8. Insert rows in batches
        # The dataset is too large to send in a single statement. Each batch
        # is committed on its own, so batches committed before a failure
        # stay in the table.
        batch_size = 1000
        for i in range(0, len(values), batch_size):
            batch = values[i:i+batch_size]
            cursor.executemany(sql, batch)
            conn.commit()
            print(f"Inserted batch {i+1} to {i+len(batch)}")
    finally:
        # 9. Close the cursor even if table creation or an insert failed
        cursor.close()
finally:
    # Close the connection on every path, success or failure
    conn.close()

# Only reached when every batch was executed and committed without error.
print("All batches inserted successfully.")

CSV loaded with shape: (44898, 12)
Inserted batch 1 to 1000
Inserted batch 1001 to 2000
Inserted batch 2001 to 3000
Inserted batch 3001 to 4000
Inserted batch 4001 to 5000
Inserted batch 5001 to 6000
Inserted batch 6001 to 7000
Inserted batch 7001 to 8000
Inserted batch 8001 to 9000
Inserted batch 9001 to 10000
Inserted batch 10001 to 11000
Inserted batch 11001 to 12000
Inserted batch 12001 to 13000
Inserted batch 13001 to 14000
Inserted batch 14001 to 15000
Inserted batch 15001 to 16000
Inserted batch 16001 to 17000
Inserted batch 17001 to 18000
Inserted batch 18001 to 19000
Inserted batch 19001 to 20000
Inserted batch 20001 to 21000
Inserted batch 21001 to 22000
Inserted batch 22001 to 23000
Inserted batch 23001 to 24000
Inserted batch 24001 to 25000
Inserted batch 25001 to 26000
Inserted batch 26001 to 27000
Inserted batch 27001 to 28000
Inserted batch 28001 to 29000
Inserted batch 29001 to 30000
Inserted batch 30001 to 31000
Inserted batch 31001 to 32000
Inserted batch 32001 to 330