In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import sqlalchemy

from sqlalchemy import create_engine, Column, Integer, String
from sqlalchemy.ext.declarative import declarative_base

import pickle
import os
from dotenv import load_dotenv
import pymysql

In [2]:
#with gzip.open("../preprocesado/meneame_procesado.pkl.gz", "rb") as f:
#    df = pickle.load(f)


In [3]:
# Locate the preprocessed data relative to the notebook's working directory.
current_dir = os.path.abspath(os.getcwd())
print(f"Directorio actual: {current_dir}")

# Climb three levels to reach the shared project root.
root_dir = os.path.abspath(os.path.join(current_dir, "../../.."))
print(f"Directorio raíz común: {root_dir}")

# Directory that holds the pickled chunks.
directory = os.path.join(root_dir, "src/00.data/preprocesado/")
print(f"Ruta de directorio ajustada: {directory}")

# os.listdir() returns entries in arbitrary order. Sorting makes the row order
# of the concatenated frame reproducible, which matters because the surrogate
# ids assigned later with pd.factorize depend on the order of appearance.
# NOTE: the sort is lexicographic, so "_10" sorts before "_2".
archivos_pkl = sorted(
    nombre
    for nombre in os.listdir(directory)
    if nombre.startswith("meneame_procesado_") and nombre.endswith(".pkl")
)

# Fail early with an explicit message instead of pandas' generic
# "No objects to concatenate" error.
if not archivos_pkl:
    raise FileNotFoundError(
        f"No se encontraron archivos 'meneame_procesado_*.pkl' en {directory}"
    )

# Read every chunk and collect the resulting DataFrames.
df_lista = []
for archivo in archivos_pkl:
    file_path = os.path.join(directory, archivo)
    # NOTE(security): pickle.load can execute arbitrary code; only load files
    # produced by this project's own preprocessing step.
    with open(file_path, "rb") as f:
        df_chunk = pickle.load(f)
    df_lista.append(df_chunk)
    print(f"Cargado: {archivo} con {len(df_chunk)} filas")

# Combine all chunks into a single frame with a fresh 0..n-1 index.
df = pd.concat(df_lista, ignore_index=True)

# Report the size of the final DataFrame.
print(f"DataFrame final con {df.shape[0]} filas y {df.shape[1]} columnas")

Directorio actual: /Users/lucija/Projects/Analisis-de-noticias/src/00.data/SQL
Directorio raíz común: /Users/lucija/Projects/Analisis-de-noticias
Ruta de directorio ajustada: /Users/lucija/Projects/Analisis-de-noticias/src/00.data/preprocesado/
Cargado: meneame_procesado_3.pkl con 87563 filas
Cargado: meneame_procesado_2.pkl con 100000 filas
Cargado: meneame_procesado_1.pkl con 100000 filas
DataFrame final con 287563 filas y 19 columnas


In [5]:
# Cast both location columns to plain object dtype, then label every missing
# value as 'Desconocido'.
location_cols = ['provincia', 'comunidad']
df = df.astype(dict.fromkeys(location_cols, 'object'))

for col in location_cols:
    df[col] = df[col].fillna('Desconocido')

In [7]:
# Give each distinct value of these columns a 1-based integer id so the data
# can be split into relational tables.
# NOTE: pd.factorize codes missing values as -1, so they end up with id 0.
for col in ('user', 'source', 'provincia', 'category'):
    codes, _ = pd.factorize(df[col])
    df[f'{col}_id'] = codes + 1

In [8]:
# Lookup tables: each pairs a surrogate id with its label, one row per
# distinct (id, label) combination.
df_user = df.loc[:, ["user_id", "user"]].drop_duplicates()
df_source = df.loc[:, ["source_id", "source"]].drop_duplicates()
df_category = df.loc[:, ["category_id", "category"]].drop_duplicates()

# Location table: keep the first row seen for each provincia_id, then discard
# any row that still has a missing value.
df_location = (
    df.loc[:, ["provincia_id", "provincia", "comunidad"]]
    .drop_duplicates(subset="provincia_id")
    .dropna()
)

# News table: the article's own fields plus the four foreign-key ids.
news_columns = [
    "news_id", "title", "content", "meneos", "clicks", "karma",
    "positive_votes", "anonymous_votes", "negative_votes", "comments",
    "published_date", "source_link", "scraped_date",
    "user_id", "source_id", "category_id", "provincia_id",
]
df_news = df.loc[:, news_columns].drop_duplicates()

In [9]:
# Create the `meneame` database and its tables if they do not exist yet.
# Credentials come from the environment (optionally loaded from a .env file).
load_dotenv()

user = os.getenv("DB_USER")
password = os.getenv("DB_PASSWORD")
host = os.getenv("HOST", "localhost")

# Bind both names before the try block so the cleanup in `finally` never hits
# an unbound name (or a stale object from an earlier run) when connecting fails.
connection = None
cursor = None

try:
    connection = pymysql.connect(host=host, user=user, password=password)
    cursor = connection.cursor()

    cursor.execute("CREATE DATABASE IF NOT EXISTS meneame;")
    connection.select_db("meneame")

    # The four lookup tables are declared before news_info_table because its
    # foreign keys reference them.
    sql_schema = """
    CREATE TABLE IF NOT EXISTS user_table (
        user_id INT PRIMARY KEY NOT NULL,
        user VARCHAR(50) NOT NULL
    );

    CREATE TABLE IF NOT EXISTS category_table (
        category_id INT PRIMARY KEY,
        category VARCHAR(50) 
    );

    CREATE TABLE IF NOT EXISTS source_table (
        source_id BIGINT PRIMARY KEY,
        source VARCHAR(255)  
    );

    CREATE TABLE IF NOT EXISTS location_table (
        provincia_id INT PRIMARY KEY,
        provincia VARCHAR(50),
        comunidad VARCHAR(50)
    );

    CREATE TABLE IF NOT EXISTS news_info_table (
        news_id BIGINT PRIMARY KEY,
        title TEXT,
        content TEXT,
        category_id INT,
        meneos INT,
        clicks INT,
        karma INT,
        positive_votes INT,
        anonymous_votes INT,
        negative_votes INT,
        comments INT,
        published_date DATETIME,
        scraped_date DATETIME,
        user_id INT,
        source_id BIGINT,
        source_link TEXT,
        provincia_id INT,
        FOREIGN KEY (category_id) REFERENCES category_table(category_id),
        FOREIGN KEY (user_id) REFERENCES user_table(user_id),
        FOREIGN KEY (provincia_id) REFERENCES location_table(provincia_id),
        FOREIGN KEY (source_id) REFERENCES source_table(source_id)
    );
    """

    # Each statement is sent on its own; the schema contains no ';' inside
    # literals, so splitting on ';' is safe here.
    for statement in sql_schema.split(";"):
        statement = statement.strip()
        if statement:
            cursor.execute(statement)

    connection.commit()

except Exception as e:
    print("An error occurred:", e)
    # Re-raise so a failed schema setup stops the notebook; otherwise the later
    # to_sql(..., if_exists="append") cells would run against missing tables.
    raise
finally:
    if cursor is not None:
        cursor.close()
    if connection is not None:
        connection.close()



In [10]:
database = "meneame"

# Build the URL from components instead of an f-string: URL.create escapes
# special characters in the credentials (e.g. '@' or '/' in the password).
# The host comes from the same `host` setting used to create the schema, so
# both steps talk to the same server.
db_url = sqlalchemy.engine.URL.create(
    drivername="mysql+pymysql",
    username=user,
    password=password,
    host=host,
    database=database,
)
engine = create_engine(db_url)

# NOTE(review): this connection stays open for the rest of the session; the
# to_sql calls in this notebook pass `engine`, not `connection`.
connection = engine.connect()

#connection.close()





In [11]:
# Append the user lookup rows to user_table (the frame index is not written).
# NOTE: user_id is the table's primary key, so running this again against an
# already populated table will be rejected as duplicate keys.
df_user.to_sql(
    "user_table",
    engine,
    index=False,
    if_exists="append",
)


25681

In [12]:
# Append the source lookup rows to source_table (the frame index is not written).
df_source.to_sql(
    "source_table",
    engine,
    index=False,
    if_exists="append",
)

31088

In [13]:
# Append the category lookup rows to category_table (the frame index is not written).
df_category.to_sql(
    "category_table",
    engine,
    index=False,
    if_exists="append",
)

14

In [14]:
# Append the province/community lookup rows to location_table (the frame index
# is not written).
df_location.to_sql(
    "location_table",
    engine,
    index=False,
    if_exists="append",
)


53

In [15]:
# Append the news rows to news_info_table (the frame index is not written).
# This table has foreign keys to user_table, source_table, category_table and
# location_table, so those must already contain the referenced ids.
df_news.to_sql(
    "news_info_table",
    engine,
    index=False,
    if_exists="append",
)

287563