In [1]:
import os
import redis
import pandas as pd
from collections import Counter
from tqdm import tqdm
import json
from dotenv import load_dotenv

# Load environment variables (python-dotenv reads them from a .env file if present)
load_dotenv()

# Redis connection settings, all taken from the environment
REDIS_HOST = os.getenv('REDIS_HOST')
# Fall back to the standard Redis port when REDIS_PORT is unset or blank;
# int(None) / int("") would otherwise fail with an unhelpful error.
REDIS_PORT = int(os.getenv('REDIS_PORT') or 6379)
REDIS_USERNAME = os.getenv('REDIS_USERNAME')
REDIS_PASSWORD = os.getenv('REDIS_PASSWORD')

# Shared Redis client used by every function in this notebook.
# decode_responses=True makes the client return str instead of bytes.
r = redis.Redis(
    host=REDIS_HOST,
    port=REDIS_PORT,
    username=REDIS_USERNAME,
    password=REDIS_PASSWORD,
    decode_responses=True
)

In [None]:
# Reset Redis
def reset_redis() -> None:
    """Flush the Redis database selected by the module-level client ``r``.

    Calls ``r.flushdb()`` and prints a status message before and after.
    """
    print("Initializing Redis database...")
    r.flushdb()
    print("Redis database cleared.")

# Load CSV files and combine their text columns
def load_and_combine_text(data_folder):
    """Load every usable CSV in ``data_folder`` and build a combined text column.

    A CSV is used only if it has ``title``, ``short_content`` and ``company``
    columns; any other file is skipped. Each used file gets a ``combined``
    column holding ``title + " " + short_content``, with missing values
    treated as empty strings.

    Args:
        data_folder: Directory scanned (non-recursively) for ``*.csv`` files.

    Returns:
        tuple: ``(combined_texts, all_data)``. ``combined_texts`` is the list
        of combined strings for every row; ``all_data`` is a single DataFrame
        holding the rows of all used files, re-indexed from 0. When no file
        qualifies the result is ``([], empty DataFrame)``.

    Raises:
        FileNotFoundError: If ``data_folder`` does not exist (from ``os.listdir``).
    """
    required_columns = {"title", "short_content", "company"}
    frames = []

    # Sort the listing so row order does not depend on the arbitrary order
    # in which os.listdir returns directory entries.
    for filename in sorted(os.listdir(data_folder)):
        if not filename.endswith(".csv"):
            continue
        df = pd.read_csv(os.path.join(data_folder, filename), encoding="utf-8")
        if not required_columns.issubset(df.columns):
            continue
        df["combined"] = df["title"].fillna("") + " " + df["short_content"].fillna("")
        frames.append(df)

    if not frames:
        return [], pd.DataFrame()

    # Concatenate once at the end: growing a DataFrame inside the loop
    # re-copies every previously loaded row on each iteration.
    all_data = pd.concat(frames, ignore_index=True)
    return all_data["combined"].tolist(), all_data

# Redis save function
def save_to_redis(df, keywords):
    """Persist keyword and article data to Redis via the module-level client ``r``.

    Keys written (each is deleted first, so reruns replace old data):

    * ``keywords`` (set): every valid keyword.
    * ``keyword:{kw}:articles`` (list): one JSON object per matching row.
    * ``keyword:{kw}:company_stats`` (hash): company -> number of matching rows.
    * ``keyword:stats`` (hash): keyword -> number of matching rows.

    A row matches a keyword when its ``combined`` text contains the keyword
    as a literal substring.

    Args:
        df: DataFrame with at least ``combined`` and ``company`` columns.
        keywords: Iterable of ``(keyword, count)`` pairs, e.g. the result of
            ``Counter.most_common``. Entries whose keyword is not a ``str``
            are ignored; the count is not used.
    """
    print("Starting to save data to Redis...")

    # 1. Store the keyword list
    valid_keywords = [kw for kw, _ in keywords if isinstance(kw, str)]
    r.delete("keywords")
    # SADD needs at least one member, so skip the call when the list is empty.
    if valid_keywords:
        r.sadd("keywords", *valid_keywords)
    print("Keywords saved.")

    # Compute each keyword's row mask once and reuse it in every step below.
    # regex=False: keywords are raw text tokens, so characters such as
    # "(", "[" or "?" must match literally instead of being parsed as a
    # regular expression. na=False: rows without text never match.
    keyword_masks = {
        keyword: df["combined"].str.contains(keyword, regex=False, na=False)
        for keyword in valid_keywords
    }

    # 2. Store the articles for each keyword
    for keyword in tqdm(valid_keywords, desc="Saving articles for each keyword"):
        articles_key = f"keyword:{keyword}:articles"
        r.delete(articles_key)
        for _, row in df[keyword_masks[keyword]].iterrows():
            # NOTE(review): json.dumps writes missing values as the
            # non-standard token NaN; confirm that consumers accept it.
            r.rpush(articles_key, json.dumps(row.to_dict()))

    print("Articles saved for all keywords.")

    # 3. Store per-keyword article counts by company
    print("Saving keyword-specific company stats...")
    for keyword in tqdm(valid_keywords, desc="Processing keyword-specific company stats"):
        company_counts = Counter(df.loc[keyword_masks[keyword], "company"])

        stats_key = f"keyword:{keyword}:company_stats"
        r.delete(stats_key)
        for company, count in company_counts.items():
            r.hset(stats_key, company, count)

    print("Keyword-specific company stats saved.")

    # 4. Store the total article count for each keyword
    print("Saving keyword stats...")
    r.delete("keyword:stats")
    for keyword in valid_keywords:
        # Summing the boolean mask counts the matching rows.
        r.hset("keyword:stats", keyword, int(keyword_masks[keyword].sum()))

    print("Keyword stats saved.")
    print("Data saved to Redis.")

In [None]:
# Run
if __name__ == "__main__":
    data_folder = "../Completed_csv"
    top_n = 50  # number of most frequent tokens kept as keywords

    # Load the data and build the combined text BEFORE touching Redis, so a
    # missing folder or an empty dataset cannot leave the database flushed
    # with nothing to replace it.
    combined_texts, df = load_and_combine_text(data_folder)

    # Extract the top keywords: whitespace-separated tokens ranked by frequency
    top_keywords = Counter(" ".join(combined_texts).split()).most_common(top_n)

    if not top_keywords:
        print(f"No keywords could be extracted from CSV files in {data_folder}; Redis left unchanged.")
    else:
        # Reset Redis only once there is new data to store
        reset_redis()

        # Save to Redis
        save_to_redis(df, top_keywords)