In [1]:
!pip install praw



In [2]:
!pip install asyncpraw



In [3]:
!pip install nest_asyncio



In [5]:
import asyncpraw
import pandas as pd
import asyncio
import nest_asyncio
from datetime import datetime

nest_asyncio.apply()  # Required for Jupyter Notebook

In [5]:
import os
print(os.getcwd())  # 현재 작업 디렉토리 확인
os.listdir()        # 현재 디렉토리 내 파일 목록 출력


/Users/joyeongyeong/Documents/Thesis


['Reddit_Data_Scrap.ipynb',
 '2_filtered_products_personal.jsonl',
 '4_Final_matched_review_products_personal.jsonl',
 '.DS_Store',
 '3_Visualization.ipynb',
 '2_filtered_products_beauty.jsonl',
 'Preprocessing_reddit.ipynb',
 '3_matched_review_products_beauty.jsonl',
 'Beauty.jsonl',
 '4_Final_matched_review_products_beauty.jsonl',
 'amazon-review-scraper',
 '3_matched_review_products_personal.jsonl',
 'Personal_Care.jsonl',
 'Amazon_2023_updated.ipynb',
 'meta_Personal_Care.jsonl',
 'meta_All_Beauty.jsonl',
 '.ipynb_checkpoints',
 'Reddit_scrap.ipynb']

In [7]:
# Brand keywords mapping
brand_keywords_map = {
    "La Roche-Posay": ["la roche-posay", "laroche", "effaclar", "toleriane"],
    "Avène": ["avène", "avene", "thermal spring", "cicalfate"],
    "Bioderma": ["bioderma", "sensibio", "hydrabio"],
    "CeraVe": ["cerave", "cerave cleanser", "cerave moisturizer"],
    "Uriage": ["uriage", "bariederm", "eau thermale"],
    "Garnier": ["garnier", "micellar", "skinactive"],
    "L’Oréal Paris": ["l’oréal", "loreal", "revitalift", "hydra genius"],
    "Neutrogena": ["neutrogena", "hydro boost", "rainbath"],
    "Nivea": ["nivea", "soft cream", "q10"]
}


# Brand Position Mapping
brand_positioning_map = {
    "La Roche-Posay": "niche",
    "Avène": "niche",
    "Bioderma": "niche",
    "CeraVe": "niche",
    "Uriage": "niche",
    "Garnier": "mass",
    "L’Oréal Paris": "mass",
    "Neutrogena": "mass",
    "Nivea": "mass",
}


# data period
start_date = datetime(2019, 1, 1)
end_date = datetime(2023, 12, 31)

async def fetch_posts_with_keyword(subreddit_name, keyword, positioning, brand, limit=1000):
    reddit = asyncpraw.Reddit(
        client_id="OQ7pfL7IImXBD1LlNsWUdw",
        client_secret="6povn4KpOaXDSTsWpQClHo2kBhDlSg",
        user_agent="DataCollect"
    )
    merged_rows = []
    try:
        print(f"await subreddit")
        subreddit = await reddit.subreddit(subreddit_name)
        print(f"subreddit search")
        async for post in subreddit.search(keyword, sort='relevance', time_filter='all', limit=limit):
            try:
                post_date = datetime.fromtimestamp(post.created)
                if start_date <= post_date <= end_date:                 # Date filtering
                    await post.load()
                    # Filtering short text
                    if post.selftext and len(post.selftext.split()) < 5:
                        continue
                    post_row = {
                        "positioning": positioning,
                        "brand": brand,
                        "p_title": post.title,
                        "p_score": post.score,
                        "num_comments": post.num_comments,
                        "p_text": post.selftext,
                        "p_created_date": post_date,
                        "c_text": None,
                        "c_score": None,
                        "c_created_date": None
                    }
                    merged_rows.append(post_row)
                    for comment in post.comments:
                        if isinstance(comment, asyncpraw.models.Comment):
                            if comment.body and len(comment.body.split()) < 5:
                                continue
                            comment_row = {
                                "positioning": positioning,
                                "brand": brand,
                                "p_title": post.title,
                                "p_score": post.score,
                                "num_comments": post.num_comments,
                                "p_text": post.selftext,
                                "p_created_date": post_date,
                                "c_text": comment.body,
                                "c_score": comment.score,
                                "c_created_date": datetime.fromtimestamp(comment.created)
                            }
                            merged_rows.append(comment_row)
            except Exception as e:
                continue
    except Exception as e:
        pass
    finally:
        await reddit.close()
    # Delete repetition
    df = pd.DataFrame(merged_rows).drop_duplicates(subset=["p_title", "p_text", "c_text"])
    return df

async def main():
    subreddits = [
        "SkincareAddiction", "EuroSkincare", "beauty",
        "AsianBeauty", "MakeupAddiction", "SkincareScience"
    ]
    min_samples = 500  # the minimum sample per brand
    for brand, positioning in brand_positioning_map.items():
        all_data = pd.DataFrame()
        for keyword in brand_keywords_map[brand]:
            for subreddit in subreddits:
                print(f"Entering fetch posts with {keyword}..")
                df = await fetch_posts_with_keyword(subreddit, keyword, positioning, brand, limit=500)
                print(f"DONE")
                all_data = pd.concat([all_data, df], ignore_index=True)
        # Delete Null/Duplications
        all_data = all_data.drop_duplicates(subset=["p_title", "p_text", "c_text"])
        all_data = all_data.dropna(subset=["p_text"])
        # Warning
        if len(all_data) < min_samples:
            print(f"[Warining] {brand} Data isn't sufficient: {len(all_data)}")
        file_name = f"reddit_{positioning.lower()}_{brand.lower().replace(' ', '_').replace('’', '').replace('é', 'e')}.csv"
        all_data.to_csv(file_name, index=False, encoding="utf-8")
        print(f"Saved: {file_name} ({len(all_data)} rows)")

# Running
await main()

Entering fetch posts with la roche-posay..
await subreddit
subreddit search
DONE
Entering fetch posts with la roche-posay..
await subreddit
subreddit search
DONE
Entering fetch posts with la roche-posay..
await subreddit
subreddit search
DONE
Entering fetch posts with la roche-posay..
await subreddit
subreddit search
DONE
Entering fetch posts with la roche-posay..
await subreddit
subreddit search
DONE
Entering fetch posts with la roche-posay..
await subreddit
subreddit search
DONE
Entering fetch posts with laroche..
await subreddit
subreddit search
DONE
Entering fetch posts with laroche..
await subreddit
subreddit search
DONE
Entering fetch posts with laroche..
await subreddit
subreddit search
DONE
Entering fetch posts with laroche..
await subreddit
subreddit search
DONE
Entering fetch posts with laroche..
await subreddit
subreddit search
DONE
Entering fetch posts with laroche..
await subreddit
subreddit search
DONE
Entering fetch posts with effaclar..
await subreddit
subreddit search


  all_data = pd.concat([all_data, df], ignore_index=True)
  all_data = pd.concat([all_data, df], ignore_index=True)


DONE
Entering fetch posts with cicalfate..
await subreddit
subreddit search
DONE
Entering fetch posts with cicalfate..
await subreddit
subreddit search
DONE
Entering fetch posts with cicalfate..
await subreddit
subreddit search
DONE
Entering fetch posts with cicalfate..
await subreddit
subreddit search
DONE
Entering fetch posts with cicalfate..
await subreddit
subreddit search
DONE
Saved: reddit_niche_avène.csv (6060 rows)
Entering fetch posts with bioderma..
await subreddit
subreddit search
DONE
Entering fetch posts with bioderma..
await subreddit
subreddit search
DONE
Entering fetch posts with bioderma..
await subreddit
subreddit search
DONE
Entering fetch posts with bioderma..
await subreddit
subreddit search
DONE
Entering fetch posts with bioderma..
await subreddit
subreddit search
DONE
Entering fetch posts with bioderma..
await subreddit
subreddit search
DONE
Entering fetch posts with sensibio..
await subreddit
subreddit search
DONE
Entering fetch posts with sensibio..
await subre

  all_data = pd.concat([all_data, df], ignore_index=True)
  all_data = pd.concat([all_data, df], ignore_index=True)


DONE
Entering fetch posts with cerave moisturizer..
await subreddit
subreddit search
DONE
Entering fetch posts with cerave moisturizer..
await subreddit
subreddit search
DONE
Entering fetch posts with cerave moisturizer..
await subreddit
subreddit search
DONE
Entering fetch posts with cerave moisturizer..
await subreddit
subreddit search
DONE
Entering fetch posts with cerave moisturizer..
await subreddit
subreddit search
DONE
Saved: reddit_niche_cerave.csv (18268 rows)
Entering fetch posts with uriage..
await subreddit
subreddit search
DONE
Entering fetch posts with uriage..
await subreddit
subreddit search
DONE
Entering fetch posts with uriage..
await subreddit
subreddit search
DONE
Entering fetch posts with uriage..
await subreddit
subreddit search
DONE
Entering fetch posts with uriage..
await subreddit
subreddit search
DONE
Entering fetch posts with uriage..
await subreddit
subreddit search
DONE
Entering fetch posts with bariederm..
await subreddit
subreddit search
DONE
Entering fet

  all_data = pd.concat([all_data, df], ignore_index=True)
  all_data = pd.concat([all_data, df], ignore_index=True)


DONE
Entering fetch posts with eau thermale..
await subreddit
subreddit search
DONE
Entering fetch posts with eau thermale..
await subreddit
subreddit search
DONE
Saved: reddit_niche_uriage.csv (3304 rows)
Entering fetch posts with garnier..
await subreddit
subreddit search
DONE
Entering fetch posts with garnier..
await subreddit
subreddit search
DONE
Entering fetch posts with garnier..
await subreddit
subreddit search
DONE
Entering fetch posts with garnier..
await subreddit
subreddit search
DONE
Entering fetch posts with garnier..
await subreddit
subreddit search
DONE
Entering fetch posts with garnier..
await subreddit
subreddit search
DONE
Entering fetch posts with micellar..
await subreddit
subreddit search
DONE
Entering fetch posts with micellar..
await subreddit
subreddit search
DONE
Entering fetch posts with micellar..
await subreddit
subreddit search
DONE
Entering fetch posts with micellar..
await subreddit
subreddit search
DONE
Entering fetch posts with micellar..
await subredd

  all_data = pd.concat([all_data, df], ignore_index=True)
  all_data = pd.concat([all_data, df], ignore_index=True)


DONE
Entering fetch posts with revitalift..
await subreddit
subreddit search
DONE
Entering fetch posts with hydra genius..
await subreddit
subreddit search
DONE
Entering fetch posts with hydra genius..
await subreddit
subreddit search
DONE
Entering fetch posts with hydra genius..
await subreddit
subreddit search
DONE
Entering fetch posts with hydra genius..
await subreddit
subreddit search


  all_data = pd.concat([all_data, df], ignore_index=True)
  all_data = pd.concat([all_data, df], ignore_index=True)


DONE
Entering fetch posts with hydra genius..
await subreddit
subreddit search
DONE
Entering fetch posts with hydra genius..
await subreddit
subreddit search
DONE
Saved: reddit_mass_loreal_paris.csv (7975 rows)
Entering fetch posts with neutrogena..
await subreddit
subreddit search
DONE
Entering fetch posts with neutrogena..
await subreddit
subreddit search
DONE
Entering fetch posts with neutrogena..
await subreddit
subreddit search
DONE
Entering fetch posts with neutrogena..
await subreddit
subreddit search
DONE
Entering fetch posts with neutrogena..
await subreddit
subreddit search
DONE
Entering fetch posts with neutrogena..
await subreddit
subreddit search
DONE
Entering fetch posts with hydro boost..
await subreddit
subreddit search
DONE
Entering fetch posts with hydro boost..
await subreddit
subreddit search
DONE
Entering fetch posts with hydro boost..
await subreddit
subreddit search
DONE
Entering fetch posts with hydro boost..
await subreddit
subreddit search
DONE
Entering fetch 

  all_data = pd.concat([all_data, df], ignore_index=True)
  all_data = pd.concat([all_data, df], ignore_index=True)


DONE
Entering fetch posts with soft cream..
await subreddit
subreddit search
DONE
Entering fetch posts with soft cream..
await subreddit
subreddit search
DONE
Entering fetch posts with soft cream..
await subreddit
subreddit search
DONE
Entering fetch posts with soft cream..
await subreddit
subreddit search
DONE
Entering fetch posts with soft cream..
await subreddit
subreddit search
DONE
Entering fetch posts with q10..
await subreddit
subreddit search
DONE
Entering fetch posts with q10..
await subreddit
subreddit search
DONE
Entering fetch posts with q10..
await subreddit
subreddit search
DONE
Entering fetch posts with q10..
await subreddit
subreddit search
DONE
Entering fetch posts with q10..
await subreddit
subreddit search
DONE
Entering fetch posts with q10..
await subreddit
subreddit search
DONE
Saved: reddit_mass_nivea.csv (11286 rows)
