In [None]:
import requests
import csv

def get_wikipedia_content(topic):
    """Fetch the plain-text extract of an English Wikipedia article.

    Sends one ``action=query`` / ``prop=extracts`` request for ``topic``
    (with redirects enabled) and returns the extract of the first page in
    the response.

    Args:
        topic: Article title to look up.

    Returns:
        The ``extract`` text of the first returned page, or ``None`` when the
        response has no ``query``/``pages`` section, the page list is empty,
        or the page carries no ``extract`` field.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            4xx/5xx HTTP status.
    """
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "format": "json",
        "titles": topic,
        "prop": "extracts",
        "explaintext": True,
        "redirects": 1,
    }
    # Wikimedia asks API clients to send an identifying User-Agent.
    # NOTE(review): replace with a value that names your project/contact.
    headers = {"User-Agent": "topic-data-notebook/1.0 (Wikipedia extract fetch; python-requests)"}

    # The timeout keeps a single stalled connection from hanging the whole run.
    response = requests.get(url, params=params, headers=headers, timeout=30)
    response.raise_for_status()
    data = response.json()

    # API-level errors arrive as JSON without a "query" section; report them
    # as "no content" (None), which the caller already handles.
    pages = data.get('query', {}).get('pages', {})
    # Only one title is requested, so only the first page entry matters.
    first_page = next(iter(pages.values()), None)
    if first_page is None:
        return None
    return first_page.get('extract')

def generate_topic_data_wikipedia(topics, n_per_topic=2000):
    """Build (sentence, topic) samples from the Wikipedia article of each topic.

    Each article is split on ``". "``; fragments with at least 10
    whitespace-separated words are kept, up to ``n_per_topic`` per topic.

    Args:
        topics: Iterable of article titles. Repeated titles are processed
            once, so a topic never contributes the same sentences twice.
        n_per_topic: Maximum number of samples kept per topic.

    Returns:
        A tuple ``(all_data, topic_data_count)`` where ``all_data`` is a list
        of ``(sentence, topic)`` tuples and ``topic_data_count`` maps each
        topic to the number of samples kept (0 when no content was returned).
    """
    all_data = []
    topic_data_count = {}
    # dict.fromkeys drops repeated topics while keeping first-seen order, so
    # the counts always match the rows actually placed in all_data.
    for topic in dict.fromkeys(topics):
        content = get_wikipedia_content(topic)
        if not content:
            print(f"Failed to retrieve content for {topic}")
            topic_data_count[topic] = 0
            continue

        sentences = content.split(". ")
        samples = [sentence for sentence in sentences if len(sentence.split()) >= 10]
        samples = samples[:n_per_topic]
        topic_data_count[topic] = len(samples)
        all_data.extend((sentence, topic) for sentence in samples)
    return all_data, topic_data_count

# Each topic is listed once: a repeated entry would be fetched again and its
# sentences written to the CSV a second time, inflating that topic's count.
topics = [
    "quantum_mechanics", "organic_chemistry", "fluid_dynamics", "thermodynamics", "differential_geometry",
    "plate_tectonics", "cellular_respiration", "electromagnetic_theory", "nuclear_physics", "crystal_structure",
    "evolution_theory", "chemical_bonding", "classical_mechanics", "molecular_biology", "statistical_mechanics",
    "quantum_field_theory", "particle_physics", "astrophysics", "cosmology",
]


topic_data, topic_data_count = generate_topic_data_wikipedia(topics, 2000)

# Save the collected samples to a CSV file.
output_filename = 'clear_data.csv'
with open(output_filename, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['text', 'label'])  # header row
    for text, label in topic_data:
        writer.writerow([text, label])

print(f"Total generated data samples: {len(topic_data)}")
print("Sample data:")
for i in range(min(5, len(topic_data))):
    print(f"Text: {topic_data[i][0]}")
    print(f"Label: {topic_data[i][1]}")
    print()

print("\nData count per topic:")
for topic, count in topic_data_count.items():
    print(f"{topic}: {count} samples")

# Report the file that was actually written above.
print(f"\nData saved to {output_filename}")

In [None]:
import csv
from collections import defaultdict

# 1. Load the CSV into memory, grouping sample texts by topic label.
data_by_topic = defaultdict(list)

# newline='' lets the csv module handle line endings itself, so newlines
# embedded inside quoted text fields are read back unchanged.
with open('clear_data.csv', 'r', newline='', encoding='utf-8') as csvfile:
    reader = csv.reader(csvfile)
    next(reader, None)  # skip the header row (no error on an empty file)
    for text, label in reader:
        data_by_topic[label].append(text)

# 2. Count samples per topic and sort by count, largest first.
topic_counts = {topic: len(samples) for topic, samples in data_by_topic.items()}
sorted_topics = sorted(topic_counts.items(), key=lambda x: x[1], reverse=True)

# 3. Keep the 15 topics with the most samples.
top_15_topics = dict(sorted_topics[:15])

# 4. Write only the selected topics' samples to a new CSV file.
output_filename = 'f_clear_topic.csv'

with open(output_filename, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['text', 'label'])  # header row

    for topic in top_15_topics.keys():
        for text in data_by_topic[topic]:
            writer.writerow([text, topic])

# 5. Print a summary of all topics and of the selected ones.
print("전체 토픽 수:", len(topic_counts))
print("\n모든 토픽의 샘플 수 (내림차순):")
for topic, count in sorted_topics:
    print(f"{topic}: {count} samples")

print("\n선택된 상위 15개 토픽:")
for topic, count in top_15_topics.items():
    print(f"{topic}: {count} samples")

print(f"\n총 선택된 샘플 수: {sum(top_15_topics.values())}")
print(f"데이터가 {output_filename}에 저장되었습니다.")

In [None]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

# Load the filtered samples (columns: text, label) into a DataFrame.
df = pd.read_csv('f_clear_topic.csv', sep=',')

# Cache for the lemmatizer and stop-word set so they are built once rather
# than once per row when preprocess_text is applied to a whole column.
_TEXT_RESOURCES = {}


def _text_resources():
    """Return the shared (lemmatizer, stop_words) pair, creating it on first use."""
    if not _TEXT_RESOURCES:
        _TEXT_RESOURCES['lemmatizer'] = WordNetLemmatizer()
        _TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
    return _TEXT_RESOURCES['lemmatizer'], _TEXT_RESOURCES['stop_words']


def preprocess_text(text):
    """Normalise one text sample into a space-joined string of lemmatized tokens.

    Steps: lowercase, delete every character that is not an ASCII letter or
    whitespace, tokenize, drop English stop words and one-character tokens,
    then lemmatize what remains.

    Args:
        text: The raw sample. Any non-string value (e.g. NaN from pandas)
            is treated as empty.

    Returns:
        The cleaned tokens joined by single spaces; ``''`` for non-string
        input or when no token survives filtering.
    """
    # Non-string input is mapped to an empty string.
    if not isinstance(text, str):
        return ''
    # Lowercase.
    text = text.lower()
    # Remove digits and special characters (they are deleted, not replaced
    # by a space, so e.g. hyphenated words are fused together).
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenize.
    tokens = word_tokenize(text)
    # Drop stop words and one-character tokens, then lemmatize.
    lemmatizer, stop_words = _text_resources()
    tokens = [
        lemmatizer.lemmatize(word)
        for word in tokens
        if word not in stop_words and len(word) > 1  # keep tokens of 2+ characters
    ]
    return ' '.join(tokens)

# Clean the text column only; labels are left untouched.
df['text'] = df['text'].map(preprocess_text)

# Discard rows whose cleaned text is empty or whitespace-only.
df = df.loc[df['text'].str.strip().ne('')]

# Show how many samples remain and preview the first five.
print(f"전처리된 샘플 수: {len(df)}")
print("\n처음 5개 전처리된 샘플:", df.head(), sep="\n")

# Persist the preprocessed samples (with header, without the index column).
df.to_csv('p_clear_topic.csv', index=False)

## Ambiguous Topics

In [None]:
import requests
import csv

def get_wikipedia_content(topic):
    """Fetch the plain-text extract of an English Wikipedia article.

    Sends one ``action=query`` / ``prop=extracts`` request for ``topic``
    (with redirects enabled) and returns the extract of the first page in
    the response.

    Args:
        topic: Article title to look up.

    Returns:
        The ``extract`` text of the first returned page, or ``None`` when the
        response has no ``query``/``pages`` section, the page list is empty,
        or the page carries no ``extract`` field.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            4xx/5xx HTTP status.
    """
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "format": "json",
        "titles": topic,
        "prop": "extracts",
        "explaintext": True,
        "redirects": 1,
    }
    # Wikimedia asks API clients to send an identifying User-Agent.
    # NOTE(review): replace with a value that names your project/contact.
    headers = {"User-Agent": "topic-data-notebook/1.0 (Wikipedia extract fetch; python-requests)"}

    # The timeout keeps a single stalled connection from hanging the whole run.
    response = requests.get(url, params=params, headers=headers, timeout=30)
    response.raise_for_status()
    data = response.json()

    # API-level errors arrive as JSON without a "query" section; report them
    # as "no content" (None), which the caller already handles.
    pages = data.get('query', {}).get('pages', {})
    # Only one title is requested, so only the first page entry matters.
    first_page = next(iter(pages.values()), None)
    if first_page is None:
        return None
    return first_page.get('extract')

def generate_topic_data_wikipedia(topics, n_per_topic=2000):
    """Build (sentence, topic) samples from the Wikipedia article of each topic.

    Each article is split on ``". "``; fragments with at least 10
    whitespace-separated words are kept, up to ``n_per_topic`` per topic.

    Args:
        topics: Iterable of article titles. Repeated titles are processed
            once, so a topic never contributes the same sentences twice.
        n_per_topic: Maximum number of samples kept per topic.

    Returns:
        A tuple ``(all_data, topic_data_count)`` where ``all_data`` is a list
        of ``(sentence, topic)`` tuples and ``topic_data_count`` maps each
        topic to the number of samples kept (0 when no content was returned).
    """
    all_data = []
    topic_data_count = {}
    # dict.fromkeys drops repeated topics while keeping first-seen order, so
    # the counts always match the rows actually placed in all_data.
    for topic in dict.fromkeys(topics):
        content = get_wikipedia_content(topic)
        if not content:
            print(f"Failed to retrieve content for {topic}")
            topic_data_count[topic] = 0
            continue

        sentences = content.split(". ")
        samples = [sentence for sentence in sentences if len(sentence.split()) >= 10]
        samples = samples[:n_per_topic]
        topic_data_count[topic] = len(samples)
        all_data.extend((sentence, topic) for sentence in samples)
    return all_data, topic_data_count

# Each topic is listed once: a repeated entry would be fetched again and its
# sentences written to the CSV a second time, inflating that topic's count.
topics = [
    "machine_learning", "deep_learning", "natural_language_processing", "computer_vision", "artificial_intelligence",
    "neural_networks", "pattern_recognition", "data_mining", "big_data_analytics", "knowledge_discovery",
    "cognitive_computing", "reinforcement_learning", "intelligent_systems", "text_mining", "speech_recognition",
    "image_processing", "information_retrieval", "semantic_analysis", "computational_linguistics", "sentiment_analysis",
]


topic_data, topic_data_count = generate_topic_data_wikipedia(topics, 2000)

# Save the collected samples to a CSV file.
output_filename = 'ambi_topic.csv'
with open(output_filename, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['text', 'label'])  # header row
    for text, label in topic_data:
        writer.writerow([text, label])

print(f"Total generated data samples: {len(topic_data)}")
print("Sample data:")
for i in range(min(5, len(topic_data))):
    print(f"Text: {topic_data[i][0]}")
    print(f"Label: {topic_data[i][1]}")
    print()

print("\nData count per topic:")
for topic, count in topic_data_count.items():
    print(f"{topic}: {count} samples")

# Report the file that was actually written above.
print(f"\nData saved to {output_filename}")

In [None]:
import csv
from collections import defaultdict

# 1. Load the CSV into memory, grouping sample texts by topic label.
data_by_topic = defaultdict(list)

# newline='' lets the csv module handle line endings itself, so newlines
# embedded inside quoted text fields are read back unchanged.
with open('ambi_topic.csv', 'r', newline='', encoding='utf-8') as csvfile:
    reader = csv.reader(csvfile)
    next(reader, None)  # skip the header row (no error on an empty file)
    for text, label in reader:
        data_by_topic[label].append(text)

# 2. Count samples per topic and sort by count, largest first.
topic_counts = {topic: len(samples) for topic, samples in data_by_topic.items()}
sorted_topics = sorted(topic_counts.items(), key=lambda x: x[1], reverse=True)

# 3. Keep the 15 topics with the most samples.
top_15_topics = dict(sorted_topics[:15])

# 4. Write only the selected topics' samples to a new CSV file.
output_filename = 'f_ambi_topic.csv'

with open(output_filename, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['text', 'label'])  # header row

    for topic in top_15_topics.keys():
        for text in data_by_topic[topic]:
            writer.writerow([text, topic])

# 5. Print a summary of all topics and of the selected ones.
print("전체 토픽 수:", len(topic_counts))
print("\n모든 토픽의 샘플 수 (내림차순):")
for topic, count in sorted_topics:
    print(f"{topic}: {count} samples")

print("\n선택된 상위 15개 토픽:")
for topic, count in top_15_topics.items():
    print(f"{topic}: {count} samples")

print(f"\n총 선택된 샘플 수: {sum(top_15_topics.values())}")
print(f"데이터가 {output_filename}에 저장되었습니다.")

In [None]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

# Load the filtered samples (columns: text, label) into a DataFrame.
df = pd.read_csv('f_ambi_topic.csv', sep=',')

# Cache for the lemmatizer and stop-word set so they are built once rather
# than once per row when preprocess_text is applied to a whole column.
_TEXT_RESOURCES = {}


def _text_resources():
    """Return the shared (lemmatizer, stop_words) pair, creating it on first use."""
    if not _TEXT_RESOURCES:
        _TEXT_RESOURCES['lemmatizer'] = WordNetLemmatizer()
        _TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
    return _TEXT_RESOURCES['lemmatizer'], _TEXT_RESOURCES['stop_words']


def preprocess_text(text):
    """Normalise one text sample into a space-joined string of lemmatized tokens.

    Steps: lowercase, delete every character that is not an ASCII letter or
    whitespace, tokenize, drop English stop words and one-character tokens,
    then lemmatize what remains.

    Args:
        text: The raw sample. Any non-string value (e.g. NaN from pandas)
            is treated as empty.

    Returns:
        The cleaned tokens joined by single spaces; ``''`` for non-string
        input or when no token survives filtering.
    """
    # Non-string input is mapped to an empty string.
    if not isinstance(text, str):
        return ''
    # Lowercase.
    text = text.lower()
    # Remove digits and special characters (they are deleted, not replaced
    # by a space, so e.g. hyphenated words are fused together).
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenize.
    tokens = word_tokenize(text)
    # Drop stop words and one-character tokens, then lemmatize.
    lemmatizer, stop_words = _text_resources()
    tokens = [
        lemmatizer.lemmatize(word)
        for word in tokens
        if word not in stop_words and len(word) > 1  # keep tokens of 2+ characters
    ]
    return ' '.join(tokens)

# Clean the text column only; labels are left untouched.
df['text'] = df['text'].map(preprocess_text)

# Discard rows whose cleaned text is empty or whitespace-only.
df = df.loc[df['text'].str.strip().ne('')]

# Show how many samples remain and preview the first five.
print(f"전처리된 샘플 수: {len(df)}")
print("\n처음 5개 전처리된 샘플:", df.head(), sep="\n")

# Count the distinct topic labels that survived preprocessing.
topic_count = df['label'].nunique()

print(f"토픽의 갯수: {topic_count}")


# Persist the preprocessed samples (with header, without the index column).
df.to_csv('p_ambi_topic.csv', index=False)

## More Ambiguous Topics

In [None]:
import requests
import csv

def get_wikipedia_content(topic):
    """Fetch the plain-text extract of an English Wikipedia article.

    Sends one ``action=query`` / ``prop=extracts`` request for ``topic``
    (with redirects enabled) and returns the extract of the first page in
    the response.

    Args:
        topic: Article title to look up.

    Returns:
        The ``extract`` text of the first returned page, or ``None`` when the
        response has no ``query``/``pages`` section, the page list is empty,
        or the page carries no ``extract`` field.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            4xx/5xx HTTP status.
    """
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "format": "json",
        "titles": topic,
        "prop": "extracts",
        "explaintext": True,
        "redirects": 1,
    }
    # Wikimedia asks API clients to send an identifying User-Agent.
    # NOTE(review): replace with a value that names your project/contact.
    headers = {"User-Agent": "topic-data-notebook/1.0 (Wikipedia extract fetch; python-requests)"}

    # The timeout keeps a single stalled connection from hanging the whole run.
    response = requests.get(url, params=params, headers=headers, timeout=30)
    response.raise_for_status()
    data = response.json()

    # API-level errors arrive as JSON without a "query" section; report them
    # as "no content" (None), which the caller already handles.
    pages = data.get('query', {}).get('pages', {})
    # Only one title is requested, so only the first page entry matters.
    first_page = next(iter(pages.values()), None)
    if first_page is None:
        return None
    return first_page.get('extract')

def generate_topic_data_wikipedia(topics, n_per_topic=2000):
    """Build (sentence, topic) samples from the Wikipedia article of each topic.

    Each article is split on ``". "``; fragments with at least 10
    whitespace-separated words are kept, up to ``n_per_topic`` per topic.

    Args:
        topics: Iterable of article titles. Repeated titles are processed
            once, so a topic never contributes the same sentences twice.
        n_per_topic: Maximum number of samples kept per topic.

    Returns:
        A tuple ``(all_data, topic_data_count)`` where ``all_data`` is a list
        of ``(sentence, topic)`` tuples and ``topic_data_count`` maps each
        topic to the number of samples kept (0 when no content was returned).
    """
    all_data = []
    topic_data_count = {}
    # dict.fromkeys drops repeated topics while keeping first-seen order, so
    # the counts always match the rows actually placed in all_data.
    for topic in dict.fromkeys(topics):
        content = get_wikipedia_content(topic)
        if not content:
            print(f"Failed to retrieve content for {topic}")
            topic_data_count[topic] = 0
            continue

        sentences = content.split(". ")
        samples = [sentence for sentence in sentences if len(sentence.split()) >= 10]
        samples = samples[:n_per_topic]
        topic_data_count[topic] = len(samples)
        all_data.extend((sentence, topic) for sentence in samples)
    return all_data, topic_data_count

# Topic titles for the "more ambiguous" data set.
topics = [
    "Climate Change", "Global Warming", "Environmental Degradation", "Air Pollution", "Water Pollution",
    "Deforestation", "Biodiversity Loss", "Endangered Species", "Renewable Energy", "Fossil Fuel Dependency",
    "Social Inequality", "Income Inequality", "Gender Inequality", "Racial Inequality", "Educational Inequality",
    "Poverty Reduction", "Global Poverty", "Health Disparities", "Access to Healthcare", "Food Security",
    "Food Scarcity", "Water Scarcity", "Overpopulation", "Urbanization Challenges", "Housing Crisis"
]


topic_data, topic_data_count = generate_topic_data_wikipedia(topics, 2000)

# Save the collected samples to a CSV file.
output_filename = 'moreambi_topic.csv'
with open(output_filename, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['text', 'label'])  # header row
    for text, label in topic_data:
        writer.writerow([text, label])

print(f"Total generated data samples: {len(topic_data)}")
print("Sample data:")
for i in range(min(5, len(topic_data))):
    print(f"Text: {topic_data[i][0]}")
    print(f"Label: {topic_data[i][1]}")
    print()

print("\nData count per topic:")
for topic, count in topic_data_count.items():
    print(f"{topic}: {count} samples")

# Report the file that was actually written above.
print(f"\nData saved to {output_filename}")

In [None]:
import csv
from collections import defaultdict

# 1. Load the CSV into memory, grouping sample texts by topic label.
data_by_topic = defaultdict(list)

# newline='' lets the csv module handle line endings itself, so newlines
# embedded inside quoted text fields are read back unchanged.
with open('moreambi_topic.csv', 'r', newline='', encoding='utf-8') as csvfile:
    reader = csv.reader(csvfile)
    next(reader, None)  # skip the header row (no error on an empty file)
    for text, label in reader:
        data_by_topic[label].append(text)

# 2. Count samples per topic and sort by count, largest first.
topic_counts = {topic: len(samples) for topic, samples in data_by_topic.items()}
sorted_topics = sorted(topic_counts.items(), key=lambda x: x[1], reverse=True)

# 3. Keep the 15 topics with the most samples.
top_15_topics = dict(sorted_topics[:15])

# 4. Write only the selected topics' samples to a new CSV file.
output_filename = 'f_moreambi_topic.csv'

with open(output_filename, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['text', 'label'])  # header row

    for topic in top_15_topics.keys():
        for text in data_by_topic[topic]:
            writer.writerow([text, topic])

# 5. Print a summary of all topics and of the selected ones.
print("전체 토픽 수:", len(topic_counts))
print("\n모든 토픽의 샘플 수 (내림차순):")
for topic, count in sorted_topics:
    print(f"{topic}: {count} samples")

print("\n선택된 상위 15개 토픽:")
for topic, count in top_15_topics.items():
    print(f"{topic}: {count} samples")

print(f"\n총 선택된 샘플 수: {sum(top_15_topics.values())}")
print(f"데이터가 {output_filename}에 저장되었습니다.")

In [None]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

# Load the filtered samples (columns: text, label) into a DataFrame.
df = pd.read_csv('f_moreambi_topic.csv', sep=',')

# Cache for the lemmatizer and stop-word set so they are built once rather
# than once per row when preprocess_text is applied to a whole column.
_TEXT_RESOURCES = {}


def _text_resources():
    """Return the shared (lemmatizer, stop_words) pair, creating it on first use."""
    if not _TEXT_RESOURCES:
        _TEXT_RESOURCES['lemmatizer'] = WordNetLemmatizer()
        _TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
    return _TEXT_RESOURCES['lemmatizer'], _TEXT_RESOURCES['stop_words']


def preprocess_text(text):
    """Normalise one text sample into a space-joined string of lemmatized tokens.

    Steps: lowercase, delete every character that is not an ASCII letter or
    whitespace, tokenize, drop English stop words and one-character tokens,
    then lemmatize what remains.

    Args:
        text: The raw sample. Any non-string value (e.g. NaN from pandas)
            is treated as empty.

    Returns:
        The cleaned tokens joined by single spaces; ``''`` for non-string
        input or when no token survives filtering.
    """
    # Non-string input is mapped to an empty string.
    if not isinstance(text, str):
        return ''
    # Lowercase.
    text = text.lower()
    # Remove digits and special characters (they are deleted, not replaced
    # by a space, so e.g. hyphenated words are fused together).
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenize.
    tokens = word_tokenize(text)
    # Drop stop words and one-character tokens, then lemmatize.
    lemmatizer, stop_words = _text_resources()
    tokens = [
        lemmatizer.lemmatize(word)
        for word in tokens
        if word not in stop_words and len(word) > 1  # keep tokens of 2+ characters
    ]
    return ' '.join(tokens)

# Clean the text column only; labels are left untouched.
df['text'] = df['text'].map(preprocess_text)

# Discard rows whose cleaned text is empty or whitespace-only.
df = df.loc[df['text'].str.strip().ne('')]

# Show how many samples remain and preview the first five.
print(f"전처리된 샘플 수: {len(df)}")
print("\n처음 5개 전처리된 샘플:", df.head(), sep="\n")

# Count the distinct topic labels that survived preprocessing.
topic_count = df['label'].nunique()

print(f"토픽의 갯수: {topic_count}")


# Persist the preprocessed samples (with header, without the index column).
df.to_csv('p_moreambi_topic.csv', index=False)