In [None]:
import csv
import os
import random

import requests

# Collect an article's text content through the Wikipedia API
def get_wikipedia_content(topic):
    """Fetch the plain-text extract of an English Wikipedia article.

    Args:
        topic: Article title to look up. The request asks the API to
            resolve redirects.

    Returns:
        The extract of the first returned page that has one, or None when
        the request fails, the response is not valid JSON, or no returned
        page carries an extract.
    """
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "format": "json",
        "titles": topic,
        "prop": "extracts",
        "explaintext": True,
        "redirects": 1
    }

    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(url, params=params, timeout=10)
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as exc:
        # Report and return None so the caller can record the topic as
        # failed instead of aborting the whole collection.
        print(f"Request for {topic} failed: {exc}")
        return None

    # Error payloads have no 'query' key, so avoid direct indexing.
    pages = data.get('query', {}).get('pages', {})
    for page_data in pages.values():
        if 'extract' in page_data:
            return page_data['extract']
    return None

# Build up to n_per_topic samples for each topic (using the Wikipedia API)
def generate_topic_data_wikipedia(topics, n_per_topic=800):
    """Collect labelled sentence samples for each topic.

    Each topic's article text is split on ". "; pieces with at least ten
    whitespace-separated words are kept, capped at n_per_topic per topic.

    Args:
        topics: Iterable of article titles.
        n_per_topic: Maximum number of samples kept for one topic.

    Returns:
        A tuple ``(samples, failed_topics, counts)`` where ``samples`` is a
        list of ``(sentence, topic)`` tuples, ``failed_topics`` lists topics
        whose content came back empty or missing, and ``counts`` maps each
        successful topic to its number of samples.
    """
    collected = []
    failed_topics = []  # topics whose content could not be retrieved
    counts = {}  # number of samples kept per topic

    for topic in topics:
        text = get_wikipedia_content(topic)
        if not text:
            print(f"Failed to retrieve content for {topic}")
            failed_topics.append(topic)
            continue

        # Keep only pieces long enough to be useful samples.
        long_enough = [part for part in text.split(". ") if len(part.split()) >= 10]
        if len(long_enough) < n_per_topic:
            kept = long_enough
        else:
            kept = long_enough[:n_per_topic]

        counts[topic] = len(kept)
        collected.extend((part, topic) for part in kept)

    return collected, failed_topics, counts

# Generate noise data
def generate_noise_data(n):
    """Return n synthetic samples labelled "Noise".

    Each sample is a tuple ``(sentence, "Noise")`` whose sentence is eight
    words drawn with replacement from a small fixed vocabulary.
    """
    vocabulary = ["random", "unrelated", "irrelevant", "topic", "data", "sentence", "generated"]
    return [
        (" ".join(random.choices(vocabulary, k=8)), "Noise")
        for _ in range(n)
    ]

# Topic lists
# Topics intended to be clearly separated from one another
distinct_topics = [
    "Quantum Mechanics",
    "CRISPR",
    "General Relativity",
    "Robotics",
    "Quantum Computing",
    "Climate Change",
    "Renewable Energy",
    "Artificial Intelligence",
]

# Topics intended to overlap with each other or with the list above
similar_topics = [
    "Machine Learning",
    "Deep Learning",
    "Electric Vehicles",
    "Natural Language Processing",
    "Computer Vision",
    "Bioinformatics",
    "Nanotechnology",
]

# Generate the data
topic_data, failed_topics, topic_data_count = generate_topic_data_wikipedia(distinct_topics + similar_topics, 800)
noise_data = generate_noise_data(500)

# Save only when content was retrieved for every topic
if not failed_topics:
    all_data = topic_data + noise_data

    # Write to the relative 'data/' directory: the preprocessing step reads
    # 'data/data.csv', so an absolute '/data/...' path would not be found.
    output_dir = 'data'
    os.makedirs(output_dir, exist_ok=True)
    with open('data/data.csv', 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['text', 'label'])
        writer.writerows(all_data)  # each entry is a (text, label) tuple

    print(f"Total generated data samples: {len(all_data)}")
    print("Sample data:")
    # Slicing avoids an IndexError when fewer than five samples exist.
    for sample in all_data[:5]:
        print(sample)

    # Print the number of samples per topic
    print("\nData count per topic:")
    for topic, count in topic_data_count.items():
        print(f"{topic}: {count} samples")

    # Print the number of noise samples
    print(f"\nNoise data: {len(noise_data)} samples")
else:
    print("Data retrieval failed for some topics. Data not saved.")
    print("Failed topics:")
    for topic in failed_topics:
        print(topic)


In [2]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

# Read the raw CSV file (comma-separated, which is the pandas default)
df = pd.read_csv('data/data.csv')

# Lazily filled cache so the lemmatizer and stop-word set are built once
# rather than once per row.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the shared (lemmatizer, stop_words) pair, creating it on first use."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
    return _PREPROCESS_RESOURCES['lemmatizer'], _PREPROCESS_RESOURCES['stop_words']


def preprocess_text(text):
    """Normalize one text sample for downstream processing.

    The text is lowercased, stripped of every character that is not an
    ASCII letter or whitespace, tokenized, filtered against the English
    stop-word list, and lemmatized.

    Args:
        text: Raw sample. Non-string values (for example NaN from a CSV
            read) are treated as empty.

    Returns:
        The remaining lemmatized tokens joined by single spaces, or '' for
        non-string input.
    """
    # Treat non-string input as an empty string
    if not isinstance(text, str):
        return ''
    # Lowercase
    text = text.lower()
    # Remove digits and special characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stop words and lemmatize, reusing the cached resources
    lemmatizer, stop_words = _get_preprocess_resources()
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Run the preprocessing over the text column only
df['text'] = df['text'].map(preprocess_text)

# Inspect the result
print(f"전처리된 샘플 수: {len(df)}")
print("\n처음 5개 전처리된 샘플:")
preview = df.head()
print(preview)

# Save the preprocessed result to a new CSV file (header row is written by default)
df.to_csv('data/pre_data.csv', index=False)


전처리된 샘플 수: 3750

처음 5개 전처리된 샘플:
                                                text              label
0  quantum mechanic fundamental theory describes ...  Quantum Mechanics
1  classical physic describe many aspect nature o...  Quantum Mechanics
2  theory classical physic derived quantum mechan...  Quantum Mechanics
3  measurement quantum system show characteristic...  Quantum Mechanics
4  early attempt understand microscopic phenomeno...  Quantum Mechanics


In [1]:
import pandas as pd

# Read the preprocessed dataset back in
df = pd.read_csv('data/pre_data.csv')

# Count how many rows carry each label (most frequent first)
label_counts = df['label'].value_counts()

# Report one line per label
print("Label counts:")
for name, total in label_counts.items():
    print(f"{name}: {total} samples")

Label counts:
Noise: 500 samples
Artificial Intelligence: 364 samples
Climate Change: 350 samples
Robotics: 311 samples
General Relativity: 283 samples
Deep Learning: 257 samples
Machine Learning: 235 samples
Renewable Energy: 227 samples
CRISPR: 181 samples
Quantum Mechanics: 178 samples
Quantum Computing: 174 samples
Electric Vehicles: 166 samples
Computer Vision: 150 samples
Bioinformatics: 148 samples
Nanotechnology: 114 samples
Natural Language Processing: 112 samples
