In [None]:
# ============================================
# 1️⃣ Sub-departments per department
# ============================================
# Collect the distinct sub-department values that occur under each department.
sub_department_column = jd['sub_department']
sub_departments_by_dept = sub_department_column.groupby(jd['department']).unique()
print(sub_departments_by_dept)
# Output: one row per department holding the array of its unique sub-departments.

# ============================================
# 2️⃣ Cleaning the introduction text of one job
# ============================================
import re

# Select the rows whose title is the target job and take the first
# introduction text; .iloc[0] raises IndexError when no row matches.
target_job_rows = jd['job_title'] == '승용 조립차 생산지원'
original_text = jd.loc[target_job_rows, 'organization_intro'].iloc[0]
print("정제 전 텍스트:\n", original_text)

# Keep only Hangul syllables and whitespace.
def clean_text(text: str) -> str:
    """Return ``text`` with every non-Hangul, non-whitespace character removed.

    Characters in the range 가-힣 and regex whitespace are kept; everything
    else (Latin letters, digits, punctuation, ...) is deleted.

    The pattern is written as a raw string so the whitespace class reaches the
    regex engine unchanged instead of relying on an invalid string escape,
    which newer Python versions warn about.
    """
    return re.sub(r'[^가-힣\s]', '', text)

# Strip everything except Hangul and whitespace from the selected introduction
# and show the result next to the raw text printed earlier.
cleaned_text = clean_text(original_text)
print("\n정제 후 텍스트:\n", cleaned_text)

# ============================================
# 3️⃣ Install the Korean NLP packages and download their resources
# ============================================
# IPython shell escape (not plain Python): installs KoNLPy, NLTK and spaCy.
!pip install -q konlpy nltk spacy
import nltk
# Download the 'punkt_tab' resource; presumably required by the
# nltk.word_tokenize call made later in this file.
nltk.download('punkt_tab')
# Download the Korean spaCy pipeline that spacy.load("ko_core_news_sm") loads later.
!python -m spacy download ko_core_news_sm

# ============================================
# 4️⃣ Tokenization comparison
# ============================================
import time
from konlpy.tag import Okt, Kkma
import spacy

nlp = spacy.load("ko_core_news_sm")
okt = Okt()
kkma = Kkma()


def _timed(tokenize, text):
    """Call ``tokenize(text)`` once and return ``(tokens, elapsed_seconds)``.

    ``time.perf_counter`` is used because it is the clock intended for
    measuring short durations; ``time.time`` follows the wall clock and can
    jump when the system time is adjusted.
    """
    started = time.perf_counter()
    tokens = tokenize(text)
    return tokens, time.perf_counter() - started


# NOTE(review): each tokenizer is timed on a single call, so any one-off
# initialisation done on first use is included in its figure — confirm this
# is acceptable for the comparison.

# 1. Okt morpheme tokenization
okt_tokens, okt_time = _timed(okt.morphs, cleaned_text)

# 2. Kkma morpheme tokenization
kkma_tokens, kkma_time = _timed(kkma.morphs, cleaned_text)

# 3. NLTK word_tokenize
nltk_tokens, nltk_time = _timed(nltk.word_tokenize, cleaned_text)

# 4. spaCy tokenization (token texts only)
spacy_tokens, spacy_time = _timed(
    lambda text: [token.text for token in nlp(text)], cleaned_text
)

# Report tokens and elapsed time for each tokenizer.
print(f"okt: {okt_tokens} ({okt_time:.4f}초)")
print(f"kkma: {kkma_tokens} ({kkma_time:.4f}초)")
print(f"word_tokenize: {nltk_tokens} ({nltk_time:.4f}초)")
print(f"spaCy: {spacy_tokens} ({spacy_time:.4f}초)")

# ============================================
# 5️⃣ POS tagging and noun extraction
# ============================================
# (token, tag) pairs produced by Okt for the cleaned text.
pos_tags = okt.pos(cleaned_text)
# stem=True asks Okt to return stems, which is what the "어간 추출"
# (stem extraction) label printed below promises. Without it this call
# returned the same plain morphemes as the Okt tokenization done earlier.
morphs = okt.morphs(cleaned_text, stem=True)
# Keep only the tokens Okt tagged as 'Noun'.
nouns = [word for word, pos in pos_tags if pos == 'Noun']
print("품사 태깅:", pos_tags)
print("어간 추출:", morphs)
print("명사 추출:", nouns)

# ============================================
# 6️⃣ Stopword removal
# ============================================
# Hand-picked stopwords (the duplicated '으로' entry has been removed;
# membership is unchanged because this is a set).
stopwords = {
    '우리', '자기', '그', '그것', '이', '이것', '저', '나', '너', '등', '대해', '및',
    '그런', '또', '하지만', '왜', '때문', '으로', '가', '의', '에', '에서', '로', '와',
    '과', '을', '를', '는', '은', '입니다', '들', '만', '것', '같다', '중',
    '사람', '후', '차', '전', '별', '점검', '물량', '환경', '탄력', '물류', '조정',
}
filtered_nouns = [word for word in nouns if word not in stopwords]
print("불용어 제거 후 명사:", filtered_nouns)

# Add the 'tokens' column for the target job.
# NOTE(review): psatcar_jd is not defined in the visible part of this file,
# while the text above was taken from jd — confirm the intended DataFrame.
target_rows = psatcar_jd['job_title'] == '승용 조립차 생산지원'
# Assign a scalar string: it broadcasts to every matching row, whereas the
# previous one-element list only lined up when exactly one row matched.
psatcar_jd.loc[target_rows, 'tokens'] = ' '.join(filtered_nouns)
print(psatcar_jd.loc[target_rows, ['organization_intro', 'tokens']].head())

# ============================================
# 7️⃣ LDA topic modeling
# ============================================
# IPython shell escape (not plain Python) that installs gensim.
# NOTE(review): pyLDAvis is imported right below, but no install command for
# it is visible in this file — confirm it is already available.
!pip install gensim
import gensim
from gensim import corpora
from gensim.models import LdaModel
import pyLDAvis
import pyLDAvis.gensim_models

# Clean organization_intro, then extract nouns per posting.
def _extract_nouns(doc):
    """Return Okt nouns for a string cell; any non-string cell yields []."""
    if not isinstance(doc, str):
        return []
    return okt.nouns(doc)


jd['clean_text'] = jd['organization_intro'].str.replace(r'[^가-힣\s]', '', regex=True)
jd['nouns'] = jd['clean_text'].apply(_extract_nouns)

# Corpus-wide noun frequencies, most frequent first.
from collections import Counter
all_nouns = []
for row_nouns in jd['nouns']:
    all_nouns.extend(row_nouns)
noun_freq = Counter(all_nouns)
freq_df = pd.DataFrame(noun_freq.items(), columns=['noun','freq'])
freq_df = freq_df.sort_values('freq',ascending=False)
freq_df = freq_df.reset_index(drop=True)
print(freq_df.head(20))

# Treat the ten most frequent nouns as stopwords.
top10_stopwords = set(freq_df['noun'].head(10))


def _without_top10(noun_list):
    """Return the nouns of one posting that are not in top10_stopwords."""
    return [w for w in noun_list if w not in top10_stopwords]


jd['tokens'] = jd['nouns'].apply(_without_top10)
# Boolean Series marking postings left with no tokens at all.
empty_tokens = jd['tokens'].map(len).eq(0)
print(f"tokens 빈 리스트인 행 개수: {empty_tokens.sum()}")
print(jd[['nouns','tokens']].head())

# Train one LDA model over all postings.
texts = jd['tokens'].tolist()
dictionary = corpora.Dictionary(texts)
# Drop terms found in fewer than 5 documents or in more than half of them.
dictionary.filter_extremes(no_below=5, no_above=0.5)
corpus = list(map(dictionary.doc2bow, texts))

lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=6,
    random_state=42,
    update_every=1,
    chunksize=100,
    passes=20,
    alpha='auto',
    per_word_topics=True,
)

# Print the words of each topic, without their weights.
for topic_id, topic_terms in lda_model.show_topics(num_topics=6, formatted=False):
    top_words = [word for word, _ in topic_terms]
    print(f"Topic {topic_id}: {top_words}")

# ============================================
# 8️⃣ Per-department LDA modeling
# ============================================
# Map each department to the list of token lists (one per posting).
grouped = jd.groupby('department')['tokens'].apply(list).to_dict()
lda_models = {}
# The keys are departments (the grouping column), not sub-departments.
for dept, dept_texts in grouped.items():
    dept_dictionary = corpora.Dictionary(dept_texts)
    if len(dept_dictionary) == 0:
        # LdaModel raises ValueError on an empty vocabulary; skip this
        # department instead of aborting the whole loop.
        print(f"부서: {dept} - 토큰이 없어 LDA를 건너뜁니다.")
        continue
    dept_corpus = [dept_dictionary.doc2bow(text) for text in dept_texts]
    # One topic per posting in the department.
    num_topics = len(dept_texts)
    dept_model = LdaModel(
        corpus=dept_corpus,
        id2word=dept_dictionary,
        num_topics=num_topics,
        passes=15,
        random_state=42,
    )
    lda_models[dept] = dept_model

    # As before, these module-level names end up describing the most recently
    # trained department; they are updated together so that model, corpus and
    # dictionary always belong to the same department.
    texts, dictionary, corpus, lda_model = dept_texts, dept_dictionary, dept_corpus, dept_model

    print(f"부서: {dept}")
    # At most five topics are printed, five words each.
    for idx, topic in dept_model.print_topics(num_words=5, num_topics=5):
        print(f"🟦 주제 {idx+1}: {topic}")
    print("\n" + "="*50 + "\n")

# pyLDAvis visualization
# NOTE(review): lda_model, corpus and dictionary are reassigned inside the
# per-department loop, so at this point they describe the last department
# processed rather than the model trained on all postings — confirm intent.
pyLDAvis.enable_notebook()
import pyLDAvis.gensim_models as gensimvis
from joblib import parallel_backend
# Run the preparation step under joblib's threading backend with one job.
with parallel_backend('threading', n_jobs=1):
    vis_data = gensimvis.prepare(lda_model, corpus, dictionary)
# Bare expression: rendered only when it is the last statement of a notebook cell.
vis_data

# ============================================
# 9️⃣ IT log data processing
# ============================================
import json
with open('IT_logs.json','r',encoding='utf-8') as f:
    data = json.load(f)
log = pd.DataFrame(data)
log['timestamp'] = pd.to_datetime(log['timestamp'])
log['hour'] = log['timestamp'].dt.hour
# Printed explicitly: as a bare expression in the middle of a script the
# per-PC row counts were computed and then discarded.
print(log['PC'].value_counts())
# expand=False returns a Series for a single capture group, so a Series
# (not a one-column DataFrame) is assigned to each new column.
log['program'] = log['window_title'].str.extract(r'\[(.*?)\]', expand=False)  # text inside the first [...]
log['file'] = log['window_title'].str.extract(r'-\s(.+)$', expand=False)  # text after "- " up to the end
# Second capture group: the text after the last "-" that follows a [...] part.
log['merge'] = log['window_title'].str.extract(r'\[.*\]\s*(.*)\s*-\s*(.*)')[1]
log['day'] = log['timestamp'].dt.date
# Titles that do not match the pattern yield NaN; drop them before grouping so
# the per-(PC, day) token lists hold only strings and no group ends up empty.
log_grouped = (
    log.dropna(subset=['merge'])
    .groupby(['PC','day'])['merge']
    .apply(list)
    .reset_index()
)
print(log_grouped.head())

# ============================================
# 10️⃣ Word2Vec embedding
# ============================================
from gensim.models import Word2Vec

# Each (PC, day) token list is treated as one sentence.
sentences = log_grouped['merge'].tolist()

# Settings shared by the vocabulary-only model and the trained model.
shared_params = dict(vector_size=30, window=5, min_count=1, workers=4)

# First pass: build the vocabulary only, to inspect its size and contents.
model = Word2Vec(**shared_params)
model.build_vocab(sentences, progress_per=10000)
vocab_preview = list(model.wv.index_to_key)[:10]
print("Vocabulary Size:", len(model.wv))
print("Example words:", vocab_preview)

# Skip-gram model (sg=1), trained for 10 epochs; replaces the model above.
model = Word2Vec(sentences=sentences, sg=1, epochs=10, **shared_params)
word_vectors = model.wv
trained_preview = list(word_vectors.index_to_key)[:10]
print("Words in vocabulary:", trained_preview)

# ============================================
# 11️⃣ t-SNE visualization
# ============================================
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import numpy as np

pc_groups = log_grouped.groupby('PC')
pc_vectors = []
labels = []

# Build one averaged word vector per (PC, day) token list, labelled by PC.
for pc, group in pc_groups:
    # Named `day_tokens`, not `log`, so the loop does not overwrite the
    # `log` DataFrame created earlier in this file.
    for day_tokens in group['merge']:
        known_vectors = [model.wv[word] for word in day_tokens if word in model.wv]
        if not known_vectors:
            # np.mean of an empty list is NaN, never None, so the old
            # `is not None` check let unusable entries through.
            continue
        pc_vectors.append(np.mean(known_vectors, axis=0))
        labels.append(pc)

pc_vectors_flattened = np.array(pc_vectors)
# Cap perplexity at 30 and keep it below the number of samples.
perplexity_value = min(30, len(pc_vectors_flattened)-1)
tsne = TSNE(n_components=2, perplexity=perplexity_value, learning_rate=200)
transformed_vectors = tsne.fit_transform(pc_vectors_flattened)
# Map each PC name to an integer, in sorted name order, for colouring.
label_mapping = {label: idx for idx,label in enumerate(sorted(set(labels)))}
numeric_labels = [label_mapping[label] for label in labels]

plt.figure(figsize=(12,8))
scatter = plt.scatter(transformed_vectors[:,0], transformed_vectors[:,1], c=numeric_labels, cmap='tab20')
# num=None yields one handle per distinct value in ascending order, which
# matches label_mapping's key order. The default "auto" may return fewer
# handles when there are many distinct values, mislabelling the legend.
legend_handles, _ = scatter.legend_elements(num=None)
plt.legend(handles=legend_handles, labels=list(label_mapping))
plt.title("100-day Logs Distribution by PC")
plt.xlabel("t-SNE Dimension 1")
plt.ylabel("t-SNE Dimension 2")
plt.colorbar(scatter)
plt.show()

# ============================================
# 12️⃣ RandomForest-based PC classification
# ============================================
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
import seaborn as sns

# Hold out 20% of the averaged log vectors for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    pc_vectors_flattened,
    labels,
    test_size=0.2,
    random_state=42,
)

# Hyper-parameters gathered in one place.
rf_params = {
    'n_estimators': 9,
    'max_depth': 3,
    'min_samples_split': 10,
    'min_samples_leaf': 5,
    'max_features': 'sqrt',
    'random_state': 42,
}
rf_model = RandomForestClassifier(**rf_params)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)

# Macro-averaged F1 over the PC classes.
f1 = f1_score(y_test, y_pred, average='macro')
print(f"Macro F1-Score: {f1:.2f}")

# Confusion matrix with rows and columns ordered like rf_model.classes_.
class_names = rf_model.classes_
cm = confusion_matrix(y_test, y_pred, labels=class_names)
plt.figure(figsize=(8,6))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.title("Confusion Matrix for Random Forest Model")
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# ============================================
# 13️⃣ Log analysis for one PC (E013)
# ============================================
from collections import Counter

# Per-day token lists recorded for PC 'E013'; get_group raises KeyError
# when that PC is absent.
logs_e013 = pc_groups.get_group('E013')['merge'].values

# Count every entry across all of that PC's days.
e013_counter = Counter()
for daily_items in logs_e013:
    e013_counter.update(daily_items)
top_5_e013 = e013_counter.most_common(5)

print("\nTop 5 frequent logs for E013:")
for item, count in top_5_e013:
    print(f"{item}: {count}")
