In [4]:
import pandas as pd
import os
import json
import openai
from openai import OpenAI
import numpy as np
from tqdm.notebook import tqdm, trange
from typing import List
from sklearn.cluster import KMeans 

In [5]:
# Load the raw headline dataset into `df`.
df = pd.read_csv('./data/abcnews_2020.csv')

In [6]:
# Preview the first rows of the loaded data.
df.head()

Unnamed: 0,publish_date,headline_text
0,20200101,a new type of resolution for the new year
1,20200101,adelaide records driest year in more than a de...
2,20200101,adelaide riverbank catches alight after new ye...
3,20200101,adelaides 9pm fireworks spark blaze on riverbank
4,20200101,archaic legislation governing nt women propert...


In [7]:
def create_embeddings(txt_list: List[str], model='text-embedding-3-small', client=None) -> List[List[float]]:
    """Embed a batch of texts with a single embeddings request.

    Args:
        txt_list: Texts to embed; the whole list is sent in one request.
        model: Name of the embedding model to request.
        client: Optional pre-built client exposing ``embeddings.create``.
            When omitted, a new ``OpenAI()`` client is created for this call.
            Passing one in lets callers reuse a client across batches.

    Returns:
        One embedding per item in ``response.data`` (its ``embedding``
        attribute), in the order the response lists them. An empty
        ``txt_list`` returns ``[]`` without making a request.
    """
    # Nothing to embed: skip client construction and the network round-trip.
    if len(txt_list) == 0:
        return []

    if client is None:
        client = OpenAI()

    response = client.embeddings.create(input=txt_list, model=model)
    return [item.embedding for item in response.data]

In [ ]:
# Embed every headline in fixed-size batches, keeping the vectors in row order.
# NOTE(review): `headline_emb` is not written to disk or to `df` in any visible
# cell — confirm how it relates to the embeddings CSV loaded later.
batch_size = 2000
headline = df['headline_text'].tolist()
headline_emb = []

for start in trange(0, len(headline), batch_size):
    # Slicing clamps at the end of the list, so the final batch may be shorter.
    chunk = headline[start:start + batch_size]
    headline_emb += create_embeddings(chunk)


In [9]:
# Rebind `df` to the dataset version that carries a `headline_emb` column.
# NOTE(review): no visible cell writes this file — presumably it was saved from
# the embeddings computed above; confirm, or add the save step so that
# Restart & Run All works from a clean checkout.
df = pd.read_csv('./data/abcnews_2020_emb.csv')

In [10]:
# The embeddings come back from the CSV as JSON text; decode each into a list.
# Values that are no longer strings are passed through unchanged so that
# re-running this cell does not fail with a TypeError from json.loads.
df['headline_emb'] = df['headline_emb'].apply(
    lambda value: json.loads(value) if isinstance(value, str) else value
)

In [12]:
# Assign every headline embedding to one of 30 K-Means clusters (seeded).
kmeans = KMeans(n_clusters=30, random_state=0)
clusters = kmeans.fit_predict(df['headline_emb'].tolist())
df['cluster'] = clusters

In [13]:
# Preview the frame, now including the `cluster` column.
df.head()

Unnamed: 0,publish_date,headline_text,headline_emb,cluster
0,20200101,a new type of resolution for the new year,"[-0.029918815940618515, 0.027576250955462456, ...",14
1,20200101,adelaide records driest year in more than a de...,"[0.02332385629415512, 0.024166574701666832, 0....",17
2,20200101,adelaide riverbank catches alight after new ye...,"[0.008539590984582901, -0.00674605555832386, 0...",13
3,20200101,adelaides 9pm fireworks spark blaze on riverbank,"[0.03185156360268593, -1.126696679421002e-05, ...",6
4,20200101,archaic legislation governing nt women propert...,"[0.05419066920876503, 0.061877865344285965, 0....",11


In [14]:
# Inspect the headlines that were assigned to cluster 18.
in_cluster_18 = df['cluster'] == 18
df.loc[in_cluster_18]

Unnamed: 0,publish_date,headline_text,headline_emb,cluster
212,20200104,china to identify cause of mystery pneumonia p...,"[-0.006448061671108007, -0.054372843354940414,...",18
360,20200106,mysterious illness in china is not sars,"[-0.0005059370887465775, -0.033365555107593536...",18
1045,20200115,who says new china coronavirus could spread; w...,"[-0.013549289666116238, -0.028695644810795784,...",18
1237,20200117,thailand finds second case of new chinese coro...,"[-0.02747490629553795, -0.044669169932603836, ...",18
1265,20200118,china reports new virus cases raising concern ...,"[0.004373899661004543, -0.031095510348677635, ...",18
...,...,...,...,...
2385,20200131,how deadly is the coronavirus,"[-0.01725398749113083, 0.0033517233096063137, ...",18
2437,20200131,wall street volatile coronavirus australian do...,"[-0.07781743258237839, -0.02185390144586563, 0...",18
2442,20200131,who coronavirus global emergency,"[-0.025180980563163757, -0.004652089439332485,...",18
2443,20200131,who declares coronavirus outbreak as global he...,"[-0.04890184849500656, -0.033849552273750305, ...",18


### 정보의 다양성 Diversity 측정

In [15]:
from sklearn.metrics.pairwise import cosine_similarity

def calculate_diversity(df, column_name):
    """Compute pairwise cosine similarities between the embeddings in a column.

    Despite the function name, the returned score is the *mean cosine
    similarity* over all pairs of distinct rows, so a higher value means the
    rows are more alike (i.e. less diverse).

    Args:
        df: DataFrame whose ``column_name`` column holds one embedding vector
            per row. The vectors are stacked into a single 2-D array, so they
            must all have the same length.
        column_name: Name of the embedding column.

    Returns:
        A tuple ``(cosine_sim, avg_similarity)``. ``cosine_sim`` is the
        ``(n, n)`` similarity matrix with its diagonal set to NaN, and
        ``avg_similarity`` is the NaN-ignoring mean of that matrix. With fewer
        than two rows there are no pairs to compare, so an all-NaN ``(n, n)``
        matrix and a NaN score are returned.
    """
    n_rows = len(df)
    if n_rows < 2:
        # No distinct pairs exist. Return early: np.vstack raises on an empty
        # sequence, and np.nanmean over an all-NaN matrix emits a
        # "Mean of empty slice" RuntimeWarning.
        return np.full((n_rows, n_rows), np.nan), np.nan

    # Stack the per-row vectors and compare every embedding with every other.
    embeddings = np.vstack(df[column_name].tolist())
    cosine_sim = cosine_similarity(embeddings)

    # Blank out self-comparisons (the diagonal) so they do not inflate the mean.
    np.fill_diagonal(cosine_sim, np.nan)
    mean_similarity = np.nanmean(cosine_sim)

    return cosine_sim, mean_similarity


In [16]:
# `dist` is the pairwise cosine-similarity matrix over all rows of `df`
# (diagonal set to NaN); `avg` is the NaN-ignoring mean of that matrix.
dist, avg = calculate_diversity(df, 'headline_emb')

In [17]:
# Mean pairwise cosine similarity across all headlines.
avg

0.19837456404442783

In [18]:
# Mean pairwise cosine similarity inside each of clusters 0-14.
# NOTE(review): KMeans was fit with n_clusters=30, so clusters 15-29 are not
# scored here — confirm whether limiting to the first 15 is intentional.
diversity_score = {}
for k in range(15):
    members = df.loc[df['cluster'] == k]
    diversity_score[k] = calculate_diversity(members, 'headline_emb')[1]

In [19]:
# Show the per-cluster scores.
diversity_score

{0: 0.30127963292576626,
 1: 0.4339087803488167,
 2: 0.2147994063589832,
 3: 0.3984893541964128,
 4: 0.8579692271248458,
 5: 0.4212757317776546,
 6: 0.3205253159622736,
 7: 0.44165128926019925,
 8: 0.5049445264108715,
 9: 0.18107570874467785,
 10: 0.34823013001835207,
 11: 0.19773536876618047,
 12: 0.4786903719253222,
 13: 0.4870874258713398,
 14: 0.21972834313499837}

In [20]:
### Outlier detection

In [21]:
from sklearn.ensemble import IsolationForest

In [22]:
# Select the rows of cluster 10 for outlier inspection.
is_cluster_10 = df['cluster'] == 10
cluster = df.loc[is_cluster_10]

In [23]:
# Flag a fraction (contamination=0.05) of the selected cluster's headlines as
# outliers based on their embeddings.
# random_state is fixed so the same headlines are flagged on every run;
# without a seed the forest is built from different random draws each time.
iso_forest = IsolationForest(contamination=0.05, random_state=0)  # Adjust contamination as needed
anomalies = iso_forest.fit_predict(cluster['headline_emb'].tolist())

# Keep the headline text of the rows labelled -1.
anomalous_headlines = np.array(cluster['headline_text'].tolist())[anomalies == -1]

In [24]:
# Per-row labels returned by fit_predict; -1 marks the flagged rows.
anomalies

array([ 1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1, -1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1])

In [25]:
# Headlines in the selected cluster whose label is -1.
anomalous_headlines

array(['three americans killed in al shabaab militant attack in kenya',
       'aeroplane dumps fuel over schools in emergency landing',
       'large air tanker c 130 water bomber crash cooma'], dtype='<U64')