In [1]:
# https://tech.kakao.com/2021/06/25/kakao-ai-recommendation-01/

## Step1. Read Data

In [2]:
import pandas as pd

# Load the ad-click table from CSV into a DataFrame.
# Layout per the original note ("row by item -> user by campaign"):
# presumably one row per user and one column per ad campaign — TODO confirm.
data_path = 'data/Ads_CTR_Optimisation.csv'

df = pd.read_csv(data_path)
# Preview the first rows to sanity-check the columns.
df.head()

Unnamed: 0,Ad 1,Ad 2,Ad 3,Ad 4,Ad 5,Ad 6,Ad 7,Ad 8,Ad 9,Ad 10
0,1,0,0,0,1,0,0,0,1,0
1,0,0,0,0,0,0,0,0,1,0
2,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,0,0,0,0


## Step2. Topic modeling

In [15]:
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD

# Fit a 3-topic LDA model on `df` and keep the per-row topic weights.
lda_model = LatentDirichletAllocation(
    n_components=3,            # number of latent topics to learn
    learning_method='online',  # mini-batch (online) variational Bayes updates
    batch_size=128,            # rows consumed per online update
    max_iter=10,               # maximum passes over the training data
    evaluate_every=-1,         # non-positive: never evaluate perplexity while fitting
    random_state=100,          # fixed seed so the fit is reproducible
    n_jobs=-1,                 # parallelise across every available CPU
)
lda_output = lda_model.fit_transform(df)  # fit, then return topic weights for each row
print(lda_model)  # echo the estimator's repr


LatentDirichletAllocation(learning_method='online', n_components=3, n_jobs=-1,
                          random_state=100)


In [16]:
# Show the learned topic-by-feature weight matrix, then its shape on the next line.
print(lda_model.components_, lda_model.components_.shape, sep="\n")

[[3.67596654e-01 1.30176405e+03 3.53350098e-01 3.52499355e-01
  4.28403079e-01 3.48914777e-01 3.53893092e-01 1.04840515e+00
  9.56596197e+02 3.53430483e-01]
 [4.45985030e-01 3.45371281e-01 7.33959543e+02 1.20076191e+03
  3.79852882e-01 1.25423028e+02 1.11828506e+03 3.76645549e-01
  3.44437547e-01 4.83383274e+02]
 [1.69726786e+03 3.43910757e-01 3.49582969e-01 3.50186205e-01
  2.71339161e+03 3.54120513e-01 3.51645393e-01 2.07115087e+03
  3.44230497e-01 3.49138529e-01]]
(3, 10)


In [23]:
def get_topics(components, feature_names, n=10, verbose=True):
    """Collect the ``n`` highest-weighted features of every topic.

    Parameters
    ----------
    components : iterable of numpy.ndarray
        One 1-D weight vector per topic (e.g. the rows of
        ``lda_model.components_``).
    feature_names : sequence
        Feature labels, indexable by the column positions of ``components``.
    n : int, default 10
        Number of top features to keep per topic. If a topic has fewer than
        ``n`` features, all of them are returned.
    verbose : bool, default True
        When true, print each topic's label and mapping as it is built
        (the original behaviour). Pass ``False`` to only return the result.

    Returns
    -------
    dict
        ``{"Topic k:": {feature_name: weight}}`` with ``k`` starting at 1.
        Weights are rounded to 2 decimals and each inner dict is ordered
        from the largest weight to the smallest.
    """
    items = {}

    for idx, topic in enumerate(components):
        label = "Topic %d:" % (idx + 1)
        # argsort() is ascending, so stepping backwards from the end yields
        # the indices of the n largest weights, largest first.
        top_indices = topic.argsort()[:-n - 1:-1]
        # Build the mapping once and reuse it for both printing and returning.
        item = {feature_names[i]: topic[i].round(2) for i in top_indices}

        if verbose:
            print(label, item)

        items[label] = item

    return items
# Topic vectors: label each topic's weights with the DataFrame's column names.
terms = list(df.columns)
items = get_topics(components=lda_model.components_, feature_names=terms)

Topic 1: {'Ad 2': 1301.76, 'Ad 9': 956.6, 'Ad 8': 1.05, 'Ad 5': 0.43, 'Ad 1': 0.37, 'Ad 7': 0.35, 'Ad 10': 0.35, 'Ad 3': 0.35, 'Ad 4': 0.35, 'Ad 6': 0.35}
Topic 2: {'Ad 4': 1200.76, 'Ad 7': 1118.29, 'Ad 3': 733.96, 'Ad 10': 483.38, 'Ad 6': 125.42, 'Ad 1': 0.45, 'Ad 5': 0.38, 'Ad 8': 0.38, 'Ad 2': 0.35, 'Ad 9': 0.34}
Topic 3: {'Ad 5': 2713.39, 'Ad 8': 2071.15, 'Ad 1': 1697.27, 'Ad 6': 0.35, 'Ad 7': 0.35, 'Ad 4': 0.35, 'Ad 3': 0.35, 'Ad 10': 0.35, 'Ad 9': 0.34, 'Ad 2': 0.34}


In [24]:
# Display the per-topic {feature name: weight} mapping returned by get_topics.
items

{'Topic 1:': {'Ad 2': 1301.76,
  'Ad 9': 956.6,
  'Ad 8': 1.05,
  'Ad 5': 0.43,
  'Ad 1': 0.37,
  'Ad 7': 0.35,
  'Ad 10': 0.35,
  'Ad 3': 0.35,
  'Ad 4': 0.35,
  'Ad 6': 0.35},
 'Topic 2:': {'Ad 4': 1200.76,
  'Ad 7': 1118.29,
  'Ad 3': 733.96,
  'Ad 10': 483.38,
  'Ad 6': 125.42,
  'Ad 1': 0.45,
  'Ad 5': 0.38,
  'Ad 8': 0.38,
  'Ad 2': 0.35,
  'Ad 9': 0.34},
 'Topic 3:': {'Ad 5': 2713.39,
  'Ad 8': 2071.15,
  'Ad 1': 1697.27,
  'Ad 6': 0.35,
  'Ad 7': 0.35,
  'Ad 4': 0.35,
  'Ad 3': 0.35,
  'Ad 10': 0.35,
  'Ad 9': 0.34,
  'Ad 2': 0.34}}

In [18]:
# User topic vector: take the first row of df as a one-row DataFrame.
# Indexing with a list keeps the result two-dimensional and preserves each
# column's dtype, avoiding the Series round-trip and transpose.
sample = df.iloc[[0]]

In [19]:
# Project the sample row onto the topics using the already-fitted LDA model.
sample_output = lda_model.transform(sample)
# Show the result: one row holding one weight per topic.
sample_output

array([[0.33334723, 0.08333462, 0.58331815]])

## Step3. 피드백을 통한 실시간 최적화

In [None]:
## Real-time optimization -> MAB (multi-armed bandit)