In [1]:
import re
import pickle
from collections import defaultdict
from tqdm import tqdm

import pandas as pd

from dbConn.get_query import get_select
from modules.cleaning import cleaning

In [2]:
# Fetch every post that has a location, skipping one excluded hashtag value.
# Adjacent string literals are concatenated by the parser, so the SQL text
# sent to the database is unchanged.
query = (
    "select pk, location_pk, content, hashtag "
    "    from scrapping.scrapping.instagram_post where hashtag != '라이크어로컬' and location_pk notnull;"
)
posts = get_select(query)

In [3]:
# Fetch (post_pk, content) for every stored comment.
# Adjacent string literals are concatenated by the parser, so the SQL text
# sent to the database is unchanged.
query = (
    "select post_pk, content "
    "    from scrapping.scrapping.instagram_comment;"
)
comments = get_select(query)

In [4]:
# Group comment bodies under the pk of the post they were written on:
# {post_pk: [comment_content, ...]}
post_pk_comments = defaultdict(list)
for post_pk, content in comments:
    post_pk_comments[post_pk].append(content)

In [5]:
# Read the stopword file into a list, one entry per line
# (splitlines() drops the line terminators).
with open("./stopwords.txt", "r") as stopword_file:
    stopwords = stopword_file.read().splitlines()


def get_hashtag(sentence):
    """Return the hashtag bodies (text following ``#``) found in ``sentence``.

    A hashtag body is a run of one or more ASCII digits, ASCII letters or
    Hangul syllables directly after a ``#``.

    Args:
        sentence: Text to scan. Falsy values (``None``, ``""``) are accepted.

    Returns:
        list[str]: Tag bodies in order of appearance, without the leading
        ``#``. Empty list when ``sentence`` is falsy or contains no tags.
    """
    if not sentence:
        return []
    # `+` rather than `*`: a lone '#' (or '#' followed by punctuation) is not
    # a hashtag and must not produce an empty-string entry.
    return re.findall(r"#([0-9a-zA-Z가-힣]+)", sentence)


def get_content_hashtag(posts):
    """Build the cleaned text record for each post.

    For every post, hashtags are extracted from the post content and from the
    comments registered for that post in the module-level
    ``post_pk_comments`` mapping. The hashtags and the content are each passed
    through ``cleaning`` and then joined into a single text.

    Args:
        posts: Iterable of 4-tuples ``(pk, location_pk, content, _)``; the
            fourth field is ignored.

    Returns:
        defaultdict(dict): ``{pk: {"hashtag", "content", "pk", "location_pk",
        "join"}}``. Posts whose joined text is empty are not included.
    """
    pk_content_hashtag = defaultdict(dict)
    for pk, location_pk, content, _ in tqdm(posts):
        # .get() instead of indexing: post_pk_comments is a defaultdict, so
        # indexing would insert an empty list for every post without comments.
        all_comments = " ".join(post_pk_comments.get(pk, []))
        all_tags = get_hashtag(content) + get_hashtag(all_comments)

        # cleaning() may return a falsy value; normalise that to "".
        hashtag = cleaning(" ".join(all_tags)) or ""
        cleaned = cleaning(content) or ""
        joined = (cleaned + " " + hashtag).strip()

        if not joined:
            # Nothing usable for this post. Checked for EVERY post (the old
            # trailing check only ever looked at the last pk). pop() also
            # discards an earlier record stored under the same pk.
            pk_content_hashtag.pop(pk, None)
            continue

        pk_content_hashtag[pk] = {
            "hashtag": hashtag,
            "content": cleaned,
            "pk": pk,
            "location_pk": location_pk,
            "join": joined,
        }

    return pk_content_hashtag

In [6]:
# pk_content_hashtag = get_content_hashtag(posts)

# with open('./wordrank_data/pk_content_hashtag.pkl', 'wb') as f:
#     pickle.dump(pk_content_hashtag, f)

100%|██████████| 1779152/1779152 [02:22<00:00, 12485.17it/s]


In [2]:
# Load the previously pickled per-post dictionary from disk.
# NOTE: unpickling can execute arbitrary code; only load files this project
# produced itself.
with open('./wordrank_data/pk_content_hashtag.pkl', 'rb') as pkl_file:
    pk_content_hashtag = pickle.load(pkl_file)

In [7]:
# Print how many posts are stored, then display the last ten keys.
print(len(pk_content_hashtag))
list(pk_content_hashtag)[-10:]

1740902


[2606672384618638559,
 2606672207325641885,
 2606671680513173050,
 2606671565346021925,
 2606670987882625740,
 2606670880164388488,
 2606670493309475518,
 2606670028378223158,
 2606668507709181424,
 2606667995203811908]

In [8]:
# Display the record stored under one specific post pk.
pk_content_hashtag[2606668507709181424]

{'hashtag': '',
 'content': '사진너무잘찍어 딘땨',
 'pk': 2606668507709181424,
 'location_pk': 773850536122935,
 'join': '사진너무잘찍어 딘땨'}

In [9]:
# Collect the joined text of every post under its location:
# {location_pk: [text, ...]}
loc_pk_texts = defaultdict(list)
for record in pk_content_hashtag.values():
    loc_pk_texts[record['location_pk']].append(record['join'])

In [10]:
from krwordrank.word import KRWordRank

# Extraction settings, hoisted out of the loop because they never change.
MIN_TEXTS_PER_LOCATION = 20  # locations need MORE than this many texts
beta = 0.85
max_iter = 10

# {location_pk: [(keyword, score), ...]} sorted by descending score.
location_keywords = defaultdict(list)
# {location_pk: repr(exception)} for locations whose extraction failed.
location_keyword_errors = {}

for location_pk, texts in tqdm(loc_pk_texts.items()):
    if len(texts) <= MIN_TEXTS_PER_LOCATION:
        continue

    # A fresh extractor is built for every location, as before.
    wordrank_extractor = KRWordRank(
        min_count=10,
        max_length=10,
        verbose=False,
    )

    try:
        keywords, _rank, _graph = wordrank_extractor.extract(texts, beta, max_iter)
        sorted_keywords = sorted(keywords.items(), key=lambda item: item[1], reverse=True)
    except Exception as exc:
        # Still best-effort (one bad location must not abort the run), but the
        # failure is recorded instead of vanishing, and KeyboardInterrupt /
        # SystemExit are no longer swallowed as they were by a bare `except:`.
        location_keyword_errors[location_pk] = repr(exc)
        continue

    location_keywords[location_pk] = sorted_keywords

if location_keyword_errors:
    print(f"keyword extraction failed for {len(location_keyword_errors)} locations")

len(location_keywords)

  0%|          | 0/70543 [00:00<?, ?it/s]

In [15]:
# Fetch the pk, name and raw columns of every stored location.
query = "select pk, name, raw from scrapping.scrapping.instagram_location;"
loc_infos = get_select(query)

In [16]:
# Index the location rows by pk: {pk: {'name': ..., 'raw': ...}}
loc_infos_dic = defaultdict(dict)
for pk, name, raw in loc_infos:
    loc_infos_dic[pk].update({'name': name, 'raw': raw})

In [21]:
# Build four parallel columns (location pk, name, raw, keywords), one entry
# per location that has extracted keywords.
l, n, r, k = [], [], [], []
for loc_pk, keywords in location_keywords.items():
    info = loc_infos_dic[loc_pk]
    l.append(loc_pk)
    n.append(info['name'])
    r.append(info['raw'])
    k.append(keywords)
# Print the four column lengths side by side.
print(*(len(column) for column in (l, n, r, k)))

4188 4188 4188 4188


In [22]:
# Assemble one row per location, write it to CSV without the index column,
# then display the frame.
df_loc_keywords = pd.DataFrame(
    {
        'location_pk': l,
        'name': n,
        'keywords_score': k,
        'raw': r,
    }
)
df_loc_keywords.to_csv(
    './wordrank_data/df_loc_keywords_20_mincnt10.csv',
    encoding='utf-8',
    index=False,
)
df_loc_keywords

Unnamed: 0,location_pk,name,keywords_score,raw
0,221484355286107,"Jeonju, Korea","[(전주, 10970.07289114403), (일상, 997.04900113188...","{'pk': 221484355286107, 'lat': 35.823659221906..."
1,245713997,Jeonju,"[(전주, 46336.424911983595), (일상, 4768.285453512...","{'pk': 245713997, 'lat': 35.8219, 'lng': 127.1..."
2,345849689434417,베르자르당,"[(전주, 62.4733045248046), (순창, 49.4889714878311...","{'pk': 345849689434417, 'lat': 35.362007, 'lng..."
3,242947360,전주한옥마을,"[(전주, 4508.08688526555), (여행, 803.755099390192...","{'pk': 242947360, 'lat': 37.5287516782, 'lng':..."
4,110836031109980,카페루츠,"[(오늘도, 17.96774090256985), (전주, 12.68119581235...","{'pk': 110836031109980, 'lat': 35.821234091394..."
...,...,...,...,...
4183,372402660182337,여느꽃,"[(전주여느꽃플라워스튜디오, 23.665361996900394), (여느꽃, 9.9...","{'pk': 372402660182337, 'lat': 7.7063182184948..."
4184,295195034600610,선우메이크업,"[(전주, 102.97322807384937), (선우, 45.66366673357...","{'pk': 295195034600610, 'lat': 35.837501525879..."
4185,106787627705108,바른치킨전주혁신점,"[(전주, 22.19829571590334), (국내산, 18.83295803086...","{'pk': 106787627705108, 'lat': 35.825550387163..."
4186,183992339157180,클래시에스,"[(전주혁신도시, 21.969388171957554), (되었어요, 9.643384...","{'pk': 183992339157180, 'lat': 35.83851825264,..."
