In [None]:
import pandas as pd
import io
import random
import csv
from tqdm import tqdm
from google.colab import files
import nltk
from nltk.util import ngrams
from nltk import word_tokenize
from nltk import ConditionalFreqDist
from nltk.probability import ConditionalProbDist, MLEProbDist
import codecs
from konlpy.tag import Okt

nltk.download('punkt')
!pip3 install konlpy


In [None]:
uploaded = files.upload()
comments_log = list(uploaded.keys())[0]

In [22]:
with codecs.open(comments_log, encoding = 'utf-8') as f:
  reader = csv.reader(f, delimiter='\t')   # Read and process data
  data = list(reader)[1:]

comment = [row[0] for row in data]   # Extract documents
print("텍스트 데이터:", comment[:5])
print("\n문장 개수: ", len(comment))

텍스트 데이터: ['"t1행동"', '"세최미"', '"어이 페이커 이걸로 "동률"이다"', '"연륜"', '"일보후퇴 이보전진"']

문장 개수:  9250


In [14]:
tagger = Okt()

def tokenize(text):

  tokens = ['/'.join(t) for t in tagger.pos(text)]   # Tokenize text using KoNLPy

  return tokens

In [15]:
sentences = []

# Create bigrams from tekenized sentences
for d in tqdm(comment):
  tokens = tokenize(d)
  bigram = ngrams(tokens, 2, pad_left = True, pad_right = True, left_pad_symbol = "", right_pad_symbol = "")
  sentences += [t for t in bigram]


print(sentences[:10])

100%|██████████| 9251/9251 [00:08<00:00, 1120.11it/s]

[('', '\ufeff/Foreign'), ('\ufeff/Foreign', 'Comments/Alpha'), ('Comments/Alpha', ''), ('', '"/Punctuation'), ('"/Punctuation', 't/Alpha'), ('t/Alpha', '1/Number'), ('1/Number', '행동/Noun'), ('행동/Noun', '"/Punctuation'), ('"/Punctuation', ''), ('', '"/Punctuation')]





In [16]:
# Create Conditional Frequency Distribution
cfd = ConditionalFreqDist(sentences)
print(cfd[""].most_common(5))

[('대/Verb', 614), ('와/Verb', 308), ('대상/Noun', 234), ('티원/Noun', 178), ('제/Modifier', 147)]


In [17]:
# Function to find most common words following a given token
def most_common(c, n, pos = None):

  if pos is None:
    return cfd[tokenize(c)[0]].most_common(n)
  else:
    return cfd["/".join([c, pos])].most_common(n)


print(most_common("대", 10))

[('상/Noun', 498), ('황/Noun', 77), ('우/Adverb', 16), ('0/Number', 11), ('마/Noun', 9), ('떡/Noun', 5), ('오/Noun', 4), ('표/Noun', 4), ('1/Number', 3), ('-/Punctuation', 3)]


In [18]:
# Create Conditional Probability Distribution based on word frequencies
cpd = ConditionalProbDist(cfd, MLEProbDist)
print(cpd[tokenize(".")[0]].prob(""))

0.2191780821917808


In [19]:
# Probability of a word w following a context c in bigram
def bigram_prob(c, w):

  context = tokenize(c)[0]
  word = tokenize(w)[0]

  return cpd[context].prob(word)


print(bigram_prob("대", "상"))
print(bigram_prob("상", "대"))

0.7410714285714286
0.0


In [20]:
# Generate a sentence based on the learned probabilities
def generate_sentence(seed = None):

  if seed is not None:
    random.seed(seed)
  c = ""
  sentence = []

  while True:
    if c not in cpd:
      break

    w = cpd[c].generate()

    if w == "":
      break

    word = w.split("/")[0]
    pos = w.split("/")[1]

    # Add words to the sentence based on certain conditions
    if c == "":
      sentence.append(word.title())
    elif c in ["`", "\"","'","("]:
      sentence.append(word)
    elif word in ["'", ".", ",", ")", ":", ";", "?"]:
      sentence.append(word)
    elif pos in ["Josa", "Punctuation", "Suffix"]:
        sentence.append(word)
    elif w in ["임/Noun", "것/Noun", "는걸/Noun", "릴때/Noun",
                "되다/Verb", "이다/Verb", "하다/Verb", "이다/Adjective"]:
        sentence.append(word)
    else:
        sentence.append(" " + word)
    c = w

  return "".join(sentence)

In [21]:
# Print sentences
for i in range(1, 11):
    print(generate_sentence(i))

구마 유시가 막히게 현상금 붙네
표 시기 잘 해 ㅋㅋㅋㅋㅋㅋ
대 상 혁
대 황 식
오우 너 ㅋㅋㅋㅋ 그냥 미쳤네 ㅋㅋㅋㅋㅋㅋㅋ
주도권 나갔다
도사 ㄷㄷ
대 포 고 듀가 만만한가
발차기 실패
압승인데 ㅋㅋㅋㅋㅋㅋㅋ
