In [0]:
import numpy as np
import pandas as pd

In [0]:
import nltk
nltk.download()

In [0]:
from nltk.corpus import  names
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

In [0]:
# remove number and punctuation 
def letters_only(word):
  return word.isalpha()


# remove name entity
from nltk.corpus import names
all_names = set(names.words())

# lancaster stemmer
from nltk.stem import LancasterStemmer
ls = LancasterStemmer()


In [0]:
# lancaster stemming
# 대문자를 소문자로 변환
# 숫자, 기호, 이름 제거

def clean_text(doc):
  cleaned_doc = []
  for word in doc.split(' '): 
    word = word.lower() # ABD -> abd
  
    if letters_only(word) and word not in all_names and len(word) > 2: # remove number and punc. and name entity
      cleaned_doc.append(ls.stem(word))
  return ' '.join(cleaned_doc) 

In [0]:
# stopword 제거

from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 

stop_words = set(stopwords.words('english'))
stop_words.update(['arent', 'didnt', 'couldnt', 'cant', 'doesnt', 'dont', 'hes', 'hadnt', 'hasnt', 'havent', 'isnt', 'mightnt',
                   'mustnt', 'neednt', 'shant', 'shes', 'shouldve', 'shouldnt', 'thatll', 'wasnt', 'werent', 'wont', 'wouldnt', 
                   'youd', 'youll', 'youre', 'youve', 'would', 'maybe', 'might'])

def rm_stopwords(doc):
  result = []
  doc_tokens = word_tokenize(doc)
  for w in doc_tokens:
    if w not in stop_words:
      result.append(w)
  return ' '.join(result)

In [0]:
# count_words 함수 정의
# tuple 데이터를 변수로 받음
# 각 word가 5 가지 classes(attempt, behavior, ideation, indicator, supportive)에서 몇 번 나오는지 세는 함수
# 출력은 dictrionary
def count_words(training_set):
  counts_dict = {}
  for post, labelnumber in training_set:
    for word in word_tokenize(post):
      if word not in counts_dict:
        counts_dict[word] = [0, 0, 0, 0, 0]
      if labelnumber == 0:
        counts_dict[word][0] += 1
      elif labelnumber == 1:
        counts_dict[word][1] += 1
      elif labelnumber == 2:
        counts_dict[word][2] += 1
      elif labelnumber == 3:
        counts_dict[word][3] += 1
      else:
        counts_dict[word][4] += 1
  return counts_dict

In [0]:
# classifier 개발

def classifier(test_set, train_set):
  ##### test set 전 처리
  cleaned_test1 = [clean_text(post) for post in test_set.Post]
  cleaned_test2 = [rm_stopwords(post) for post in cleaned_test1]
  test_set.Post = cleaned_test2

  #### train set 전 처리
  cleaned_train1 = [clean_text(post) for post in train_set.Post]
  cleaned_train2 = [rm_stopwords(post) for post in cleaned_train1]
  train_set.Post = cleaned_train2
  subset = train_set[['Post', 'LabelNumber']]
  tuples = [tuple(x) for x in subset.values]
  data = tuples

  ##### get prior
  # 각 단어 별로 몇 번 나왔는지 확인
  word_counting = count_words(data)
  # piror 확률을 계산하기 위해 각 class 개수 세기
  counts_attempt = 0
  counts_behavior = 0
  counts_ideation = 0
  counts_indicator = 0
  counts_supportive = 0
  
  for i in range(0, len(train_set)):
    if df.Label[i] == 'Attempt':
      counts_attempt += 1
    elif df.Label[i] == 'Behavior':
      counts_behavior += 1
    elif df.Label[i] == 'Ideation':
      counts_ideation += 1
    elif df.Label[i] == 'Indicator':
      counts_indicator += 1
    else:
      counts_supportive += 1
  # 각 class 별 개수를 전체 개수로 나누어 prior 확률 계산
  prob_attempt = counts_attempt / len(train_set)
  prob_behavior = counts_behavior / len(train_set)
  prob_ideation = counts_ideation / len(train_set)
  prob_indicator = counts_indicator / len(train_set)
  prob_supportive = counts_supportive / len(train_set)
  # prior 확률을 list로 만들어줌
  prob_prior = [prob_attempt, prob_behavior, prob_ideation, prob_indicator, prob_supportive]


  ##### get likelihood
  # likelihood 계산하기 위해 train set의 각 class 별 단어 개수 세기
  total_counts_byclass = [0, 0, 0, 0, 0]
  for word in word_counting:
    for i in range(0,5):
      total_counts_byclass[i] += word_counting[word][i]

  # likelihood 계산
  likelihood={}
  for word in word_counting:
    likelihood[word] = [0, 0, 0, 0, 0]
    for i in range(0,5):
      likelihood[word][i] = (word_counting[word][i] + 1) / (total_counts_byclass[i] + len(word_counting))
  
  #### 분류
  result = []
  na_count = 0
  for post in test_set.Post:
    log_prior = np.log(prob_prior)
    tokenword = word_tokenize(post)
    for i in range(0,5):
      for w in tokenword:
        if w in likelihood:
          log_prior[i] = log_prior[i] + np.log(likelihood[w][i]) # log 취해서 더 해줌. 왜냐하면 그대로 두면 값이 너무 작아짐
    result.append(np.where(log_prior == max(log_prior))[0][0])  # 각 class 별로 계산된 확률 중 높은 값의 인덱스를 확인
  result2 = np.array(result)
  return result2



In [0]:
df = pd.read_csv("TRAIN.csv")
df.test = pd.read_csv("TEST.csv")

  


In [0]:
final = classifier(df.test, df)
final

array([4, 4, 2, 2, 2, 2, 4, 2, 2, 2, 2, 4, 1, 2, 2, 2, 4, 2, 4, 2, 2, 4,
       2, 2, 2, 4, 4, 3, 2, 3, 4, 2, 2, 2, 2, 2, 4, 2, 2, 4, 2, 2, 4, 2,
       2, 3, 2, 3, 1, 2, 2, 2, 2, 2, 2, 3, 2, 2, 3, 2, 2, 4, 2, 2, 2, 2,
       2, 2, 3, 2, 2, 3, 4, 2, 4, 4, 4, 2, 2, 4, 3, 3, 4, 4, 2, 2, 2, 2,
       2, 2, 4, 2, 3, 4, 2, 3, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 1, 2, 4,
       4, 2, 2, 2, 2, 3, 4, 2, 2, 2, 2, 2, 2, 2, 4, 2, 4, 2, 2, 1, 4, 2,
       4, 2, 4, 4, 2, 2, 2, 2, 1, 4, 4, 2, 4, 2, 4, 2, 2, 2])

In [0]:
pd.DataFrame(final).to_csv("2015314036.csv", index=False)