In [None]:
# Notebook configuration (placeholders -- replace with real values before running).
# `path` is joined to the file names below by plain string concatenation
# (path + filename, path + awlistFile), so it must end with a path separator.
path = "dataset path"
# CSV file that is read into the complaints DataFrame.
filename = 'dataset csv file *.csv'
# Excel workbook; its 'stopwords' sheet is read to build the stop-word list.
awlistFile = 'anchor words file *.xlsx'


In [None]:
import numpy as np
import re
import pandas as pd
import scipy.sparse as ss
import matplotlib.pyplot as plt
from matplotlib import font_manager, rc
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_selection import mutual_info_classif
%matplotlib inline

import MeCab
import nltk
from nltk.tokenize import word_tokenize
from nltk import FreqDist 
from pprint import pprint
from tqdm import tqdm
from konlpy.tag import Mecab 
from konlpy.tag import *

# Module-level MeCab-ko tagger (placeholder dictionary path -- replace before running).
mc = Mecab(dicpath='The path of the MeCab-ko dictionary.') # The path of the MeCab-ko dictionary.

# Stop words live in the 'stopwords' sheet of the anchor-word workbook;
# the 'space' column of that sheet is turned into a plain Python list.
stwDF = pd.read_excel(
    path + awlistFile,
    sheet_name='stopwords',
)
stwlist = stwDF['space'].tolist()

class MyTokenizer:
    """Callable tokenizer that keeps content words from a POS-tagged sentence.

    A token is kept when its POS tag is in ``POSTAGS``, it is longer than one
    character, and it is not a stop word.

    Parameters
    ----------
    tagger : object
        Morphological analyser exposing ``pos(sent, flatten=True)`` and
        returning an iterable of ``(word, tag)`` pairs.
    stopwords : iterable of str, optional
        Words to drop. When omitted, the module-level ``stwlist`` is looked up
        at call time, which is what the original implementation did.
    """

    # POS tags of the tokens that are kept.
    POSTAGS = frozenset(['NNP', 'NNG', 'VV', 'VA', 'SL', 'VV+ETN'])

    def __init__(self, tagger, stopwords=None):
        self.tagger = tagger
        self.stopwords = None if stopwords is None else frozenset(stopwords)

    def __call__(self, sent):
        """Return the list of kept tokens of ``sent``, in sentence order."""
        stopwords = stwlist if self.stopwords is None else self.stopwords
        # Tag once, with the injected tagger. The previous version tagged the
        # sentence with self.tagger, threw the result away, and then tagged it
        # again with the global `mc`.
        return [
            word
            for word, tag in self.tagger.pos(sent, flatten=True)
            if tag in self.POSTAGS and len(word) > 1 and word not in stopwords
        ]

# Reuse the tagger created above instead of loading a second MeCab instance
# from a separately maintained dictionary-path literal.
my_tokenizer = MyTokenizer(mc)

In [None]:
def _clean_complaint(text):
    """Return ``text`` with every run of non-letter characters collapsed to one space.

    Only ASCII letters and Hangul syllables are kept. Missing values become an
    empty string; previously ``str(NaN)`` produced the literal word "nan",
    which passed the letter filter and ended up in the vocabulary.
    """
    if pd.isna(text):
        return ""
    return " ".join(re.sub("[^a-zA-Z가-힣]+", " ", str(text)).split())


dldefect_df = pd.read_csv(path+filename, encoding='utf-8')
# Clean the single column directly instead of applying a lambda row by row.
rawComplaints = dldefect_df['complaint'].map(_clean_complaint).to_list()
dldefect_df['complaint'] = rawComplaints
# Preview only; the full frame stays available as dldefect_df.
dldefect_df.head()

In [None]:
#doc vectorization
# Bag-of-words counts over the cleaned complaints, limited to the 1000 most
# frequent unigrams produced by my_tokenizer.
vectorizer = CountVectorizer(max_features=1000, binary=False,
                             ngram_range=(1, 1), tokenizer=my_tokenizer, stop_words=stwlist)

doc_word = vectorizer.fit_transform(rawComplaints)
doc_word = ss.csr_matrix(doc_word)

# Vocabulary terms ordered by their column index.
idx2vocab = [vocab for vocab, idx in sorted(vectorizer.vocabulary_.items(), key=lambda x: x[1])]

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back only on versions that predate it.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
dft_words = list(np.asarray(feature_names))

# Drop purely numeric terms, keeping the matrix columns and `words` aligned.
not_digit_inds = [ind for ind, word in enumerate(dft_words) if not word.isdigit()]
doc_word = doc_word[:, not_digit_inds]
words = [dft_words[ind] for ind in not_digit_inds]

print("doc_word's shape: ",doc_word.shape)
print("length of idx2vocab: ",len(idx2vocab), idx2vocab[:10])
print("len of dft_words",len(dft_words))
print("len of not_digit_inds",len(not_digit_inds))
print("len of words: ",len(words))

In [None]:
# One label per complaint, in row order.
y_label_all = list(dldefect_df['label'])

# Distinct labels in a deterministic order. Iterating a bare set of strings
# can give a different order in every Python process (hash randomization),
# which made the code <-> label mapping change between kernel restarts.
try:
    y_labels = sorted(set(y_label_all))
except TypeError:
    # Mixed, non-comparable label types: order by their string form instead.
    y_labels = sorted(set(y_label_all), key=str)
print(y_labels)

# code -> label
y_label_dict = dict(enumerate(y_labels))
# label -> code, so each lookup is O(1) instead of a scan over the dictionary.
label_to_code = {label: code for code, label in y_label_dict.items()}

# NOTE(review): as in the original, this encodes the *distinct* labels only
# (one code per class), not every entry of y_label_all -- confirm that this
# is what downstream code expects.
y_label_encoded = [label_to_code[label] for label in y_labels]

print(len(y_labels), len(y_label_encoded))
print(y_label_encoded)

In [None]:
# Mutual information between the columns of doc_word and the complaint labels.
# doc_word is a sparse count matrix, so with discrete_features='auto' the
# features are treated as discrete.
a = mutual_info_classif(doc_word, y_label_all, discrete_features='auto')
# Number of scores returned (shown as the cell output).
len(a)

In [None]:
# Mutual-information score of every remaining vocabulary term.
mi_scores = mutual_info_classif(doc_word, y_label_all, discrete_features='auto')

# Pair the scores with `words`, the term list that was filtered together with
# the columns of doc_word. The unfiltered vectorizer feature list can be
# longer than doc_word has columns, in which case zip() would silently attach
# scores to the wrong terms (and get_feature_names() no longer exists in
# scikit-learn >= 1.2).
if len(words) != len(mi_scores):
    raise ValueError(
        "words and doc_word are out of sync: %d terms vs %d scores"
        % (len(words), len(mi_scores))
    )
results = dict(zip(words, mi_scores))

# (term, score) pairs, highest score first.
sorting = sorted(results.items(), key=lambda x: x[1], reverse=True)

In [None]:
# Split the ranked (term, score) pairs into two parallel columns.
aw = [term for term, _ in sorting]
miscore = [score for _, score in sorting]

# Terms with their mutual-information scores, in ranked order.
awDF = pd.DataFrame({'words': aw, 'mi_score': miscore})

awDF