In [1]:
import pandas as pd

python = pd.read_csv("python.csv")
python.head()

Unnamed: 0,text_amount,keyword_mentioned,commentNum,keyword_title_data,link_num_data,player_num_data,img_num_data,past_date,five_day_visitor,label
0,2826,16,6,1,0,1,14,12,180045,0
1,2914,6,0,1,1,0,11,11,5268,0
2,2765,9,0,1,1,0,10,2,3770,0
3,2651,6,3,1,1,0,11,75,5613,0
4,2672,8,0,1,0,0,9,2,1335,0


In [2]:
from sklearn.tree import DecisionTreeClassifier
import sklearn.model_selection as ms

In [3]:
# 트레인 데이터와 테스트 데이터 7:3 비율로 분할
# 참고 사이트 : https://blog.naver.com/winddori2002/221659080425
X = python.iloc[:,:-1]
y = python.iloc[:,-1]

X_train, X_test, y_train, y_test = ms.train_test_split(X, y,test_size = 0.3, random_state = 100)
# DT 객체 생성 및 훈련
dt_clf = DecisionTreeClassifier()
dt_clf.fit(X_train,y_train)

# 예측값 저장
y_pred = dt_clf.predict(X_test)

import sklearn.metrics as mt

# 학습결과 평가 

print('Train_Accuracy: ', dt_clf.score(X_train, y_train),'\n')

accuracy = mt.accuracy_score(y_test, y_pred)
recall = mt.recall_score(y_test, y_pred)
precision = mt.precision_score(y_test, y_pred)
f1_score = mt.f1_score(y_test, y_pred)
matrix = mt.confusion_matrix(y_test, y_pred)

print('Accuracy: ', format(accuracy,'.2f'),'\n')
print('Recall: ', format(recall,'.2f'),'\n')
print('Precision: ', format(precision,'.2f'),'\n')
print('F1_score: ', format(f1_score,'.2f'),'\n')
print('Confusion Matrix:','\n', matrix)

Train_Accuracy:  1.0 

Accuracy:  0.96 

Recall:  0.98 

Precision:  0.98 

F1_score:  0.98 

Confusion Matrix: 
 [[  2   6]
 [  7 274]]


In [4]:
# 교차검증
from sklearn.model_selection import cross_val_score, cross_validate

# 각 폴드의 스코어 
scores = cross_val_score(dt_clf, X, y, cv = 5)
scores

pd.DataFrame(cross_validate(dt_clf, X, y, cv =5))
print('교차검증 평균: ', scores.mean())

교차검증 평균:  0.9304512089810018


In [5]:
from sklearn.model_selection import GridSearchCV

# 테스트하고자 하는 파라미터 값들을 사전타입으로 정의

dt_clf = DecisionTreeClassifier(random_state=33)
parameters = {'max_depth': [3, 5, 7],
              'min_samples_split': [3, 5],
              'splitter': ['best', 'random']}

grid_dt = GridSearchCV(dt_clf, # estimator 객체,
                      param_grid = parameters, cv = 5,
                      # n_jobs = -1: 모든 cpu를 사용)
                      )

grid_dt.fit(X_train, y_train)

result = pd.DataFrame(grid_dt.cv_results_['params'])
result['mean_test_score'] = grid_dt.cv_results_['mean_test_score']
result.sort_values(by='mean_test_score', ascending=False)

Unnamed: 0,max_depth,min_samples_split,splitter,mean_test_score
5,5,3,random,0.962875
1,3,3,random,0.959912
3,3,5,random,0.959912
7,5,5,random,0.959889
0,3,3,best,0.958419
2,3,5,best,0.958419
4,5,3,best,0.953941
9,7,3,random,0.949508
6,5,5,best,0.949497
11,7,5,random,0.945053


In [6]:
best_dt = DecisionTreeClassifier(criterion = 'gini', max_depth=4, min_samples_split=3).fit(X_train, y_train)

In [7]:
python.feature_names = ['text_amount','keyword_mentioned','commentNum','keyword_title_data','link_num_data','player_num_data',
                         'img_num_data','past_date','five_day_visitor','label']
qwe = python.feature_names

  python.feature_names = ['text_amount','keyword_mentioned','commentNum','keyword_title_data','link_num_data','player_num_data',


In [8]:
import numpy as np
from sklearn.tree import _tree

def tree_to_code1(tree, feature_names):
    tree_ = tree.tree_
    feature_name = [feature_names[i] 
                    if i != _tree.TREE_UNDEFINED else "undefined!" 
                    for i in tree_.feature]
    print("def tree({}):".format(", ".join(feature_names)))

    def recurse(node, depth):
        indent = "    " * depth
        if tree_.feature[node] != _tree.TREE_UNDEFINED:
            name = feature_name[node]
            threshold = tree_.threshold[node]
            print("{}if {} <= {}:".format(indent, name, threshold))
            recurse(tree_.children_left[node], depth + 1)
            print("{}else:  # if {} > {}".format(indent, name, threshold))
            recurse(tree_.children_right[node], depth + 1)
        else:
            if(np.argmax(tree_.value[node])==0):
                print("{}return {}".format(indent, np.argmax(tree_.value[node])))
                

    recurse(0, 1)

In [9]:
tree_to_code1(best_dt, qwe)

def tree(text_amount, keyword_mentioned, commentNum, keyword_title_data, link_num_data, player_num_data, img_num_data, past_date, five_day_visitor, label):
    if five_day_visitor <= 164258.0:
        if past_date <= 60.5:
            if img_num_data <= 6.5:
                if past_date <= 9.5:
                else:  # if past_date > 9.5
            else:  # if img_num_data > 6.5
                if five_day_visitor <= 3550.0:
                else:  # if five_day_visitor > 3550.0
        else:  # if past_date > 60.5
            if five_day_visitor <= 40458.5:
                if past_date <= 75.5:
                else:  # if past_date > 75.5
            else:  # if five_day_visitor > 40458.5
                if five_day_visitor <= 44563.0:
                    return 0
                else:  # if five_day_visitor > 44563.0
    else:  # if five_day_visitor > 164258.0
        return 0


In [10]:
#의사결정나무 규칙 txt파일화
def maketxt(asd):
    import sys
    
    temp = sys.stdout
    sys.stdout = open(asd, "w")
    tree_to_code1(best_dt, qwe)
    sys.stdout.close()
    sys.stdout = temp
maketxt("treerule.txt")

In [11]:
def findTextCountInText(fname, word):
    cOunt = 0
    lines = 0
    with open(fname, 'r') as f:
        for line in f:
            lines = lines+1
            if word in line:
                cOunt = cOunt + 1
    return cOunt

In [12]:
def listappend(rulelist,x,lines,level):
    try:
        lines[x] = lines[x].replace("# if","")
        lines[x] = re.sub("\:|\#|","",lines[x])
        sprule = lines[x].split()
        rulelist.append([level,sprule[1],sprule[2],sprule[3]])
    except:
        rulelist.append(['return'])

In [13]:
import os
import re
f = open('treerule.txt')
nodenum = findTextCountInText('treerule.txt.','return 0') #원하는 class의 노드 수 확인

rulelist = []
level = 0

lines = f.readlines()

for x in range(1,len(lines)):#각 라인별 레벨 입력    
    if lines[x].find("                    ")==0:
        level = 5
        listappend(rulelist,x,lines,level)
        
    elif lines[x].find("                ")==0:
        level = 4
        listappend(rulelist,x,lines,level)
        
    elif lines[x].find("            ")==0:
        level = 3
        listappend(rulelist,x,lines,level)
        
    elif lines[x].find("        ")==0:
        level = 2
        listappend(rulelist,x,lines,level)
        
    else:
        level = 1
        listappend(rulelist,x,lines,level)
    
    
    
f.close()

In [14]:
returnrule = []
rulnum = 0;
for i in reversed(range(0,len(rulelist))):
    if rulelist[i][0]=='return':
        returnlevel = rulelist[i-1][0]
        for k in reversed(range(0,i)):
            if returnlevel==0:
                returnrule.append(["","",""])
                rulnum = rulnum+1;
                break
                
            elif returnlevel==rulelist[k][0]:  
                returnrule.append([rulelist[k][1],rulelist[k][2],rulelist[k][3]])
                returnlevel = returnlevel-1
                rulnum = rulnum+1;
                
            else:
                continue

In [15]:
import pandas as pd
usercsv = pd.read_csv('user.csv')

In [16]:
asd = []
asdf = 1

for i in range(0,len(returnrule)):
    if returnrule[i][0]=='five_day_visitor':
        target = str(usercsv.five_day_visitor)
        if returnrule[i][1]=='>':
            if target<=returnrule[i][2]:
                asdf = asdf*0
        else:
            if target>returnrule[i][2]:
                asdf = asdf*0
    
    elif returnrule[i][0]=='keyword_mentioned':
        target = str(usercsv.keyword_mentioned)
        if returnrule[i][1]=='>':
            if target<=returnrule[i][2]:
                asdf = asdf*0
        else:
            if target>returnrule[i][2]:
                asdf = asdf*0
    
    elif returnrule[i][0]=='commentNum':
        target = str(usercsv.commentNum)
        if returnrule[i][1]=='>':
            if target<=returnrule[i][2]:
                asdf = asdf*0
        else:
            if target>returnrule[i][2]:
                asdf = asdf*0
            
    elif returnrule[i][0]=='keyword_title_data':
        target = str(usercsv.keyword_title_data)
        if returnrule[i][1]=='>':
            if target<=returnrule[i][2]:
                asdf = asdf*0
        else:
            if target>returnrule[i][2]:
                asdf = asdf*0
                
    elif returnrule[i][0]=='link_num_data':
        target = str(usercsv.link_num_data)
        if returnrule[i][1]=='>':
            if target<=returnrule[i][2]:
                asdf = asdf*0
        else:
            if target>returnrule[i][2]:
                asdf = asdf*0
    elif returnrule[i][0]=='player_num_data':
        target = str(usercsv.player_num_data)
        if returnrule[i][1]=='>':
            if target<=returnrule[i][2]:
                asdf = asdf*0
        else:
            if target>returnrule[i][2]:
                asdf = asdf*0
                
    elif returnrule[i][0]=='img_num_data':
        target = str(usercsv.img_num_data)
        if returnrule[i][1]=='>':
            if target<=returnrule[i][2]:
                asdf = asdf*0
        else:
            if target>returnrule[i][2]:
                asdf = asdf*0
                
    elif returnrule[i][0]=='past_date':
        target = str(usercsv.past_date)
        if returnrule[i][1]=='>':
            if target<=returnrule[i][2]:
                asdf = asdf*0
        else:
            if target>returnrule[i][2]:
                asdf = asdf*0
            
    elif returnrule[i][0]=='five_day_visitor':
        target = str(usercsv.five_day_visitor)
        if returnrule[i][1]=='>':
            if target<=returnrule[i][2]:
                asdf = asdf*0
        else:
            if target>returnrule[i][2]:
                asdf = asdf*0
    else:
        asd.append(asdf)
        asdf=1
asd.append(asdf)

In [17]:
for i in range(0,len(asd)):
    k = k*asd[i]

if k==0:
    print("해당 글은 조건을 만족하지 않습니다")
else:
    print("해당 글은 조건을 만족합니다")

해당 글은 조건을 만족하지 않습니다


In [18]:
returnrule

[['five_day_visitor', '>', '164258.0'],
 ['', '', ''],
 ['five_day_visitor', '<=', '44563.0'],
 ['five_day_visitor', '>', '40458.5'],
 ['past_date', '>', '60.5'],
 ['five_day_visitor', '<=', '164258.0']]

In [43]:
rules = []

def rule(returnrule):
    
    startline = 0
    chec = 1
    for i in range(0,rulnum-1):
        if returnrule[i][0] == '':
            rules.append("")
            rules.append("다음 규칙은 ")
            startline = i
        elif i == 0 or i == 1:
            rules.append(returnrule[i][0]+returnrule[i][1]+returnrule[i][2])
        else:
            for j in range(startline,i):
                if returnrule[i][0] == returnrule[j][0] and returnrule[i][1] == returnrule[j][1]:
                    break
                else:
                    chec == chec*1
            if chec == 1:
                rules.append(returnrule[i][0]+returnrule[i][1]+returnrule[i][2])
    
    rules.insert(0,"첫번째 규칙은")
            
        

In [44]:
rule(returnrule)

In [45]:
for i in range(len(rules)):
    print(rules[i])

첫번째 규칙은
five_day_visitor>164258.0

다음 규칙은 
five_day_visitor<=44563.0
five_day_visitor>40458.5
past_date>60.5
