In [1]:
# base tool
import pandas as pd
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows',100)
import numpy as np
from sklearn.model_selection import train_test_split
import copy

import warnings
warnings.filterwarnings('ignore')

#visualization
from IPython.display import display
import matplotlib.pyplot as plt
from matplotlib import font_manager, rc
font_path = "C:/Windows/Fonts/gulim.ttc"
font = font_manager.FontProperties(fname=font_path).get_name()
rc('font', family=font)

import plotly.io as pio
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
pio.templates.default = "plotly_white"

# test
from imblearn.under_sampling import RandomUnderSampler
import pingouin as pg
from scipy.stats import chi2_contingency,shapiro

from statsmodels.stats.outliers_influence import variance_inflation_factor

# resampling
from collections import Counter
from imblearn.combine import SMOTEENN
from imblearn.combine import SMOTETomek

# modeling
from pandas.api.types import CategoricalDtype
from statsmodels.miscmodels.ordinal_model import OrderedModel
from sklearn.ensemble import RandomForestClassifier

# evaluation
from sklearn.metrics import accuracy_score,confusion_matrix, plot_confusion_matrix,f1_score, classification_report



In [2]:
def summary(df, pred=None):
    obs = df.shape[0]
    Types = df.dtypes
    Counts = df.apply(lambda x: x.count())
    Min = df.min()
    Max = df.max()
    Uniques = df.apply(lambda x: x.unique().shape[0])
    Nulls = df.apply(lambda x: x.isnull().sum())
    print('Data shape:', df.shape)

    if pred is None:
        cols = ['Types', 'Counts', 'Uniques', 'Nulls', 'Min', 'Max']
        st = pd.concat([Types, Counts, Uniques, Nulls, Min, Max], axis = 1, sort=True)

    st.columns = cols
    print('___________________________\nData Types:')
    print(st.Types.value_counts())
    print('___________________________')
    return st

In [3]:
def vif(df:pd.DataFrame)->None:
    print('vif score')
    df = df.iloc[:,:-1]
    vif_scores = pd.DataFrame() 

    vif_scores["Attribute"] = df.columns 
    # calculating VIF for each feature 
    vif_scores["VIF Scores"] = [round(variance_inflation_factor(df.values, i) ,2)for i in range(len(df.columns))] 
    display(vif_scores)
    print('-'*50)


In [4]:
def pairwise(df:pd.DataFrame,disp:bool =False)->None:
    print('kendall correlation')
    print()
    df = df.iloc[:,:]
    corr = df.corr(method='kendall').round(3)
    display(corr.iloc[:-1,[-1]])
    # (1,2) (2,4) (4,1)? (6,7)
    # 1,2,4 번은 모두 상담사의 관한 질문
    if disp:
        fig = go.Figure(data=go.Heatmap(
            z=corr,
            x=corr.columns,
            y=corr.columns,
            colorscale=px.colors.diverging.RdBu,
            zmin=-1,
            zmax=1
        ))
        fig.show()
    print('-'*50)


In [15]:
def cronbach (df:pd.DataFrame)->None:
    print('cronbach-alpha test:',pg.cronbach_alpha(data=df,ci=0.95))
    print('-'*50)


In [6]:
def chi2(df:pd.DataFrame)->None:
    print('chi-square test')
    chi_df = df.iloc[:,:-1]
    y_ = df.iloc[:,-1]
    for i in range(len(chi_df.columns)):
        cross_tb = pd.crosstab(chi_df.iloc[:,i],y_)
        chi, p, dof, expected = chi2_contingency(cross_tb)
        print('문항',str(i+1),':',end=' ')
        print(f"chi 스퀘어 값: {round(chi,2)}",
            f"p-value (0.05): {p}",
            # f"자유도 수: {dof}",
            # f"기대값: \n{pd.DataFrame(expected)}",
            # f"측정값: \n{cross_tb}", sep = "\n" 
            )
    print('-'*50)



In [7]:
def plot_cm(y_test,y_pred):
    print('test confusion matrix:')
    z = confusion_matrix(y_true=y_test,y_pred= y_pred)



    x = ['0','1','2','3','4']
    y = ['0','1','2','3','4']

    # change each element of z to type string for annotations
    z_text = [[str(y) for y in x] for x in z]

    # set up figure 
    fig = ff.create_annotated_heatmap(z, x=x, y=y, annotation_text=z_text, colorscale='Viridis')

    # add title
    fig.update_layout(title_text='<i><b>Confusion matrix</b></i>',
                    #xaxis = dict(title='x'),
                    #yaxis = dict(title='x')
                    )

    # add custom xaxis title
    fig.add_annotation(dict(font=dict(color="black",size=14),
                            x=0.5,
                            y=-0.15,
                            showarrow=False,
                            text="Predicted value",
                            xref="paper",
                            yref="paper"))

    # add custom yaxis title
    fig.add_annotation(dict(font=dict(color="black",size=14),
                            x=-0.35,
                            y=0.5,
                            showarrow=False,
                            text="Real value",
                            textangle=-90,
                            xref="paper",
                            yref="paper"))

    # adjust margins to make room for yaxis title
    fig.update_layout(margin=dict(t=50, l=200))

    # add colorbar
    fig['data'][0]['showscale'] = True
    fig.show()

In [8]:
def modeling(df:pd.DataFrame)->None:
    ## ordinal regression
    X = df.iloc[:,:-1]
    y = df.iloc[:,-1]
    X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.3,random_state=0,stratify=y)
    shapiro_y = copy.deepcopy(y_train)
    print('count of train y:',sorted(Counter(y_train).items()))
    cat_type = CategoricalDtype(categories=[0,1,2,3,4], ordered=True)
    y_train = y_train.astype(cat_type)

    # for distr in ['probit','logit']:
    for distr in ['logit']:
        print('Ordinal regression',distr)


        mod = OrderedModel(y_train,
                            X_train,
                            distr=distr)
        res = mod.fit(method='bfgs')
        display(res.summary())

        # train data
        predicted = res.model.predict(res.params, exog=X_train)
        y_pred = pd.Series([i.argmax() for i in predicted]).ravel()
        if distr =='probit':
            residual  = shapiro_y -y_pred
            print(shapiro(residual))
            fig = px.histogram(residual)
            fig.show()

        # test data
        predicted = res.model.predict(res.params, exog=X_test)
        y_pred = pd.Series([i.argmax() for i in predicted]).ravel()
        

        print('-'*50)

        plot_cm(y_test=y_test,y_pred=y_pred)
        print(classification_report(y_test, y_pred))
        print('-'*50)
        print()

    
        
    
    ## random forest
    print('random forest:')
    clf = RandomForestClassifier()
    clf.fit(X_train,y_train)

    display(pd.DataFrame({'index':X.columns,'feature importance':clf.feature_importances_}).round(2))
    y_pred  = clf.predict(X_test)

    print(classification_report(y_test, y_pred))
    print()
    plot_cm(y_test=y_test,y_pred=y_pred)

    print('-'*50)


    


In [9]:
def smoteEnn(df:pd.DataFrame):
    X=df.iloc[:,:-1]
    y=df.iloc[:,-1]

    smote_enn = SMOTETomek(random_state=0)
    X_resampled, y_resampled = smote_enn.fit_resample(X, y)
    print(sorted(Counter(y).items()))
    print(sorted(Counter(y_resampled).items()))
    resampled_df = X_resampled.merge(y_resampled,left_index=True,right_index=True)
    return resampled_df

In [10]:
def pipeline(df:pd.DataFrame) ->None:
    cronbach(df)
    vif(df)
    # chi2(df)
    pairwise(df)
    modeling(df)

In [None]:
rus = RandomUnderSampler(random_state=0)
X_resampled, y_resampled = rus.fit_resample(X_train, y_train)

In [34]:
sorted(Counter(rawData.iloc[:,-1]).items())

[(0, 16), (1, 26), (2, 1333), (3, 12719), (4, 35002)]

# Introduction

# 1. Data Overview

## 1.1 data

In [9]:
# https://www.data.go.kr/data/15092388/fileData.do
# 한국건강증진개발원_국가금연지원서비스 등록정보(만족도)
raw_data = pd.read_csv('./만족도평가(2020).csv',encoding='cp949')
rawData = pd.read_csv('./만족도평가(2020).csv',encoding='cp949')


In [10]:
display(raw_data.head())
print(raw_data.shape)

Unnamed: 0,기관유형,지역,서비스구분,제공기관,출생년도,성별,등록유형,문항1,문항2,문항3,문항4,문항5,문항6,문항7
0,보건소,대전광역시,보건소 금연클리닉,대전 서구보건소,1970~1979,남,보건소,1,1,1,1,3,1,2
1,보건소,경기도,보건소 금연클리닉,경기 수원시 장안구보건소,1950~1959,남,보건소,0,0,0,0,4,0,0
2,보건소,광주광역시,보건소 금연클리닉,광주 광산구보건소,1980~1989,남,보건소,0,0,0,0,3,0,0
3,보건소,경기도,보건소 금연클리닉,경기 파주시보건소,1990~1999,남,보건소,1,1,1,0,3,1,1
4,보건소,경상북도,보건소 금연클리닉,경북 영덕군보건소,1940~1949,남,보건소,1,1,1,1,1,1,1


(55911, 14)


In [11]:
raw_data = raw_data.drop(['문항6'],axis=1)

In [16]:
summary(raw_data)

Data shape: (55911, 13)
___________________________
Data Types:
object    7
int64     6
Name: Types, dtype: int64
___________________________


Unnamed: 0,Types,Counts,Uniques,Nulls,Min,Max
기관유형,object,55911,2,0,금연지원센터,보건소
등록유형,object,55911,11,0,기타,캠페인
문항1,int64,55911,5,0,0,4
문항2,int64,55911,5,0,0,4
문항3,int64,55911,5,0,0,4
문항4,int64,55911,5,0,0,4
문항5,int64,55911,5,0,0,4
문항7,int64,55911,5,0,0,4
서비스구분,object,55911,3,0,단기금연캠프,찾아가는 금연서비스
성별,object,55911,2,0,남,여


# 2.EDA

In [21]:
x = list(raw_data.columns)
x = x[7:]
fig = make_subplots(rows=2, cols=3,subplot_titles=x,vertical_spacing=0.1,x_title='설문조사 결과 히스토그램')

trace0 = go.Histogram( x=raw_data[x[0]],)
trace1 = go.Histogram( x=raw_data[x[1]],)
trace2 = go.Histogram( x=raw_data[x[2]],)
trace3 = go.Histogram( x=raw_data[x[3]],)
trace4 = go.Histogram( x=raw_data[x[4]],)
trace5 = go.Histogram( x=raw_data[x[5]],)


fig.append_trace(trace0,1,1)
fig.append_trace(trace1, 1, 2)
fig.append_trace(trace2, 1, 3)
fig.append_trace(trace3, 2, 1)
fig.append_trace(trace4, 2, 2)
fig.append_trace(trace5, 2, 3)
fig.layout.update(height=700)
fig.update_layout(bargap=0.2)
fig.show()

# Single Response

In [23]:
# survey data slicing
survey_data= raw_data.iloc[:,7:]
# count unique value in row
val_num = survey_data.stack().groupby(level=0).apply(lambda x: len(x.unique().tolist()))
survey_data['unique_val'] = val_num
# replace num of unique val to specific unique value
survey_data['unique_val']=survey_data['unique_val'].map(lambda x: 'multiple_val' if x !=1 else x)
survey_data['unique_val'] = survey_data.apply(lambda x: 'only '+str(x['문항1']) if x['unique_val'] ==1 else x,axis=1)['unique_val']
print(survey_data['unique_val'].value_counts())

multiple_val    49484
only 1           3561
only 0           2098
only 2            765
only 3              2
only 4              1
Name: unique_val, dtype: int64


In [24]:
px.pie(survey_data,names='unique_val')

 H0: 항목1~5 has same value, then  항목7 has same value

In [26]:
# survey data slicing
survey_data= raw_data.iloc[:,7:-1]
# count unique value in row
val_num = survey_data.stack().groupby(level=0).apply(lambda x: len(x.unique().tolist()))
survey_data['unique_val'] = val_num
temp = survey_data.merge(raw_data['문항7'],left_index=True,right_index=True)[survey_data['unique_val']==1]
pred_val = temp['문항1'].ravel()
true_val =temp['문항7'].ravel()
accuracy_score(true_val,pred_val)


0.9430667644900954

In [27]:
#drop
raw_data = raw_data[survey_data['unique_val']!=1]

In [29]:
# dummy 성별, 출생년도
raw_data['성별'].replace({'남':1,'여':0},inplace=True)
year_col = sorted(raw_data['출생년도'].unique(),reverse=True)
raw_data['출생년도'].replace(year_col,range(len(year_col)),inplace=True)
raw_data = raw_data.loc[:,['출생년도','성별','문항1','문항2','문항3','문항4','문항5','문항7']]


In [30]:
#문항 5외 다른 점수 점수 반대로 설정
for col in ['문항1','문항2','문항3','문항4','문항7']:
    raw_data[col] = raw_data[col].replace([0,1,2,3,4],[4,3,2,1,0])

In [34]:
raw_data.to_csv("preprocessed_data.csv", index=False, encoding="utf-8-sig")

# 모델링

In [27]:
rawData = pd.read_csv('preprocessed_data.csv',encoding='utf-8')

In [21]:
rawData.shape

(49096, 8)

In [20]:
# full model without re-sampling
pipeline(rawData)

cronbach-alpha test: (0.5187819818758802, array([0.512, 0.525]))
--------------------------------------------------
vif score


Unnamed: 0,Attribute,VIF Scores
0,출생년도,4.55
1,성별,7.28
2,문항1,252.39
3,문항2,342.9
4,문항3,80.0
5,문항4,245.02
6,문항5,8.97


--------------------------------------------------
kendall correlation



Unnamed: 0,문항7
출생년도,0.005
성별,-0.001
문항1,0.588
문항2,0.633
문항3,0.587
문항4,0.613
문항5,0.305


--------------------------------------------------
count of train y: [(0, 11), (1, 18), (2, 933), (3, 8903), (4, 24502)]
Ordinal regression logit
Optimization terminated successfully.
         Current function value: 0.453934
         Iterations: 147
         Function evaluations: 166
         Gradient evaluations: 166


0,1,2,3
Dep. Variable:,문항7,Log-Likelihood:,-15600.0
Model:,OrderedModel,AIC:,31220.0
Method:,Maximum Likelihood,BIC:,31320.0
Date:,"Sun, 15 May 2022",,
Time:,17:32:57,,
No. Observations:,34367,,
Df Residuals:,34356,,
Df Model:,11,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
출생년도,0.0212,0.009,2.290,0.022,0.003,0.039
성별,-0.0247,0.045,-0.551,0.582,-0.113,0.063
문항1,0.4217,0.052,8.158,0.000,0.320,0.523
문항2,1.4860,0.059,25.193,0.000,1.370,1.602
문항3,0.6819,0.029,23.608,0.000,0.625,0.739
문항4,1.2476,0.052,24.081,0.000,1.146,1.349
문항5,0.4516,0.014,32.092,0.000,0.424,0.479
0/1,4.9055,0.375,13.096,0.000,4.171,5.640
1/2,0.2039,0.244,0.834,0.404,-0.275,0.683


--------------------------------------------------
test confusion matrix:


              precision    recall  f1-score   support

           0       1.00      0.20      0.33         5
           1       0.00      0.00      0.00         8
           2       0.82      0.27      0.41       400
           3       0.78      0.59      0.68      3816
           4       0.86      0.96      0.91     10500

    accuracy                           0.84     14729
   macro avg       0.69      0.40      0.46     14729
weighted avg       0.84      0.84      0.83     14729

--------------------------------------------------

random forest:


Unnamed: 0,index,feature importance
0,출생년도,0.05
1,성별,0.01
2,문항1,0.14
3,문항2,0.26
4,문항3,0.21
5,문항4,0.22
6,문항5,0.12


              precision    recall  f1-score   support

           0       0.50      0.20      0.29         5
           1       0.00      0.00      0.00         8
           2       0.79      0.33      0.47       400
           3       0.81      0.61      0.69      3816
           4       0.86      0.96      0.91     10500

    accuracy                           0.85     14729
   macro avg       0.59      0.42      0.47     14729
weighted avg       0.85      0.85      0.84     14729


test confusion matrix:


--------------------------------------------------


In [28]:
# 출생년도, 성별 제거 model without re-sampling

rawData = rawData.iloc[:,2:]
pipeline(rawData)


cronbach-alpha test: (0.7720115003434147, array([0.769, 0.775]))
--------------------------------------------------
vif score


Unnamed: 0,Attribute,VIF Scores
0,문항1,251.64
1,문항2,342.08
2,문항3,79.96
3,문항4,244.05
4,문항5,8.97


--------------------------------------------------
kendall correlation



Unnamed: 0,문항7
문항1,0.588
문항2,0.633
문항3,0.587
문항4,0.613
문항5,0.305


--------------------------------------------------
count of train y: [(0, 11), (1, 18), (2, 933), (3, 8903), (4, 24502)]
Ordinal regression logit
Optimization terminated successfully.
         Current function value: 0.454013
         Iterations: 139
         Function evaluations: 157
         Gradient evaluations: 157


0,1,2,3
Dep. Variable:,문항7,Log-Likelihood:,-15603.0
Model:,OrderedModel,AIC:,31220.0
Method:,Maximum Likelihood,BIC:,31300.0
Date:,"Sun, 15 May 2022",,
Time:,17:37:51,,
No. Observations:,34367,,
Df Residuals:,34358,,
Df Model:,9,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
문항1,0.4213,0.052,8.148,0.000,0.320,0.523
문항2,1.4852,0.059,25.178,0.000,1.370,1.601
문항3,0.6820,0.029,23.619,0.000,0.625,0.739
문항4,1.2479,0.052,24.086,0.000,1.146,1.349
문항5,0.4511,0.014,32.058,0.000,0.424,0.479
0/1,4.8598,0.371,13.086,0.000,4.132,5.588
1/2,0.2032,0.244,0.832,0.405,-0.275,0.682
2/3,1.4701,0.047,31.483,0.000,1.379,1.562
3/4,1.4391,0.011,126.133,0.000,1.417,1.461


--------------------------------------------------
test confusion matrix:


              precision    recall  f1-score   support

           0       1.00      0.20      0.33         5
           1       0.00      0.00      0.00         8
           2       0.82      0.27      0.40       400
           3       0.79      0.59      0.67      3816
           4       0.86      0.96      0.91     10500

    accuracy                           0.84     14729
   macro avg       0.69      0.40      0.46     14729
weighted avg       0.84      0.84      0.83     14729

--------------------------------------------------

random forest:


Unnamed: 0,index,feature importance
0,문항1,0.11
1,문항2,0.34
2,문항3,0.2
3,문항4,0.26
4,문항5,0.1


              precision    recall  f1-score   support

           0       0.50      0.20      0.29         5
           1       0.00      0.00      0.00         8
           2       0.87      0.29      0.44       400
           3       0.82      0.60      0.69      3816
           4       0.86      0.97      0.91     10500

    accuracy                           0.85     14729
   macro avg       0.61      0.41      0.47     14729
weighted avg       0.85      0.85      0.84     14729


test confusion matrix:


--------------------------------------------------


## TokEE

In [None]:
rawData = pd.read_csv('preprocessed_data.csv',encoding='utf-8')
rawData = rawData.iloc[:,2:]


In [23]:
X=rawData.iloc[:,:-1]
y=rawData.iloc[:,-1]

smote_enn = SMOTEENN(random_state=0)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)
rus = RandomUnderSampler(random_state=0)
X_resampled, y_resampled = rus.fit_resample(X_resampled, y_resampled)
print(sorted(Counter(y).items()))
print(sorted(Counter(y_resampled).items()))
r_resampled = X_resampled.merge(y_resampled,left_index=True,right_index=True)


[(0, 16), (1, 26), (2, 1333), (3, 12719), (4, 35002)]
[(0, 5886), (1, 5886), (2, 5886), (3, 5886), (4, 5886)]


In [32]:
temp =r_resampled.iloc[:,:]
x = list(temp.columns)
fig = make_subplots(rows=1, cols=5,subplot_titles=x,vertical_spacing=0.1,x_title='설문조사 결과 히스토그램')

trace0 = go.Histogram( x=temp[x[0]],)
trace1 = go.Histogram( x=temp[x[1]],)
trace2 = go.Histogram( x=temp[x[2]],)
trace3 = go.Histogram( x=temp[x[3]],)
trace4 = go.Histogram( x=temp[x[4]],)


fig.append_trace(trace0,1,1)
fig.append_trace(trace1, 1, 2)
fig.append_trace(trace2, 1,3)
fig.append_trace(trace3, 1, 4)
fig.append_trace(trace4, 1, 5)
fig.layout.update(height=400)
fig.update_layout(bargap=0.2)
fig.show()

In [25]:
# full model with re-sampling
pipeline(r_resampled)


cronbach-alpha test: (0.9017037462193722, array([0.9  , 0.903]))
--------------------------------------------------
vif score


Unnamed: 0,Attribute,VIF Scores
0,문항1,21.1
1,문항2,26.09
2,문항3,9.08
3,문항4,20.34
4,문항5,6.46


--------------------------------------------------
kendall correlation



Unnamed: 0,문항7
문항1,0.67
문항2,0.662
문항3,0.765
문항4,0.473
문항5,0.438


--------------------------------------------------
count of train y: [(0, 4120), (1, 4120), (2, 4121), (3, 4120), (4, 4120)]
Ordinal regression logit
Optimization terminated successfully.
         Current function value: 0.832990
         Iterations: 36
         Function evaluations: 38
         Gradient evaluations: 38


0,1,2,3
Dep. Variable:,문항7,Log-Likelihood:,-17160.0
Model:,OrderedModel,AIC:,34340.0
Method:,Maximum Likelihood,BIC:,34410.0
Date:,"Sun, 15 May 2022",,
Time:,17:35:24,,
No. Observations:,20601,,
Df Residuals:,20592,,
Df Model:,9,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
문항1,1.1668,0.027,42.918,0.000,1.113,1.220
문항2,1.2416,0.031,40.371,0.000,1.181,1.302
문항3,1.7884,0.023,76.839,0.000,1.743,1.834
문항4,-0.7900,0.023,-34.472,0.000,-0.835,-0.745
문항5,0.8744,0.018,49.967,0.000,0.840,0.909
0/1,6.2038,0.076,81.885,0.000,6.055,6.352
1/2,0.7803,0.014,55.056,0.000,0.752,0.808
2/3,0.9586,0.014,68.841,0.000,0.931,0.986
3/4,1.2317,0.013,91.979,0.000,1.205,1.258


--------------------------------------------------
test confusion matrix:


              precision    recall  f1-score   support

           0       0.60      0.53      0.56      1766
           1       0.58      0.48      0.52      1766
           2       0.58      0.76      0.66      1765
           3       0.94      0.94      0.94      1766
           4       0.98      0.98      0.98      1766

    accuracy                           0.74      8829
   macro avg       0.74      0.74      0.73      8829
weighted avg       0.74      0.74      0.73      8829

--------------------------------------------------

random forest:


Unnamed: 0,index,feature importance
0,문항1,0.13
1,문항2,0.2
2,문항3,0.36
3,문항4,0.18
4,문항5,0.12


              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1766
           1       1.00      1.00      1.00      1766
           2       1.00      1.00      1.00      1765
           3       1.00      1.00      1.00      1766
           4       1.00      1.00      1.00      1766

    accuracy                           1.00      8829
   macro avg       1.00      1.00      1.00      8829
weighted avg       1.00      1.00      1.00      8829


test confusion matrix:


--------------------------------------------------


In [54]:
# 출생년도, 성별, 항목4 제거 model with re-sampling
pipeline(r_resampled.iloc[:,[0,1,2,4,5]])


cronbach-alpha test: (0.8846527190347154, array([0.883, 0.887]))
--------------------------------------------------
vif score


Unnamed: 0,Attribute,VIF Scores
0,문항1,19.68
1,문항2,17.24
2,문항3,9.08
3,문항5,6.38


--------------------------------------------------
kendall correlation



Unnamed: 0,문항7
문항1,0.67
문항2,0.662
문항3,0.765
문항5,0.438


--------------------------------------------------
count of train y: [(0, 4120), (1, 4120), (2, 4121), (3, 4120), (4, 4120)]
Ordinal regression logit
Optimization terminated successfully.
         Current function value: 0.862570
         Iterations: 31
         Function evaluations: 33
         Gradient evaluations: 33


0,1,2,3
Dep. Variable:,문항7,Log-Likelihood:,-17770.0
Model:,OrderedModel,AIC:,35560.0
Method:,Maximum Likelihood,BIC:,35620.0
Date:,"Tue, 10 May 2022",,
Time:,11:51:19,,
No. Observations:,20601,,
Df Residuals:,20593,,
Df Model:,8,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
문항1,0.9210,0.027,34.425,0.000,0.869,0.973
문항2,0.6264,0.025,24.964,0.000,0.577,0.676
문항3,1.7854,0.023,76.448,0.000,1.740,1.831
문항5,0.8225,0.017,47.586,0.000,0.789,0.856
0/1,6.4229,0.078,82.452,0.000,6.270,6.576
1/2,0.7376,0.014,51.328,0.000,0.709,0.766
2/3,0.8204,0.014,59.420,0.000,0.793,0.848
3/4,1.1816,0.014,87.400,0.000,1.155,1.208


--------------------------------------------------
test confusion matrix:


              precision    recall  f1-score   support

           0       0.61      0.56      0.58      1766
           1       0.58      0.44      0.50      1766
           2       0.57      0.75      0.65      1765
           3       0.93      0.94      0.94      1766
           4       0.98      0.98      0.98      1766

    accuracy                           0.73      8829
   macro avg       0.73      0.73      0.73      8829
weighted avg       0.73      0.73      0.73      8829

--------------------------------------------------

random forest:


Unnamed: 0,index,feature importance
0,문항1,0.2
1,문항2,0.25
2,문항3,0.38
3,문항5,0.17


              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1766
           1       0.98      0.90      0.94      1766
           2       0.90      0.98      0.94      1765
           3       1.00      0.98      0.99      1766
           4       0.99      1.00      0.99      1766

    accuracy                           0.97      8829
   macro avg       0.97      0.97      0.97      8829
weighted avg       0.97      0.97      0.97      8829


test confusion matrix:


--------------------------------------------------
