In [4]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules 

# 연관분석

In [5]:
data = [['겨울왕국', '미녀와 야수', '쥬라기 월드', '아바타', '타이타닉'],
       ['스타워즈', '어벤져스', '아이언맨'],
       ['분노의 질주', '어벤져스'],
       ['스타워즈', '아바타', '라이온 킹', '어벤져스'],
       ['쥬라기 월드', '아바타', '해리포터'],
       ['스타워즈', '겨울왕국', '미녀와 야수', '라이온 킹', '타이타닉'],
       ['스타워즈', '어벤져스', '분노의 질주', '쥬라기 월드']]

In [8]:
te = TransactionEncoder()
te_ary = te.fit_transform(data)
te_ary

array([[ True, False,  True, False, False,  True, False, False,  True,
         True, False],
       [False, False, False, False,  True, False,  True,  True, False,
        False, False],
       [False, False, False,  True, False, False, False,  True, False,
        False, False],
       [False,  True, False, False,  True,  True, False,  True, False,
        False, False],
       [False, False, False, False, False,  True, False, False,  True,
        False,  True],
       [ True,  True,  True, False,  True, False, False, False, False,
         True, False],
       [False, False, False,  True,  True, False, False,  True,  True,
        False, False]])

In [9]:
te.columns_

['겨울왕국',
 '라이온 킹',
 '미녀와 야수',
 '분노의 질주',
 '스타워즈',
 '아바타',
 '아이언맨',
 '어벤져스',
 '쥬라기 월드',
 '타이타닉',
 '해리포터']

In [12]:
df = pd.DataFrame(te_ary, columns=te.columns_)

In [13]:
df

Unnamed: 0,겨울왕국,라이온 킹,미녀와 야수,분노의 질주,스타워즈,아바타,아이언맨,어벤져스,쥬라기 월드,타이타닉,해리포터
0,True,False,True,False,False,True,False,False,True,True,False
1,False,False,False,False,True,False,True,True,False,False,False
2,False,False,False,True,False,False,False,True,False,False,False
3,False,True,False,False,True,True,False,True,False,False,False
4,False,False,False,False,False,True,False,False,True,False,True
5,True,True,True,False,True,False,False,False,False,True,False
6,False,False,False,True,True,False,False,True,True,False,False


## 지지도 Support
### apriori(df, min_support=0.5, use_colnames=False, max_len=None, verbose=0, low_memory=False)

In [14]:
frequent_itemsets = apriori(df, use_colnames =True)
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.571429,(스타워즈)
1,0.571429,(어벤져스)


## min_support = 0.23
* 기본 값인 0.5보다 낮춰서 설정 -> (더 많은 결과가 나온다.)

In [15]:
frequent_itemsets = apriori(df, min_support=0.23, use_colnames=True)
frequent_itemsets.sort_values('support', ascending=False)

Unnamed: 0,support,itemsets
4,0.571429,(스타워즈)
6,0.571429,(어벤져스)
14,0.428571,"(어벤져스, 스타워즈)"
5,0.428571,(아바타)
7,0.428571,(쥬라기 월드)
0,0.285714,(겨울왕국)
11,0.285714,"(스타워즈, 라이온 킹)"
15,0.285714,"(아바타, 쥬라기 월드)"
13,0.285714,"(어벤져스, 분노의 질주)"
12,0.285714,"(미녀와 야수, 타이타닉)"


## 신뢰도 Confidence
* association_rules(df, metric='confidence', min_threshold=0.8, support_only=False)

In [16]:
association_rules(frequent_itemsets, metric='confidence', min_threshold=0.5).sort_values('support', 
                                                                                         ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
10,(어벤져스),(스타워즈),0.571429,0.571429,0.428571,0.75,1.3125,0.102041,1.714286
11,(스타워즈),(어벤져스),0.571429,0.571429,0.428571,0.75,1.3125,0.102041,1.714286
1,(겨울왕국),(미녀와 야수),0.285714,0.285714,0.285714,1.0,3.5,0.204082,inf
18,(겨울왕국),"(미녀와 야수, 타이타닉)",0.285714,0.285714,0.285714,1.0,3.5,0.204082,inf
17,(미녀와 야수),"(겨울왕국, 타이타닉)",0.285714,0.285714,0.285714,1.0,3.5,0.204082,inf
16,"(겨울왕국, 타이타닉)",(미녀와 야수),0.285714,0.285714,0.285714,1.0,3.5,0.204082,inf
15,"(미녀와 야수, 타이타닉)",(겨울왕국),0.285714,0.285714,0.285714,1.0,3.5,0.204082,inf
14,"(미녀와 야수, 겨울왕국)",(타이타닉),0.285714,0.285714,0.285714,1.0,3.5,0.204082,inf
13,(쥬라기 월드),(아바타),0.428571,0.428571,0.285714,0.666667,1.555556,0.102041,1.714286
12,(아바타),(쥬라기 월드),0.428571,0.428571,0.285714,0.666667,1.555556,0.102041,1.714286


# 향상도 Lift

In [17]:
# 향상도 확인
association_rules(frequent_itemsets, metric='lift', min_threshold=1).sort_values('support',
                                                                                ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
10,(어벤져스),(스타워즈),0.571429,0.571429,0.428571,0.75,1.3125,0.102041,1.714286
11,(스타워즈),(어벤져스),0.571429,0.571429,0.428571,0.75,1.3125,0.102041,1.714286
1,(겨울왕국),(미녀와 야수),0.285714,0.285714,0.285714,1.0,3.5,0.204082,inf
18,(겨울왕국),"(미녀와 야수, 타이타닉)",0.285714,0.285714,0.285714,1.0,3.5,0.204082,inf
17,(미녀와 야수),"(겨울왕국, 타이타닉)",0.285714,0.285714,0.285714,1.0,3.5,0.204082,inf
16,"(겨울왕국, 타이타닉)",(미녀와 야수),0.285714,0.285714,0.285714,1.0,3.5,0.204082,inf
15,"(미녀와 야수, 타이타닉)",(겨울왕국),0.285714,0.285714,0.285714,1.0,3.5,0.204082,inf
14,"(미녀와 야수, 겨울왕국)",(타이타닉),0.285714,0.285714,0.285714,1.0,3.5,0.204082,inf
13,(쥬라기 월드),(아바타),0.428571,0.428571,0.285714,0.666667,1.555556,0.102041,1.714286
12,(아바타),(쥬라기 월드),0.428571,0.428571,0.285714,0.666667,1.555556,0.102041,1.714286


> * 지지도= 0.23 이상, 신뢰도 = 0.5 이상, 향상도 = 1.3 이상으로 데이터를 분석하였다. 
> * 지지도는 스타워즈, 어벤져스가 가장 높았다. 둘은 신뢰도 0.75, 향상도 1.312500으로 모두 같다. 
> ### * 따라서, 스타워즈를 본 사람에게는 어벤져스를 추천하고 어벤져스를 본 사람에게는 스타워즈를 추천한다.

---
# 60191095 강하연 과제2

In [21]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [23]:
df = pd.read_csv('/Users/hayeon/Downloads/2021-2(3-2)/머신러닝/data/titanic.csv', index_col=0)

In [24]:
# 컬럼 제거
df = df.drop(['Name', 'Ticket', 'Cabin', 'Embarked'], axis=1)

# 결측치 대체
df.Age.fillna(df.Age.median(), inplace=True)

# 성별을 진위형 변수로 변경
df['Sex'] = df.Sex.map({'female':0, 'male':1})

# 변수, 정답 분리
X = np.array(df.iloc[:, 1:])
y = np.array(df['Survived'])

# training, test 셋으로 나눔
X_train, X_test, y_train, y_test = train_test_split(X, y)

# 모델적합
tree = DecisionTreeClassifier(max_depth=3)
tree.fit(X_train, y_train)

temp_y_pred = tree.predict(X_test)

temp_acc = accuracy_score(y_test, temp_y_pred)

print('정확도: ', format(temp_acc))

정확도:  0.8161434977578476


In [25]:
tree.feature_importances_

array([0.15084827, 0.62793834, 0.08094074, 0.06433285, 0.        ,
       0.0759398 ])

In [26]:
pd.DataFrame(tree.feature_importances_.reshape((1, -1)), index=['feature_importance'])

Unnamed: 0,0,1,2,3,4,5
feature_importance,0.150848,0.627938,0.080941,0.064333,0.0,0.07594


In [27]:
df.columns

Index(['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare'], dtype='object')

In [28]:
pd.DataFrame(tree.feature_importances_.reshape((1, -1)), columns=df.columns[1:], index=['feature_importance'])

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare
feature_importance,0.150848,0.627938,0.080941,0.064333,0.0,0.07594


> ### **성별의 변수중요도가 가장 높다.**

# 원핫인코딩_ 탑승정보 포함

In [29]:
from sklearn.preprocessing import OneHotEncoder

df = pd.read_csv('/Users/hayeon/Downloads/2021-2(3-2)/머신러닝/data/titanic.csv', index_col=0)
print(df.shape)
df.head()

(891, 11)


Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [30]:
df['Sex'] = df.Sex.map({'female':0, 'male':1})

df = df.drop(['Name', 'Ticket', 'Cabin'], axis=1)

df.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [31]:
df.Age.fillna(df.Age.median(), inplace=True)
df.dropna(inplace=True)

In [32]:
df = df.reset_index(drop=True)

In [33]:
df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.0,1,0,7.2500,S
1,1,1,0,38.0,1,0,71.2833,C
2,1,3,0,26.0,0,0,7.9250,S
3,1,1,0,35.0,1,0,53.1000,S
4,0,3,1,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...
884,0,2,1,27.0,0,0,13.0000,S
885,1,1,0,19.0,0,0,30.0000,S
886,0,3,0,28.0,1,2,23.4500,S
887,1,1,1,26.0,0,0,30.0000,C


In [34]:
encoder = OneHotEncoder()
encoder.fit(df[['Embarked']])

onehot = encoder.transform(df[['Embarked']])
onehot = onehot.toarray()

onehot

array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       ...,
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.]])

In [35]:
onehot = pd.DataFrame(onehot)
onehot

Unnamed: 0,0,1,2
0,0.0,0.0,1.0
1,1.0,0.0,0.0
2,0.0,0.0,1.0
3,0.0,0.0,1.0
4,0.0,0.0,1.0
...,...,...,...
884,0.0,0.0,1.0
885,0.0,0.0,1.0
886,0.0,0.0,1.0
887,1.0,0.0,0.0


In [36]:
# 참고) encoder의 'get_feature_name()'을 사용하면 컬럼이름을 자동생성할 수 있다. 
encoder.get_feature_names()

array(['x0_C', 'x0_Q', 'x0_S'], dtype=object)

In [37]:
onehot.columns = ['C', 'Q', 'S']
onehot

Unnamed: 0,C,Q,S
0,0.0,0.0,1.0
1,1.0,0.0,0.0
2,0.0,0.0,1.0
3,0.0,0.0,1.0
4,0.0,0.0,1.0
...,...,...,...
884,0.0,0.0,1.0
885,0.0,0.0,1.0
886,0.0,0.0,1.0
887,1.0,0.0,0.0


In [38]:
onehot = pd.concat([df, onehot], axis=1)

In [39]:
onehot

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,C,Q,S
0,0,3,1,22.0,1,0,7.2500,S,0.0,0.0,1.0
1,1,1,0,38.0,1,0,71.2833,C,1.0,0.0,0.0
2,1,3,0,26.0,0,0,7.9250,S,0.0,0.0,1.0
3,1,1,0,35.0,1,0,53.1000,S,0.0,0.0,1.0
4,0,3,1,35.0,0,0,8.0500,S,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
884,0,2,1,27.0,0,0,13.0000,S,0.0,0.0,1.0
885,1,1,0,19.0,0,0,30.0000,S,0.0,0.0,1.0
886,0,3,0,28.0,1,2,23.4500,S,0.0,0.0,1.0
887,1,1,1,26.0,0,0,30.0000,C,1.0,0.0,0.0


In [40]:
df = onehot.drop('Embarked', axis=1)

In [41]:
df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,C,Q,S
0,0,3,1,22.0,1,0,7.2500,0.0,0.0,1.0
1,1,1,0,38.0,1,0,71.2833,1.0,0.0,0.0
2,1,3,0,26.0,0,0,7.9250,0.0,0.0,1.0
3,1,1,0,35.0,1,0,53.1000,0.0,0.0,1.0
4,0,3,1,35.0,0,0,8.0500,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...
884,0,2,1,27.0,0,0,13.0000,0.0,0.0,1.0
885,1,1,0,19.0,0,0,30.0000,0.0,0.0,1.0
886,0,3,0,28.0,1,2,23.4500,0.0,0.0,1.0
887,1,1,1,26.0,0,0,30.0000,1.0,0.0,0.0


In [42]:
X = np.array(df.iloc[:, 1:])
y = np.array(df['Survived'])

X_train, X_test, y_train, y_test = train_test_split(X, y)

tree = DecisionTreeClassifier(max_depth=3)
tree.fit(X_train, y_train)


temp_y_pred = tree.predict(X_test)

temp_acc_emb = accuracy_score(y_test, temp_y_pred)

print('정확도: ', format(temp_acc_emb))

정확도:  0.820627802690583


In [43]:
tree.feature_importances_

array([0.14915885, 0.62189074, 0.06139779, 0.07094684, 0.        ,
       0.09660578, 0.        , 0.        , 0.        ])

In [44]:
pd.DataFrame(tree.feature_importances_.reshape((1, -1)), index=['feature_importance'])

Unnamed: 0,0,1,2,3,4,5,6,7,8
feature_importance,0.149159,0.621891,0.061398,0.070947,0.0,0.096606,0.0,0.0,0.0


In [45]:
df.columns

Index(['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'C', 'Q',
       'S'],
      dtype='object')

In [46]:
pd.DataFrame(tree.feature_importances_.reshape((1, -1)), columns=df.columns[1:], index=['feature_importance'])

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,C,Q,S
feature_importance,0.149159,0.621891,0.061398,0.070947,0.0,0.096606,0.0,0.0,0.0


> ### **성별의 변수중요도가 가장 높음, 탑승항구 컬럼의 중요도가 0이다.**

In [47]:
print("정확도(탑승항구x):", format(temp_acc))
print('정확도(탑승항구포함): ', format(temp_acc_emb))

정확도(탑승항구x): 0.8161434977578476
정확도(탑승항구포함):  0.820627802690583


> ## **탑승항구 컬럼을 포함하지 않았을 경우의 정확도가 포함했을 경우보다 더 높다.**