In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# 读取数据集并存放在winedata中并显示前几个数据

In [3]:
winedata_file = open(".\winemag-data_first150k.csv",encoding='utf-8')
winedata = pd.read_csv(winedata_file)
winedata = winedata.dropna()
winedata.head()

Unnamed: 0,﻿id,country,description,designation,points,price,province,region_1,region_2,variety,winery
0,0,US,This tremendous 100% varietal wine hails from ...,Martha's Vineyard,96,235.0,California,Napa Valley,Napa,Cabernet Sauvignon,Heitz
2,2,US,Mac Watson honors the memory of a wine once ma...,Special Selected Late Harvest,96,90.0,California,Knights Valley,Sonoma,Sauvignon Blanc,Macauley
3,3,US,"This spent 20 months in 30% new French oak, an...",Reserve,96,65.0,Oregon,Willamette Valley,Willamette Valley,Pinot Noir,Ponzi
8,8,US,This re-named vineyard was formerly bottled as...,Silice,95,65.0,Oregon,Chehalem Mountains,Willamette Valley,Pinot Noir,Bergström
9,9,US,The producer sources from two blocks of the vi...,Gap's Crown Vineyard,95,60.0,California,Sonoma Coast,Sonoma,Pinot Noir,Blue Farm


# 获取wine数据集的所有属性信息 除去第一个id的属性和description的属性，id属性没有包含任何模式信息，description属性信息处理较为复杂，故全部去除

In [4]:
winedata = winedata.drop([winedata.columns[0]],axis=1)
winedata = winedata.drop([winedata.columns[1]],axis=1)
winedata.info()
winedata.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39241 entries, 0 to 150916
Data columns (total 9 columns):
country        39241 non-null object
designation    39241 non-null object
points         39241 non-null int64
price          39241 non-null float64
province       39241 non-null object
region_1       39241 non-null object
region_2       39241 non-null object
variety        39241 non-null object
winery         39241 non-null object
dtypes: float64(1), int64(1), object(7)
memory usage: 3.0+ MB


Unnamed: 0,country,designation,points,price,province,region_1,region_2,variety,winery
0,US,Martha's Vineyard,96,235.0,California,Napa Valley,Napa,Cabernet Sauvignon,Heitz
2,US,Special Selected Late Harvest,96,90.0,California,Knights Valley,Sonoma,Sauvignon Blanc,Macauley
3,US,Reserve,96,65.0,Oregon,Willamette Valley,Willamette Valley,Pinot Noir,Ponzi
8,US,Silice,95,65.0,Oregon,Chehalem Mountains,Willamette Valley,Pinot Noir,Bergström
9,US,Gap's Crown Vineyard,95,60.0,California,Sonoma Coast,Sonoma,Pinot Noir,Blue Farm


# 首先对数据预处理
# 在数据集中可以发现points和price是连续数据，进行离散化处理
# 可视化观察两者的数据分布

In [5]:
winedata['price'].loc[winedata['price']<200].hist(bins=20)
plt.show()
winedata['points'].hist()
plt.show()

# 首先处理points
# 再处理price

In [6]:
bin = [0,75,80,85,90,95,100]
winedata['points'] = pd.cut(winedata['points'],bin)
winedata['points'] = winedata['points'].astype('str')
bin = [0,20,30,40,50,60,2100]
winedata['price'] = pd.cut(winedata['price'],bin)
winedata['price'] = winedata['price'].astype('str')
winedata.head()

Unnamed: 0,country,designation,points,price,province,region_1,region_2,variety,winery
0,US,Martha's Vineyard,"(95, 100]","(60, 2100]",California,Napa Valley,Napa,Cabernet Sauvignon,Heitz
2,US,Special Selected Late Harvest,"(95, 100]","(60, 2100]",California,Knights Valley,Sonoma,Sauvignon Blanc,Macauley
3,US,Reserve,"(95, 100]","(60, 2100]",Oregon,Willamette Valley,Willamette Valley,Pinot Noir,Ponzi
8,US,Silice,"(90, 95]","(60, 2100]",Oregon,Chehalem Mountains,Willamette Valley,Pinot Noir,Bergström
9,US,Gap's Crown Vineyard,"(90, 95]","(50, 60]",California,Sonoma Coast,Sonoma,Pinot Noir,Blue Farm


# 构建Aprior算法

In [7]:
def createC1( dataSet ):
    """
    构建初始候选项集的列表，即所有候选项集只包含一个元素，
    C1是大小为1的所有候选项集的集合
    """
    C1 = []
    for transaction in np.array(dataSet):
        for item in transaction:
            if [item] not in C1:
                C1.append( [item] )
    C1.sort()
    return list(map( frozenset, C1 ))

def scanD( D, Ck, minSupport ):
    """
    计算Ck中的项集在数据集合D(记录或者transactions)中的支持度,
    返回满足最小支持度的项集的集合，和所有项集支持度信息的字典。
    """
    ssCnt = {}
    for tid in D:
        # 对于每一条transaction
        if Ck is not None:
            for can in Ck:
                # 对于每一个候选项集can，检查是否是transaction的一部分
                # 即该候选can是否得到transaction的支持
                if can.issubset( tid ):
                    ssCnt[ can ] = ssCnt.get( can, 0) + 1
    numItems = float( len( D ) )
    retList = []
    supportData = {}
    for key in ssCnt:
        # 每个项集的支持度
        support = ssCnt[ key ] / numItems
        
        # 将满足最小支持度的项集，加入retList
        if support >= minSupport:
            retList.insert( 0, key )
            
            # 汇总支持度数据
            supportData[ key ] = support
    return retList, supportData

# C1 = createC1(winedata)
# D = list(map(set,np.array(winedata)))
# L,supportData = scanD(D,C1,0.5)
# print(L)

In [12]:
# Aprior算法
def aprioriGen( Lk, k ):
    '''
    由初始候选项集的集合Lk生成新的生成候选项集，
    k表示生成的新项集中所含有的元素个数
    '''
    retList = []
    lenLk = len( Lk )
    for i in range( lenLk ):
        for j in range( i + 1, lenLk ):
            L1 = list( Lk[ i ] )[ : k - 2 ];
            L2 = list( Lk[ j ] )[ : k - 2 ];
            L1.sort();L2.sort()    
            if L1==L2:
                retList.append( Lk[ i ] | Lk[ j ] ) 
    return retList

def apriori( dataSet, minSupport = 0.5 ):
    # 构建初始候选项集C1
    C1 = createC1( dataSet )
    
    # 将dataSet集合化，以满足scanD的格式要求
    D =list( map( set, dataSet ))
    
    # 构建初始的频繁项集，即所有项集只有一个元素
    L1, suppData = scanD( D, C1, minSupport )
    L = [ L1 ]
    # 最初的L1中的每个项集含有一个元素，新生成的
    # 项集应该含有2个元素，所以 k=2
    k = 2
    
    while ( len( L[ k - 2 ] ) > 0 ):
        Ck = aprioriGen( L[ k - 2 ], k )
        Lk, supK = scanD( D, Ck, minSupport )
        
        # 将新的项集的支持度数据加入原来的总支持度字典中
        suppData.update( supK )
        
        # 将符合最小支持度要求的项集加入L
        L.append( Lk )
        
        # 新生成的项集中的元素个数应不断增加
        k += 1
    # 返回所有满足条件的频繁项集的列表，和所有候选项集的支持度信息
    return L, suppData

# 对于置信度大于0.1的模式进行进行挖掘，显示出频繁项集

In [13]:
myDat = list(map(set,np.array(winedata)))
L, suppData = apriori(myDat, 0.1)
L 

[[frozenset({'Columbia Valley'}),
  frozenset({'(60, 2100]'}),
  frozenset({'(80, 85]'}),
  frozenset({'Pinot Noir'}),
  frozenset({'US'}),
  frozenset({'(30, 40]'}),
  frozenset({'Chardonnay'}),
  frozenset({'(20, 30]'}),
  frozenset({'Central Coast'}),
  frozenset({'(40, 50]'}),
  frozenset({'(85, 90]'}),
  frozenset({'(0, 20]'}),
  frozenset({'Washington'}),
  frozenset({'California'}),
  frozenset({'Napa'}),
  frozenset({'(90, 95]'}),
  frozenset({'Cabernet Sauvignon'}),
  frozenset({'Sonoma'})],
 [frozenset({'Chardonnay', 'US'}),
  frozenset({'(0, 20]', 'US'}),
  frozenset({'California', 'Chardonnay'}),
  frozenset({'California', 'Napa'}),
  frozenset({'California', 'US'}),
  frozenset({'(90, 95]', 'US'}),
  frozenset({'California', 'Pinot Noir'}),
  frozenset({'(30, 40]', '(85, 90]'}),
  frozenset({'(20, 30]', '(85, 90]'}),
  frozenset({'(90, 95]', 'California'}),
  frozenset({'Sonoma', 'US'}),
  frozenset({'(20, 30]', 'US'}),
  frozenset({'Columbia Valley', 'Washington'}),
  fro

# 由于支持度大于0.1的频繁项集很多，取置信度大于0.8的进行分析

In [14]:
def select_conf(L,suppData,thredhold=0.8):
    confList = []
    for itemSet in L:
            for record in itemSet:
                if(len(record)>1): 
                    for i in record: 
                        tar = suppData[frozenset([i])]
                        bset = record - frozenset([i])
                        aset = suppData[record]
                        ret = suppData.get(bset)
                        try:
                            if suppData.get(bset):
                                con = aset/ret
                                if con > thredhold:
                                    confList.append([frozenset([i]),record,con])
                        except TypeError as e:
                            print(e)

    return confList

# 提升度相关性度量

In [15]:
def liftCheck(conList,suppData):
    liftList_true=[]
    liftList_no=[]
    liftList_false=[]
    for item in confList:
        score = suppData.get(item[1])/(suppData.get(item[0])*suppData.get(item[1]-item[0]))
        if score>1:
            liftList_true.append(item)
        if score<1:
            liftList_false.append(item)
        if score ==1:
            liftList_no.append(item)
    return liftList_true,liftList_no,liftList_false

# 根据提升度度量查看正相关的关联规则

In [16]:
confList = select_conf(L,suppData)
yes,no,false = liftCheck(confList,suppData)
yes

[[frozenset({'California'}),
  frozenset({'California', 'Chardonnay'}),
  0.8161389172625126],
 [frozenset({'California'}), frozenset({'California', 'Napa'}), 1.0],
 [frozenset({'Washington'}),
  frozenset({'Columbia Valley', 'Washington'}),
  1.0],
 [frozenset({'Columbia Valley'}),
  frozenset({'Columbia Valley', 'Washington'}),
  0.9449526298595231],
 [frozenset({'California'}), frozenset({'California', 'Central Coast'}), 1.0],
 [frozenset({'California'}),
  frozenset({'Cabernet Sauvignon', 'California'}),
  0.826593137254902],
 [frozenset({'California'}),
  frozenset({'(80, 85]', 'California'}),
  0.8234507897934387],
 [frozenset({'California'}), frozenset({'California', 'Sonoma'}), 1.0],
 [frozenset({'California'}),
  frozenset({'California', 'Chardonnay', 'US'}),
  0.8161389172625126],
 [frozenset({'California'}),
  frozenset({'Cabernet Sauvignon', 'California', 'US'}),
  0.826593137254902],
 [frozenset({'California'}), frozenset({'California', 'Sonoma', 'US'}), 1.0],
 [frozenset(

# 全置信度和最大置信度计算

In [17]:
def all_confidence(conList,suppData):
    result = []
    for item in confList:
        score = suppData.get(item[1])/max(suppData.get(item[0]),suppData.get(item[1]-item[0]))
        result.append([score,item])
    return result

def max_confidence(conList,suppData):
    result = []
    for item in confList:
        score = suppData.get(item[1])/min(suppData.get(item[0]),suppData.get(item[1]-item[0]))
        result.append([score,item])
    return result   

# 全置信度大于0.8的关联关系展示

In [18]:
def selsct_confidence(conf,delta,conf_str):
    for item in conf:
        if item[0] >= delta:
            print(conf_str,str(item[0]),item[1])
all_conf = all_confidence(yes,suppData)
selsct_confidence(all_conf,0.8,'全置信度')

全置信度 0.9449526298595231 [frozenset({'Washington'}), frozenset({'Washington', 'Columbia Valley'}), 1.0]
全置信度 0.9449526298595231 [frozenset({'Columbia Valley'}), frozenset({'Washington', 'Columbia Valley'}), 0.9449526298595231]
全置信度 0.9449526298595231 [frozenset({'Columbia Valley'}), frozenset({'US', 'Columbia Valley', 'Washington'}), 0.9449526298595231]
全置信度 0.9449526298595231 [frozenset({'Washington'}), frozenset({'US', 'Columbia Valley', 'Washington'}), 1.0]


# 最大置信度大于0.9的关联关系展示

In [19]:
max_conf = max_confidence(yes,suppData)
selsct_confidence(max_conf,0.9,'最大置信度')

最大置信度 1.0 [frozenset({'US'}), frozenset({'Chardonnay', 'US'}), 1.0]
最大置信度 1.0 [frozenset({'US'}), frozenset({'US', '(0, 20]'}), 1.0]
最大置信度 1.0 [frozenset({'California'}), frozenset({'California', 'Napa'}), 1.0]
最大置信度 1.0 [frozenset({'US'}), frozenset({'California', 'US'}), 1.0]
最大置信度 1.0 [frozenset({'US'}), frozenset({'US', '(90, 95]'}), 1.0]
最大置信度 1.0 [frozenset({'US'}), frozenset({'US', 'Sonoma'}), 1.0]
最大置信度 1.0 [frozenset({'US'}), frozenset({'US', '(20, 30]'}), 1.0]
最大置信度 1.0 [frozenset({'Washington'}), frozenset({'Washington', 'Columbia Valley'}), 1.0]
最大置信度 1.0 [frozenset({'Columbia Valley'}), frozenset({'Washington', 'Columbia Valley'}), 0.9449526298595231]
最大置信度 1.0 [frozenset({'California'}), frozenset({'Central Coast', 'California'}), 1.0]
最大置信度 1.0 [frozenset({'US'}), frozenset({'US', 'Columbia Valley'}), 1.0]
最大置信度 1.0 [frozenset({'US'}), frozenset({'US', '(30, 40]'}), 1.0]
最大置信度 1.0 [frozenset({'US'}), frozenset({'US', '(85, 90]'}), 1.0]
最大置信度 1.0 [frozenset({'US'}), froze

# 计算不平衡比IR

In [21]:
def cal_ir(conList,suppData):
    result = []
    for item in confList:
        score = abs(suppData.get(item[0]) - suppData.get(item[1]-item[0])) / (suppData.get(item[0])+suppData.get(item[1]-item[0]) - suppData.get(item[1]))
        result.append([score,item[0],item[1]-item[0]])
    return result

In [24]:
IR = cal_ir(yes,suppData)
def bigger_ir(conf,delta1,delta2,conf_str):
    for item in conf:
        if item[0] > delta1 and  item[0] < delta2:
            print(conf_str,str(item[0]),item[1],item[2])
bigger_ir(IR,0.4,0.6,'ir')

ir 0.4981269590479346 frozenset({'US'}) frozenset({'(85, 90]'})


# 使用全置信度可以看到，产地Columbia Valley与城市Washington，国家US相关性较大
# 使用不平衡比分析可知，国家US与分数(85,90]平衡性较好