In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [2]:
df = pd.read_csv("C:/Users/Equan/Downloads/DBLP2.csv", encoding='latin-1')

In [3]:
df.head()

Unnamed: 0,id,title,authors,venue,year
0,journals/sigmod/Mackay99,Semantic Integration of Environmental Models f...,D. Scott Mackay,SIGMOD Record,1999
1,conf/vldb/PoosalaI96,Estimation of Query-Result Distribution and it...,"Viswanath Poosala, Yannis E. Ioannidis",VLDB,1996
2,conf/vldb/PalpanasSCP02,Incremental Maintenance for Non-Distributive A...,"Themistoklis Palpanas, Richard Sidle, Hamid Pi...",VLDB,2002
3,conf/vldb/GardarinGT96,Cost-based Selection of Path Expression Proces...,"Zhao-Hui Tang, Georges Gardarin, Jean-Robert G...",VLDB,1996
4,conf/vldb/HoelS95,Benchmarking Spatial Join Operations with Spat...,"Erik G. Hoel, Hanan Samet",VLDB,1995


In [4]:
df['authors'].value_counts()

Richard T. Snodgrass                           25
?                                              23
Michael J. Franklin                            12
Karl Aberer                                    11
Marianne Winslett                              11
                                               ..
Leonard J. Seligman, Michael J. Carey           1
Herb Edelstein                                  1
Arthur J. Bernstein, Narayanan Krishnakumar     1
Michael J. Franklin, Mehmet Altinel             1
Bongki Moon, Quanzhong Li                       1
Name: authors, Length: 2316, dtype: int64

## Data Cleaning

In [5]:
# Drop rows with value = ?
df = df[df.authors != '?']

In [6]:
# Confirm '?' is no more
df['authors'].value_counts()

Richard T. Snodgrass                           25
Michael J. Franklin                            12
Karl Aberer                                    11
Ling Liu                                       11
Marianne Winslett                              11
                                               ..
Leonard J. Seligman, Michael J. Carey           1
Herb Edelstein                                  1
Arthur J. Bernstein, Narayanan Krishnakumar     1
Michael J. Franklin, Mehmet Altinel             1
Bongki Moon, Quanzhong Li                       1
Name: authors, Length: 2315, dtype: int64

In [7]:
# Remove white spaces
df['authors'] = df['authors'].str.strip()

In [8]:
df.head()

Unnamed: 0,id,title,authors,venue,year
0,journals/sigmod/Mackay99,Semantic Integration of Environmental Models f...,D. Scott Mackay,SIGMOD Record,1999
1,conf/vldb/PoosalaI96,Estimation of Query-Result Distribution and it...,"Viswanath Poosala, Yannis E. Ioannidis",VLDB,1996
2,conf/vldb/PalpanasSCP02,Incremental Maintenance for Non-Distributive A...,"Themistoklis Palpanas, Richard Sidle, Hamid Pi...",VLDB,2002
3,conf/vldb/GardarinGT96,Cost-based Selection of Path Expression Proces...,"Zhao-Hui Tang, Georges Gardarin, Jean-Robert G...",VLDB,1996
4,conf/vldb/HoelS95,Benchmarking Spatial Join Operations with Spat...,"Erik G. Hoel, Hanan Samet",VLDB,1995


In [9]:
# Change the authors column into a list
authors_list = list(df['authors'].apply(lambda x:x.split(',')))
authors_list

[['D. Scott Mackay'],
 ['Viswanath Poosala', ' Yannis E. Ioannidis'],
 ['Themistoklis Palpanas',
  ' Richard Sidle',
  ' Hamid Pirahesh',
  ' Roberta Cochrane'],
 ['Zhao-Hui Tang', ' Georges Gardarin', ' Jean-Robert Gruser'],
 ['Erik G. Hoel', ' Hanan Samet'],
 ['Daniel A. Keim'],
 ['Aris M. Ouksel'],
 ['Praveen Seshadri'],
 ['Nandit Soparkar', ' Krithi Ramamritham'],
 ['Phil Janus', " Albert D'Andrea"],
 ['Alfons Kemper', ' Donald Kossmann'],
 ['Nesime Tatbul',
  ' Daniel J. Abadi',
  ' C. Erwin',
  ' Anurag Maskey',
  ' Mitch Cherniack',
  ' Alex Rasin',
  ' Christian Convey',
  ' A. Singer',
  ' Eduardo F. Galvez',
  ' R. Yan',
  ' Ugur Çetintemel',
  ' Ying Xing',
  ' Stanley B. Zdonik',
  ' Michael Stonebraker',
  ' Donald Carney',
  ' M. Hatoun'],
 ['Rajendran M. Sivasankaran',
  ' Bhaskar Purimetla',
  ' John A. Stankovic',
  ' Krithi Ramamritham',
  ' Donald F. Towsley'],
 ['Surojit Chatterjee', ' Gholamhosein Sheikholeslami', ' Aidong Zhang'],
 ['Hector Garcia-Molina', ' Kevin

## Fit & Transform the Authors List

In [10]:
# Fit & Transform the Authors list
te = TransactionEncoder()
te_data = te.fit(authors_list).transform(authors_list)

In [11]:
authors_df = pd.DataFrame(te_data, columns=te.columns_)
authors_df

Unnamed: 0,A. Amorim,A. Anderson,A. Dholakia,A. Kazarov,A. Khivesera,A. Mahboob,A. Pantazi,A. Prasad Sistla,A. Ribeiro,A. Singer,...,Zhao-Hui Tang,Zhimin Chen,Zhiyuan Chen,Zografoula Vagena,Zohreh Nazeri,Zoran Despotovic,eva Kühn,Åsa Hagström,Özgür Ulusoy,Øystein Torbjørnsen
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2588,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2589,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2590,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2591,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


## Generate Frequent Itemsets

In [12]:
frequent_itemsets = apriori(authors_df, min_support=0.0015, use_colnames=True)
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.001928,( A. Prasad Sistla)
1,0.001928,( Abdelsalam Helal)
2,0.006942,( Abraham Silberschatz)
3,0.003085,( Ahmed K. Elmagarmid)
4,0.001543,( Aidong Zhang)
...,...,...
465,0.001543,"( Divyakant Agrawal, K. Selçuk Candan, Wen-S..."
466,0.001543,"( Divyakant Agrawal, Wang-Pin Hsiung, Wen-Sy..."
467,0.001543,"( Ugur Çetintemel, Stanley B. Zdonik, Donald..."
468,0.001543,"( Kaushik Dutta, Krithi Ramamritham, Helen M..."


In [13]:
# Sort the frequent itemsets by support to see the Authors with the highest support values
print(frequent_itemsets.sort_values(by='support',ascending=False))

      support                                           itemsets
89   0.012727                            ( Hector Garcia-Molina)
369  0.011570                             (Richard T. Snodgrass)
60   0.010413                               ( Divesh Srivastava)
155  0.009256                             ( Michael J. Franklin)
84   0.009256                                  ( H. V. Jagadish)
..        ...                                                ...
141  0.001543                                ( Marian H. Nodine)
138  0.001543                                     ( Lucian Popa)
134  0.001543                                ( Leonidas Fegaras)
133  0.001543                             ( Leonard J. Seligman)
469  0.001543  ( Mitch Cherniack,  Ugur Çetintemel,  Stanley ...

[470 rows x 2 columns]


## Association Rule Mining 

In [14]:
# Generate rules using confidence as metric
rules = association_rules(frequent_itemsets, metric='confidence', min_threshold=0.01)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,( Rajeev Rastogi),( Abraham Silberschatz),0.008870,0.006942,0.002314,0.260870,37.579710,0.002252,1.343549
1,( Abraham Silberschatz),( Rajeev Rastogi),0.006942,0.008870,0.002314,0.333333,37.579710,0.002252,1.486695
2,( Abraham Silberschatz),( S. Seshadri),0.006942,0.004628,0.001543,0.222222,48.018519,0.001510,1.279764
3,( S. Seshadri),( Abraham Silberschatz),0.004628,0.006942,0.001543,0.333333,48.018519,0.001510,1.489587
4,( Alfons Kemper),( Donald Kossmann),0.005013,0.006556,0.002314,0.461538,70.398190,0.002281,1.844967
...,...,...,...,...,...,...,...,...,...
169,"( Mitch Cherniack, Stanley B. Zdonik)",( Ugur Çetintemel),0.001543,0.001928,0.001543,1.000000,518.600000,0.001540,inf
170,"( Ugur Çetintemel, Stanley B. Zdonik)",( Mitch Cherniack),0.001928,0.002314,0.001543,0.800000,345.733333,0.001538,4.988430
171,( Mitch Cherniack),"( Ugur Çetintemel, Stanley B. Zdonik)",0.002314,0.001928,0.001543,0.666667,345.733333,0.001538,2.994215
172,( Ugur Çetintemel),"( Mitch Cherniack, Stanley B. Zdonik)",0.001928,0.001543,0.001543,0.800000,518.600000,0.001540,4.992287


In [15]:
# Generate rules using lift as metric
rules2 = association_rules(frequent_itemsets, metric='lift', min_threshold=50)
rules2

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,( Alfons Kemper),( Donald Kossmann),0.005013,0.006556,0.002314,0.461538,70.398190,0.002281,1.844967
1,( Donald Kossmann),( Alfons Kemper),0.006556,0.005013,0.002314,0.352941,70.398190,0.002281,1.537706
2,( Daniela Florescu),( Alon Y. Levy),0.006556,0.005013,0.001928,0.294118,58.665158,0.001895,1.409564
3,( Alon Y. Levy),( Daniela Florescu),0.005013,0.006556,0.001928,0.384615,58.665158,0.001895,1.614346
4,( Divyakant Agrawal),( Amr El Abbadi),0.005399,0.002700,0.001543,0.285714,105.836735,0.001528,1.396221
...,...,...,...,...,...,...,...,...,...
147,"( Mitch Cherniack, Stanley B. Zdonik)",( Ugur Çetintemel),0.001543,0.001928,0.001543,1.000000,518.600000,0.001540,inf
148,"( Ugur Çetintemel, Stanley B. Zdonik)",( Mitch Cherniack),0.001928,0.002314,0.001543,0.800000,345.733333,0.001538,4.988430
149,( Mitch Cherniack),"( Ugur Çetintemel, Stanley B. Zdonik)",0.002314,0.001928,0.001543,0.666667,345.733333,0.001538,2.994215
150,( Ugur Çetintemel),"( Mitch Cherniack, Stanley B. Zdonik)",0.001928,0.001543,0.001543,0.800000,518.600000,0.001540,4.992287


In [17]:
result_df = rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']]
result_df.sort_values(by='support', ascending=False)
result_df.head()

Unnamed: 0,antecedents,consequents,support,confidence,lift
0,( Rajeev Rastogi),( Abraham Silberschatz),0.002314,0.26087,37.57971
1,( Abraham Silberschatz),( Rajeev Rastogi),0.002314,0.333333,37.57971
2,( Abraham Silberschatz),( S. Seshadri),0.001543,0.222222,48.018519
3,( S. Seshadri),( Abraham Silberschatz),0.001543,0.333333,48.018519
4,( Alfons Kemper),( Donald Kossmann),0.002314,0.461538,70.39819


In [19]:
by_support = result_df.sort_values(by='support',ascending=False)
by_support.head()

Unnamed: 0,antecedents,consequents,support,confidence,lift
15,( Beng Chin Ooi),( Kian-Lee Tan),0.003471,0.692308,112.197115
14,( Kian-Lee Tan),( Beng Chin Ooi),0.003471,0.5625,112.197115
35,( Divesh Srivastava),( H. V. Jagadish),0.003085,0.296296,32.012346
34,( H. V. Jagadish),( Divesh Srivastava),0.003085,0.333333,32.012346
89,( Minos N. Garofalakis),( Rajeev Rastogi),0.0027,0.35,39.458696
