# Last FM hometask <br>
https://www.kaggle.com/ravichaubey1506/lastfm <br>
1. Выбрать данные по странам своей группы (совместно): <br>
    3530203_70101: Germany, Netherlands <br>
    3530203_70102: Belarus, Ukraine, Poland, Russian Federation<br>
    3530903_70301: Sweden, Finland, Norway, Denmark, Iceland<br>
    3530903_70302: Spain, Portugal, France, Italy, Belgium<br>
    
2. Попытаться найти полезные с точки зрения продвижения групп (или еще чего-нибудь) и нетривиальные правила, используя алгоритмы Apriori, FPGrowth, FPMax и всевозможные метрики. Хотя бы 5 правил.
3. Вывести эти правила в отдельных ячейках. 
4. Подумать, как можно было бы использовать полученные правила.

In [100]:
import pandas as pd
import seaborn as sns
from mlxtend.frequent_patterns import apriori, association_rules, fpgrowth, fpmax

pd.options.display.max_rows = 100

In [2]:
# Load the Last.fm listening records from the local CSV and preview the first rows.
data = pd.read_csv("lastfm.csv")
data.head()

Unnamed: 0,user,artist,sex,country
0,1,red hot chili peppers,f,Germany
1,1,the black dahlia murder,f,Germany
2,1,goldfrapp,f,Germany
3,1,dropkick murphys,f,Germany
4,1,le tigre,f,Germany


In [3]:
#pip install mlxtend

### Скорректируем данные

In [4]:
# Normalise country names: trim surrounding spaces, then upper-case only the
# first character (str.capitalize lower-cases the rest of the string).
data['country'] = data['country'].str.strip(' ').str.capitalize()

In [5]:
# Normalise artist names: trim surrounding spaces and lower-case them.
data['artist'] = data['artist'].str.strip(' ').str.lower()

### Возьмем только Germany, Netherlands

In [6]:
# Keep only this group's countries. The match is exact, so the separate
# "Netherlands Antilles" entry is intentionally not included.
target_countries = ['Germany', 'Netherlands']
data = data.loc[data['country'].isin(target_countries)]

In [7]:
# Number of rows per remaining country.
data['country'].value_counts()

Germany        24251
Netherlands     9673
Name: country, dtype: int64

### Проверим пустые и неизвестные значения

In [8]:
# Column dtypes and non-null counts, to spot missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 33924 entries, 0 to 289615
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   user     33924 non-null  int64 
 1   artist   33924 non-null  object
 2   sex      33924 non-null  object
 3   country  33924 non-null  object
dtypes: int64(1), object(3)
memory usage: 1.3+ MB


In [9]:
# Distinct values present in the sex column.
data['sex'].unique()

array(['f', 'm'], dtype=object)

In [10]:
# Distinct values present in the country column after filtering.
data['country'].unique()

array(['Germany', 'Netherlands'], dtype=object)

In [None]:
# Print every distinct artist name, one per line, to eyeball unusual values.
for artist_name in data['artist'].unique():
    print(artist_name)

In [12]:
# Count the rows whose artist is the '[unknown]' placeholder.
data[(data['artist'] == '[unknown]')].count()

user       95
artist     95
sex        95
country    95
dtype: int64

In [13]:
# Discard rows whose artist is the '[unknown]' placeholder.
data = data.loc[data['artist'].ne('[unknown]')]

In [14]:
# Re-check after filtering: no '[unknown]' rows should remain.
data[(data['artist'] == '[unknown]')].count()

user       0
artist     0
sex        0
country    0
dtype: int64

### Проверим данные по артистам

In [15]:
# Summary statistics of the number of rows per artist.
data['artist'].value_counts().describe()

count    1001.000000
mean       33.795205
std        35.136747
min         1.000000
25%        13.000000
50%        22.000000
75%        41.000000
max       296.000000
Name: artist, dtype: float64

In [16]:
### Кажется, что артисты с малым количеством слушателей не являются релевантными
### при поиске паттернов. Возьмем группы со значением больше 10

In [17]:
# Keep only artists that occur in more than 10 rows. The per-row group count
# is used as a boolean mask, so row order and index are preserved.
rows_per_artist = data.groupby('artist')['artist'].transform('count')
data = data[rows_per_artist > 10]

In [18]:
# Rows-per-artist statistics after dropping the rare artists.
data['artist'].value_counts().describe()

count    826.000000
mean      39.480630
std       36.195868
min       11.000000
25%       17.000000
50%       27.000000
75%       47.000000
max      296.000000
Name: artist, dtype: float64

In [19]:
### Проверим данные по пользователям

In [20]:
# Summary statistics of the number of rows (artists) per user.
data['user'].value_counts().describe()

count    1722.000000
mean       18.937863
std        10.127036
min         1.000000
25%        11.000000
50%        19.000000
75%        26.000000
max        53.000000
Name: user, dtype: float64

In [21]:
### Кажется, что пользователи с малым количеством групп не являются релевантными
### при поиске паттернов. Возьмем пользователей со значением не меньше 10

In [22]:
# Keep only users that have at least 10 rows. The per-row group count is used
# as a boolean mask, so row order and index are preserved.
rows_per_user = data.groupby('user')['user'].transform('count')
data = data[rows_per_user >= 10]

In [23]:
# Rows-per-user statistics after dropping the low-activity users.
data['user'].value_counts().describe()

count    1350.000000
mean       22.606667
std         8.172716
min        10.000000
25%        16.000000
50%        22.000000
75%        29.000000
max        53.000000
Name: user, dtype: float64

In [24]:
# Preview the filtered data.
data.head()

Unnamed: 0,user,artist,sex,country
0,1,red hot chili peppers,f,Germany
1,1,the black dahlia murder,f,Germany
2,1,goldfrapp,f,Germany
3,1,dropkick murphys,f,Germany
4,1,le tigre,f,Germany


# Let's find rules

## By user in two countries

In [25]:
# One row per (user, country, sex): all of the user's artists joined into a
# single comma-separated, lower-cased string.
artists_per_user = data.groupby(['user', 'country', 'sex'])['artist'].agg(','.join)
data_by_user = artists_per_user.str.lower().str.strip().reset_index()
data_by_user

Unnamed: 0,user,country,sex,artist
0,1,Germany,f,"red hot chili peppers,the black dahlia murder,..."
1,31,Netherlands,f,"jamiroquai,tori amos,jazzanova,radiohead,st. g..."
2,33,Germany,f,"death cab for cutie,tegan and sara,kimya dawso..."
3,51,Germany,f,"kate nash,arctic monkeys,lykke li,adele,tegan ..."
4,62,Germany,f,"beatsteaks,clueso,the kooks,die ärzte,plain wh..."
...,...,...,...,...
1345,19642,Germany,m,"fear factory,hypocrisy,the velvet underground,..."
1346,19662,Germany,m,"blind guardian,in extremo,subway to sally,finn..."
1347,19681,Netherlands,m,"the strokes,muse,led zeppelin,snow patrol,radi..."
1348,19685,Netherlands,m,"philip glass,rammstein,pink floyd,moby,blank &..."


In [26]:
# One-hot encode the comma-separated artist strings and append the indicator
# columns to the per-user frame.
# NOTE(review): splitting on ',' would also split an artist name that itself
# contains a comma - confirm none survive the filtering above.
artist_flags = data_by_user['artist'].str.get_dummies(sep=',')
dummy_data_by_user = pd.concat([data_by_user, artist_flags], axis=1)
dummy_data_by_user

Unnamed: 0,user,country,sex,artist,...and you will know us by the trail of dead,2pac,3 doors down,30 seconds to mars,36 crazyfists,44,...,venetian snares,vnv nation,volbeat,weezer,within temptation,wolfgang amadeus mozart,yann tiersen,yeah yeah yeahs,yellowcard,zero 7
0,1,Germany,f,"red hot chili peppers,the black dahlia murder,...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,31,Netherlands,f,"jamiroquai,tori amos,jazzanova,radiohead,st. g...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,33,Germany,f,"death cab for cutie,tegan and sara,kimya dawso...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,51,Germany,f,"kate nash,arctic monkeys,lykke li,adele,tegan ...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,62,Germany,f,"beatsteaks,clueso,the kooks,die ärzte,plain wh...",0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1345,19642,Germany,m,"fear factory,hypocrisy,the velvet underground,...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1346,19662,Germany,m,"blind guardian,in extremo,subway to sally,finn...",0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1347,19681,Netherlands,m,"the strokes,muse,led zeppelin,snow patrol,radi...",0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1348,19685,Netherlands,m,"philip glass,rammstein,pink floyd,moby,blank &...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
# Mine frequent artist sets with Apriori. Only the one-hot columns are passed
# in (the descriptive columns are dropped first); minimum support is 1.5%.
meta_columns = ['user', 'country', 'sex', 'artist']
artist_basket = dummy_data_by_user.drop(columns=meta_columns)
frequent_itemsets_user = apriori(artist_basket, min_support=0.015, use_colnames=True)

frequent_itemsets_user

Unnamed: 0,support,itemsets
0,0.017778,(...and you will know us by the trail of dead)
1,0.020000,(2pac)
2,0.070370,(3 doors down)
3,0.051111,(30 seconds to mars)
4,0.024444,(36 crazyfists)
...,...,...
1558,0.015556,"(placebo, rammstein, system of a down)"
1559,0.017037,"(red hot chili peppers, rammstein, system of a..."
1560,0.019259,"(rammstein, slipknot, system of a down)"
1561,0.019259,"(rammstein, the offspring, system of a down)"


In [28]:
# Derive association rules from the Apriori itemsets, keeping the rules whose
# confidence is at least 0.5.
rules_by_user = association_rules(
    frequent_itemsets_user,
    metric="confidence",
    min_threshold=0.5,
)
rules_by_user

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(a perfect circle),(tool),0.037037,0.058519,0.020000,0.540000,9.227848,0.017833,2.046699
1,(equilibrium),(amon amarth),0.033333,0.045185,0.017037,0.511111,11.311475,0.015531,1.953030
2,(anti-flag),(rise against),0.030370,0.085185,0.021481,0.707317,8.303287,0.018894,3.125617
3,(arch enemy),(in flames),0.028889,0.087407,0.018519,0.641026,7.333768,0.015993,2.542222
4,(maxïmo park),(arctic monkeys),0.065926,0.124444,0.033333,0.505618,4.063002,0.025129,1.771010
...,...,...,...,...,...,...,...,...,...
118,"(placebo, system of a down)",(rammstein),0.028889,0.152593,0.015556,0.538462,3.528753,0.011147,1.836049
119,"(slipknot, rammstein)",(system of a down),0.029630,0.148889,0.019259,0.650000,4.365672,0.014848,2.431746
120,"(slipknot, system of a down)",(rammstein),0.030370,0.152593,0.019259,0.634146,4.155813,0.014625,2.316247
121,"(rammstein, the offspring)",(system of a down),0.025926,0.148889,0.019259,0.742857,4.989339,0.015399,3.309877


In [29]:
# Metric cheat sheet (A = antecedents, C = consequents):
# support    - share of users whose artists include both A and C
# confidence - support(A and C) / support(A): how often the rule holds among
#              users who match A
# lift       - confidence / support(C); 1 means A and C look independent
# leverage   - support(A and C) - support(A) * support(C); 0 means independent
# conviction - (1 - support(C)) / (1 - confidence); higher means the rule is
#              wrong less often than chance would predict
#
# Keep the rules with a reasonably common antecedent and a strong lift.
strong_mask = (
    rules_by_user['antecedent support'].gt(0.03)
    & rules_by_user['lift'].gt(7)
    & rules_by_user['support'].gt(0.015)
)
rules_by_user[strong_mask]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(a perfect circle),(tool),0.037037,0.058519,0.02,0.54,9.227848,0.017833,2.046699
1,(equilibrium),(amon amarth),0.033333,0.045185,0.017037,0.511111,11.311475,0.015531,1.95303
2,(anti-flag),(rise against),0.03037,0.085185,0.021481,0.707317,8.303287,0.018894,3.125617
10,(britney spears),(rihanna),0.040741,0.066667,0.020741,0.509091,7.636364,0.018025,1.901235
12,(christina aguilera),(rihanna),0.04,0.066667,0.020741,0.518519,7.777778,0.018074,1.938462
20,(equilibrium),(ensiferum),0.033333,0.042222,0.019259,0.577778,13.684211,0.017852,2.268421
21,(finntroll),(ensiferum),0.035556,0.042222,0.017778,0.5,11.842105,0.016277,1.915556
25,(in extremo),(subway to sally),0.062963,0.065926,0.032593,0.517647,7.85195,0.028442,1.936495
31,(the wombats),(maxïmo park),0.031852,0.065926,0.017037,0.534884,8.113405,0.014937,2.008259
35,(schandmaul),(subway to sally),0.054074,0.065926,0.02963,0.547945,8.311528,0.026065,2.066285


Видим, что если пользователю нравится металкор, то ему стоит рекомендовать и другие металкор-группы, а если поп — то других поп-исполнителей.

In [30]:
# Same mining step with FP-Growth: one-hot artist columns only, 1.5% minimum
# support.
meta_columns = ['user', 'country', 'sex', 'artist']
artist_basket = dummy_data_by_user.drop(columns=meta_columns)
frequent_itemsets_user2 = fpgrowth(artist_basket, min_support=0.015, use_colnames=True)
frequent_itemsets_user2

Unnamed: 0,support,itemsets
0,0.188889,(red hot chili peppers)
1,0.133333,(the killers)
2,0.120000,(jack johnson)
3,0.059259,(the rolling stones)
4,0.054074,(schandmaul)
...,...,...
1558,0.016296,"(subway to sally, ensiferum)"
1559,0.015556,"(in extremo, ensiferum)"
1560,0.015556,"(disturbed, godsmack)"
1561,0.015556,"(godsmack, system of a down)"


In [31]:
# Derive association rules from the FP-Growth itemsets, keeping the rules
# whose confidence is at least 0.5.
rules_by_user2 = association_rules(
    frequent_itemsets_user2,
    metric="confidence",
    min_threshold=0.5,
)
rules_by_user2

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(schandmaul),(subway to sally),0.054074,0.065926,0.029630,0.547945,8.311528,0.026065,2.066285
1,"(schandmaul, subway to sally)",(in extremo),0.029630,0.062963,0.017778,0.600000,9.529412,0.015912,2.342593
2,"(schandmaul, in extremo)",(subway to sally),0.026667,0.065926,0.017778,0.666667,10.112360,0.016020,2.802222
3,"(subway to sally, in extremo)",(schandmaul),0.032593,0.054074,0.017778,0.545455,10.087173,0.016015,2.081037
4,(judas priest),(iron maiden),0.028148,0.056296,0.015556,0.552632,9.816482,0.013971,2.109455
...,...,...,...,...,...,...,...,...,...
118,"(the killers, editors)",(bloc party),0.025926,0.105926,0.016296,0.628571,5.934066,0.013550,2.407123
119,(black sabbath),(metallica),0.032593,0.134074,0.018519,0.568182,4.237820,0.014149,2.005302
120,(godsmack),(disturbed),0.025926,0.074815,0.015556,0.600000,8.019802,0.013616,2.312963
121,(godsmack),(system of a down),0.025926,0.148889,0.015556,0.600000,4.029851,0.011695,2.127778


In [32]:
# Keep the FP-Growth rules with a reasonably common antecedent and a strong
# lift (same thresholds as for the Apriori rules).
strong_mask2 = (
    rules_by_user2['antecedent support'].gt(0.03)
    & rules_by_user2['lift'].gt(7)
    & rules_by_user2['support'].gt(0.015)
)
rules_by_user2[strong_mask2]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(schandmaul),(subway to sally),0.054074,0.065926,0.02963,0.547945,8.311528,0.026065,2.066285
3,"(subway to sally, in extremo)",(schandmaul),0.032593,0.054074,0.017778,0.545455,10.087173,0.016015,2.081037
40,(the wombats),(maxïmo park),0.031852,0.065926,0.017037,0.534884,8.113405,0.014937,2.008259
56,"(disturbed, koяn)",(slipknot),0.032593,0.065926,0.016296,0.5,7.58427,0.014148,1.868148
60,"(slipknot, system of a down)",(disturbed),0.03037,0.074815,0.016296,0.536585,7.172181,0.014024,1.996452
62,"(koяn, system of a down)",(slipknot),0.038519,0.065926,0.019259,0.5,7.58427,0.01672,1.868148
63,"(slipknot, system of a down)",(koяn),0.03037,0.086667,0.019259,0.634146,7.317073,0.016627,2.496444
73,(christina aguilera),(rihanna),0.04,0.066667,0.020741,0.518519,7.777778,0.018074,1.938462
98,"(disturbed, koяn)",(papa roach),0.032593,0.058519,0.017037,0.522727,8.932681,0.01513,1.972628
102,(anti-flag),(rise against),0.03037,0.085185,0.021481,0.707317,8.303287,0.018894,3.125617


Нет каких-либо нетривиальных пар.

In [36]:
# Add two 0/1 indicator columns describing the user's sex, so that sex can
# appear in itemsets next to the artist columns.
dummy_data_by_user['male'] = (dummy_data_by_user['sex'] == 'm').astype(int)
dummy_data_by_user['female'] = (dummy_data_by_user['sex'] == 'f').astype(int)
dummy_data_by_user

Unnamed: 0,user,country,sex,artist,...and you will know us by the trail of dead,2pac,3 doors down,30 seconds to mars,36 crazyfists,44,...,volbeat,weezer,within temptation,wolfgang amadeus mozart,yann tiersen,yeah yeah yeahs,yellowcard,zero 7,male,female
0,1,Germany,f,"red hot chili peppers,the black dahlia murder,...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,31,Netherlands,f,"jamiroquai,tori amos,jazzanova,radiohead,st. g...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,33,Germany,f,"death cab for cutie,tegan and sara,kimya dawso...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,51,Germany,f,"kate nash,arctic monkeys,lykke li,adele,tegan ...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,62,Germany,f,"beatsteaks,clueso,the kooks,die ärzte,plain wh...",0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1345,19642,Germany,m,"fear factory,hypocrisy,the velvet underground,...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1346,19662,Germany,m,"blind guardian,in extremo,subway to sally,finn...",0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
1347,19681,Netherlands,m,"the strokes,muse,led zeppelin,snow patrol,radi...",0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
1348,19685,Netherlands,m,"philip glass,rammstein,pink floyd,moby,blank &...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [112]:
# Apriori over the artist columns plus the male/female indicators, with a
# lower minimum support of 1.1%.
meta_columns = ['user', 'country', 'sex', 'artist']
basket_with_sex = dummy_data_by_user.drop(columns=meta_columns)
frequent_itemsets_user4 = apriori(basket_with_sex, min_support=0.011, use_colnames=True)

frequent_itemsets_user4

Unnamed: 0,support,itemsets
0,0.017778,(...and you will know us by the trail of dead)
1,0.020000,(2pac)
2,0.070370,(3 doors down)
3,0.051111,(30 seconds to mars)
4,0.024444,(36 crazyfists)
...,...,...
5696,0.014815,"(male, rammstein, slipknot, system of a down)"
5697,0.011111,"(male, sum 41, rammstein, system of a down)"
5698,0.015556,"(male, rammstein, the offspring, system of a d..."
5699,0.014074,"(male, red hot chili peppers, the offspring, s..."


In [115]:
# Rows 935 and 937 of the result show that keane listeners of either sex can
# be recommended coldplay.
rules_by_user = association_rules(
    frequent_itemsets_user4,
    metric="confidence",
    min_threshold=0.589,
)
# Select the rules whose consequent is exactly {coldplay}.
targets_coldplay = rules_by_user['consequents'] == {'coldplay'}
rules_by_user[targets_coldplay]
       

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
108,(keane),(coldplay),0.048148,0.208148,0.03037,0.630769,3.030386,0.020348,2.144599
586,"(muse, amy winehouse)",(coldplay),0.018519,0.208148,0.011111,0.6,2.882562,0.007257,1.97963
924,"(jason mraz, jack johnson)",(coldplay),0.018519,0.208148,0.012593,0.68,3.266904,0.008738,2.474537
927,"(male, jason mraz)",(coldplay),0.020741,0.208148,0.012593,0.607143,2.916878,0.008275,2.015623
933,"(keane, muse)",(coldplay),0.017037,0.208148,0.012593,0.73913,3.550983,0.009046,3.035432
934,"(keane, the kooks)",(coldplay),0.016296,0.208148,0.012593,0.772727,3.712391,0.009201,3.484148
935,"(male, keane)",(coldplay),0.03037,0.208148,0.019259,0.634146,3.046611,0.012938,2.164395
937,"(keane, female)",(coldplay),0.017778,0.208148,0.011111,0.625,3.002669,0.007411,2.111605
947,"(red hot chili peppers, moby)",(coldplay),0.020741,0.208148,0.012593,0.607143,2.916878,0.008275,2.015623
961,"(snow patrol, radiohead)",(coldplay),0.020741,0.208148,0.012593,0.607143,2.916878,0.008275,2.015623
