# Last FM hometask <br>
https://www.kaggle.com/ravichaubey1506/lastfm <br>
1. Выбрать данные по странам своей группы (совместно): <br>
    3530203_70101: Germany, Netherlands <br>
    3530203_70102: Belarus, Ukraine, Poland, Russian Federation<br>
    3530903_70301: Sweden, Finland, Norway, Denmark, Iceland<br>
    3530903_70302: Spain, Portugal, France, Italy, Belgium<br>
    
2. Попытаться найти полезные с точки зрения продвижения групп (или еще чего-нибудь) и нетривиальные правила, используя алгоритмы Apriori, FPGrowth, FPMax и всевозможные метрики. Хотя бы 5 правил.
3. Вывести эти правила в отдельных ячейках. 
4. Подумать, как можно было бы использовать полученные правила.

In [3]:
import pandas as pd
import seaborn as sns
from mlxtend.frequent_patterns import apriori, association_rules, fpgrowth, fpmax

# Let pandas render up to 100 rows before it truncates a displayed object.
pd.set_option("display.max_rows", 100)

In [4]:
# Read the Last.fm export from the working directory and preview its first
# five records.
data = pd.read_csv(filepath_or_buffer="lastfm.csv")
data.head(n=5)

Unnamed: 0,user,artist,sex,country
0,1,red hot chili peppers,f,Germany
1,1,the black dahlia murder,f,Germany
2,1,goldfrapp,f,Germany
3,1,dropkick murphys,f,Germany
4,1,le tigre,f,Germany


In [3]:
# %pip install mlxtend  # uncomment and run once if mlxtend is not installed

### Скорректируем данные

In [5]:
# Trim surrounding spaces from the country names and normalise their casing.
# str.capitalize() upper-cases the first character and lower-cases the rest.
data['country'] = data['country'].str.strip(' ').str.capitalize()

In [6]:
# Trim surrounding spaces from the artist names and lower-case them so that
# spelling variants of the same artist compare equal.
data['artist'] = data['artist'].str.strip(' ').str.lower()

### Возьмем только Germany, Netherlands

In [7]:
# Restrict the records to Germany and the Netherlands. Membership is an exact
# match, so "Netherlands Antilles" is intentionally not included.
data = data.loc[data['country'].isin(['Germany', 'Netherlands'])]

In [8]:
# Number of records that remain for each selected country.
data.loc[:, 'country'].value_counts()

Germany        24251
Netherlands     9673
Name: country, dtype: int64

### Проверим пустые и неизвестные значения

In [9]:
# Print the frame summary (index range, per-column non-null counts, dtypes)
# to check for missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 33924 entries, 0 to 289615
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   user     33924 non-null  int64 
 1   artist   33924 non-null  object
 2   sex      33924 non-null  object
 3   country  33924 non-null  object
dtypes: int64(1), object(3)
memory usage: 1.3+ MB


In [10]:
# Distinct values present in the 'sex' column.
pd.unique(data['sex'])

array(['f', 'm'], dtype=object)

In [11]:
# Distinct values present in the 'country' column after filtering.
pd.unique(data['country'])

array(['Germany', 'Netherlands'], dtype=object)

In [12]:
# Count the rows whose artist is the '[unknown]' placeholder.
data.loc[data['artist'].eq('[unknown]')].count()

user       95
artist     95
sex        95
country    95
dtype: int64

In [13]:
# Discard the rows whose artist is the '[unknown]' placeholder.
data = data.loc[data['artist'].ne('[unknown]')]

In [15]:
# Re-count the '[unknown]' rows; every column should now report zero.
data.loc[data['artist'].eq('[unknown]')].count()

user       0
artist     0
sex        0
country    0
dtype: int64

### Проверим данные по артистам

In [14]:
# Summary statistics of the number of rows recorded for each artist.
artist_counts = data['artist'].value_counts()
artist_counts.describe()

count    1001.000000
mean       33.795205
std        35.136747
min         1.000000
25%        13.000000
50%        22.000000
75%        41.000000
max       296.000000
Name: artist, dtype: float64

In [15]:
### Кажется, что артисты с малым количеством слушателей не являются релевантными
### при поиске паттернов. Возьмем группы с числом слушателей больше 10

In [18]:
# Keep only artists that occur in more than 10 rows. The per-row group count
# is computed once and used as a boolean mask, which preserves row order.
rows_per_artist = data.groupby('artist')['artist'].transform('count')
data = data[rows_per_artist > 10.]

In [19]:
# Summary statistics of rows per artist after the rare artists were removed.
artist_counts = data['artist'].value_counts()
artist_counts.describe()

count    826.000000
mean      39.480630
std       36.195868
min       11.000000
25%       17.000000
50%       27.000000
75%       47.000000
max      296.000000
Name: artist, dtype: float64

In [20]:
### Проверим данные по пользователям

In [20]:
# Summary statistics of the number of rows recorded for each user.
data.loc[:, 'user'].value_counts().describe()

count    1722.000000
mean       18.937863
std        10.127036
min         1.000000
25%        11.000000
50%        19.000000
75%        26.000000
max        53.000000
Name: user, dtype: float64

In [22]:
### Кажется, что пользователи, слушающие всего 1 или 2 группы, не являются релевантными
### при поиске паттернов. Возьмем пользователей, у которых не меньше 3 групп

In [21]:
# Keep only users that have at least 3 rows. The per-row group count is
# computed once and used as a boolean mask, which preserves row order.
rows_per_user = data.groupby('user')['user'].transform('count')
data = data[rows_per_user >= 3.]

In [22]:
# Summary statistics of rows per user after the sparse users were removed.
data.loc[:, 'user'].value_counts().describe()

count    1671.000000
mean       19.470975
std         9.802002
min         3.000000
25%        11.000000
50%        19.000000
75%        27.000000
max        53.000000
Name: user, dtype: float64

In [23]:
# Preview the first five rows of the cleaned frame.
data.head(n=5)

Unnamed: 0,user,artist,sex,country
0,1,red hot chili peppers,f,Germany
1,1,the black dahlia murder,f,Germany
2,1,goldfrapp,f,Germany
3,1,dropkick murphys,f,Germany
4,1,le tigre,f,Germany


## Поиск правил

In [24]:
def _join_artists(names):
    """Return a group's artist names as one lower-cased, comma-separated string."""
    return ','.join(names).lower().strip()


# Collapse the records to one row per (user, country, sex); the 'artist' field
# becomes the comma-joined list of that group's artists.
# NOTE(review): an artist name that itself contains a comma cannot be told
# apart from the separator in this representation.
data_by_user = (
    data.groupby(['user', 'country', 'sex'])['artist']
    .apply(_join_artists)
    .reset_index()
)
data_by_user

Unnamed: 0,user,country,sex,artist
0,1,Germany,f,"red hot chili peppers,the black dahlia murder,..."
1,31,Netherlands,f,"jamiroquai,tori amos,jazzanova,radiohead,st. g..."
2,33,Germany,f,"death cab for cutie,tegan and sara,kimya dawso..."
3,42,Germany,f,"soundtrack,groove coverage,avril lavigne,the r..."
4,51,Germany,f,"kate nash,arctic monkeys,lykke li,adele,tegan ..."
...,...,...,...,...
1666,19662,Germany,m,"blind guardian,in extremo,subway to sally,finn..."
1667,19672,Germany,m,"system of a down,björk,portishead,the prodigy,..."
1668,19681,Netherlands,m,"the strokes,muse,led zeppelin,snow patrol,radi..."
1669,19685,Netherlands,m,"philip glass,rammstein,pink floyd,moby,blank &..."


In [25]:
# One-hot encode the comma-separated artist strings (one 0/1 column per
# artist) and attach the indicator columns to the per-user frame.
dummy_data_by_user = pd.concat(
    [data_by_user, data_by_user['artist'].str.get_dummies(sep=',')],
    axis='columns',
)
dummy_data_by_user

Unnamed: 0,user,country,sex,artist,...and you will know us by the trail of dead,2pac,3 doors down,30 seconds to mars,36 crazyfists,44,...,venetian snares,vnv nation,volbeat,weezer,within temptation,wolfgang amadeus mozart,yann tiersen,yeah yeah yeahs,yellowcard,zero 7
0,1,Germany,f,"red hot chili peppers,the black dahlia murder,...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,31,Netherlands,f,"jamiroquai,tori amos,jazzanova,radiohead,st. g...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,33,Germany,f,"death cab for cutie,tegan and sara,kimya dawso...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,42,Germany,f,"soundtrack,groove coverage,avril lavigne,the r...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,51,Germany,f,"kate nash,arctic monkeys,lykke li,adele,tegan ...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1666,19662,Germany,m,"blind guardian,in extremo,subway to sally,finn...",0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1667,19672,Germany,m,"system of a down,björk,portishead,the prodigy,...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1668,19681,Netherlands,m,"the strokes,muse,led zeppelin,snow patrol,radi...",0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1669,19685,Netherlands,m,"philip glass,rammstein,pink floyd,moby,blank &...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
# Mine frequent artist itemsets with Apriori: every itemset present in at
# least 1.5% of the per-user rows is kept.
# The indicator columns are cast to bool first, because mlxtend warns that
# non-bool input is slower and may stop being supported; the resulting
# supports are unchanged.
frequent_itemsets_user = apriori(
    dummy_data_by_user
    .drop(columns=['user', 'country', 'sex', 'artist'])
    .astype(bool),
    min_support=0.015,
    use_colnames=True,
)

frequent_itemsets_user

Unnamed: 0,support,itemsets
0,0.018552,(2pac)
1,0.059246,(3 doors down)
2,0.041891,(30 seconds to mars)
3,0.020347,(36 crazyfists)
4,0.020946,(50 cent)
...,...,...
996,0.017355,"(rammstein, koяn, system of a down)"
997,0.015560,"(slipknot, koяn, system of a down)"
998,0.016756,"(rammstein, metallica, system of a down)"
999,0.015560,"(slipknot, rammstein, system of a down)"


## Правило 1

In [37]:
# Derive association rules from the frequent itemsets, keeping the rules whose
# support reaches 0.015 (the same threshold used when mining the itemsets).
rules_by_user = association_rules(
    frequent_itemsets_user,
    min_threshold=0.015,
    metric="support",
)

In [40]:
# Show the rules whose conviction is below 1.
rules_by_user.loc[rules_by_user['conviction'].lt(1)]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
382,(nirvana),(coldplay),0.091562,0.177139,0.016158,0.176471,0.996224,-6.1e-05,0.999188
383,(coldplay),(nirvana),0.177139,0.091562,0.016158,0.091216,0.996224,-6.1e-05,0.99962
398,(rammstein),(coldplay),0.136445,0.177139,0.020946,0.153509,0.866599,-0.003224,0.972084
399,(coldplay),(rammstein),0.177139,0.136445,0.020946,0.118243,0.866599,-0.003224,0.979357


"Антиправила": conviction < 1, что эквивалентно lift < 1 (confidence правила ниже поддержки следствия)  
Такие рекомендации делать не стоит

## Правило 2

In [41]:
# Show the rules that recommend exactly 'maxïmo park' with confidence above
# 0.2 and lift above 1.
confident = rules_by_user['confidence'].gt(0.2)
positively_associated = rules_by_user['lift'].gt(1)
targets_maximo_park = rules_by_user['consequents'].map(
    lambda items: items == {'maxïmo park'}
)
rules_by_user.loc[confident & positively_associated & targets_maximo_park]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
98,(arctic monkeys),(maxïmo park),0.102932,0.054458,0.02693,0.261628,4.804178,0.021324,1.280576
258,(bloc party),(maxïmo park),0.089168,0.054458,0.022741,0.255034,4.683089,0.017885,1.269241
836,(mando diao),(maxïmo park),0.067624,0.054458,0.01556,0.230088,4.225032,0.011877,1.228117
862,(the kooks),(maxïmo park),0.097546,0.054458,0.021544,0.220859,4.055552,0.016232,1.213569


Не самая популярная группа maxïmo park    
Можно для продвижения рекомендовать ее пользователям, слушающим arctic monkeys, bloc party, mando diao, the kooks