# Warsztaty Python w Data Science

---

## Machine Learning - część 4 z 5. Unsupervised Learning

- ### Clustering
- ### Przekleństwo wymiarowości (ang. __*Curse of Dimensionality*__)
- ### Reguły asocjacyjne - algorytm Apriori

---

https://scikit-learn.org/stable/modules/clustering.html

---

## Dane syntetyczne

In [None]:
import matplotlib.pyplot as plt

from sklearn.datasets import make_classification
from sklearn.datasets import make_blobs
from sklearn.datasets import make_gaussian_quantiles
# Draw three Gaussian blobs in 2-D; X1 holds the points, Y1 the blob index
# of each point. random_state=4 keeps the sample identical between runs.
X1, Y1 = make_blobs(n_features=2, centers=3, random_state=4)

plt.style.use("dark_background")
plt.figure(figsize=(8, 8))
plt.title("Three blobs", fontsize='small')

# Colour every point by the blob it was generated from.
plt.scatter(
    X1[:, 0],
    X1[:, 1],
    c=Y1,
    s=25,
    marker='o',
    edgecolor='w',
)

plt.show()

---
# Clustering K-means

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline 
import warnings
warnings.filterwarnings('ignore')
from sklearn.cluster import KMeans
from sklearn import datasets
plt.style.use("dark_background")

# Cluster the synthetic blobs and compare the result with the true blob labels.
df = pd.DataFrame(X1)

# n_init and random_state are pinned so the figure is the same on every run
# and does not depend on the library's default for n_init.
kmeanModel = KMeans(n_clusters=3, n_init=10, random_state=0)
kmeanModel.fit(df)

# predict() is evaluated before the new columns are attached, so it only
# sees the two feature columns the model was fitted on.
df['k_means'] = kmeanModel.predict(df)
df['target'] = Y1

# Left panel: generating blob of each point; right panel: K-means label.
fig, axes = plt.subplots(1, 2, figsize=(16, 8))
axes[0].scatter(df[0], df[1], c=df['target'])
axes[1].scatter(df[0], df[1], c=df['k_means'])
axes[0].set_title('Actual', fontsize=18)
# The trailing ';' keeps the notebook from echoing the returned Text object.
axes[1].set_title('K-Means', fontsize=18);

---
## Przekleństwo wymiarowości (ang. __*Curse of Dimensionality*__)

### "Przekleństwo gęstości informacji"
- 2-wymiarowy sześcian o boku 1 
  - każdy bok próbkujemy co 0.1 - $10^2$ próbek 
- 10-wymiarowy sześcian o boku 1 
  - każdy bok próbkujemy co 0.1 - $10^{10}$ próbek 
- 100-wymiarowy sześcian o boku 1 
  - każdy bok próbkujemy co 0.1 - $10^{100}$ próbek 

### "Przekleństwo odległości"
- 2-wymiarowy sześcian o boku 1 
  - ma przekątną $\sqrt{2} \approx 1.41$ 
- 10-wymiarowy sześcian o boku 1 
  - ma przekątną $\sqrt{10} \approx 3.16$  
- 100-wymiarowy sześcian o boku 1 
  - ma przekątną $\sqrt{100} = 10$  

  

---
## Iris Dataset

This data set consists of the petal and sepal measurements of 3 different types of irises (Setosa, Versicolour, and Virginica), stored in a 150x4 numpy.ndarray

The rows being the samples and the columns being: 
- Sepal Length 
- Sepal Width 
- Petal Length 
- Petal Width


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline 
import warnings
warnings.filterwarnings('ignore')
from sklearn.cluster import KMeans
from sklearn import datasets
# Load the iris measurements and wrap the raw feature matrix in a DataFrame
# (columns are the positional indices 0..3).
iris = datasets.load_iris()
data = pd.DataFrame(iris.data)

# Preview the first rows.
data.head()

In [None]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn import datasets
from sklearn.decomposition import PCA

# Load the iris measurements and the species label of every sample.
iris = datasets.load_iris()
X = iris.data[:, :2]  # first two features only; not used by the 3-D plot below
y = iris.target

# Padded bounds of the first two features; not used by the 3-D plot below.
x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5

# Create the 3-D axes through the figure: constructing Axes3D(fig, ...) directly
# no longer attaches the axes to the figure in current Matplotlib releases,
# which leaves the plot empty.
fig = plt.figure(1, figsize=(8, 6))
ax = fig.add_subplot(111, projection="3d", elev=-150, azim=110)

# Project all four features onto the first three principal components.
X_reduced = PCA(n_components=3).fit_transform(iris.data)
ax.scatter(X_reduced[:, 0], X_reduced[:, 1], X_reduced[:, 2], c=y,
           cmap=plt.cm.Set1, edgecolor='k', s=40)
ax.set_title("First three PCA directions")

# Hide tick labels on every axis. The xaxis/yaxis/zaxis attributes replace the
# w_xaxis/w_yaxis/w_zaxis aliases, which Matplotlib deprecated and then removed.
ax.set_xlabel("1st eigenvector")
ax.xaxis.set_ticklabels([])
ax.set_ylabel("2nd eigenvector")
ax.yaxis.set_ticklabels([])
ax.set_zlabel("3rd eigenvector")
ax.zaxis.set_ticklabels([])

plt.show()

In [None]:
# Fit K-means for k = 1..9 on the iris features and record each model's
# inertia_ so the values can be plotted against k.
distortions = []
K = range(1, 10)
for k in K:
    # n_init and random_state are pinned so the curve is the same on every run
    # and does not depend on the library's default for n_init.
    kmeanModel = KMeans(n_clusters=k, n_init=10, random_state=0)
    kmeanModel.fit(data)
    distortions.append(kmeanModel.inertia_)

In [None]:
# Plot the recorded distortion for every tested k; the "elbow" of the curve
# is the candidate number of clusters.
plt.figure(figsize=(16, 8))
plt.plot(K, distortions, 'bx-')
plt.gca().set(
    xlabel='k',
    ylabel='Distortion',
    title='The Elbow Method showing the optimal k',
)
plt.show()

## n_clusters=3

In [None]:
# Cluster the four iris measurements into three groups.
df = pd.DataFrame(iris['data'])

# n_init and random_state are pinned so the figure is the same on every run
# and does not depend on the library's default for n_init.
kmeanModel = KMeans(n_clusters=3, n_init=10, random_state=0)
kmeanModel.fit(df)

# predict() is evaluated before the new columns are attached, so it only
# sees the feature columns the model was fitted on.
df['k_means'] = kmeanModel.predict(df)
df['target'] = iris['target']

# Left: true species, right: K-means labels. Only columns 0 and 1 are drawn.
fig, axes = plt.subplots(1, 2, figsize=(16, 8))
axes[0].scatter(df[0], df[1], c=df['target'])
axes[1].scatter(df[0], df[1], c=df['k_means'])
axes[0].set_title('Actual', fontsize=18)
# The trailing ';' keeps the notebook from echoing the returned Text object.
axes[1].set_title('K-Means N=3', fontsize=18);

## n_clusters=2

In [None]:
# Cluster the four iris measurements into two groups.
df = pd.DataFrame(iris['data'])

# n_init and random_state are pinned so the figure is the same on every run
# and does not depend on the library's default for n_init.
kmeanModel = KMeans(n_clusters=2, n_init=10, random_state=0)
kmeanModel.fit(df)

# predict() is evaluated before the new columns are attached, so it only
# sees the feature columns the model was fitted on.
df['k_means'] = kmeanModel.predict(df)
df['target'] = iris['target']

# Left: true species, right: K-means labels. Only columns 0 and 1 are drawn.
fig, axes = plt.subplots(1, 2, figsize=(16, 8))
axes[0].scatter(df[0], df[1], c=df['target'])
axes[1].scatter(df[0], df[1], c=df['k_means'])
axes[0].set_title('Actual', fontsize=18)
# The trailing ';' keeps the notebook from echoing the returned Text object.
axes[1].set_title('K-Means N=2', fontsize=18);

## n_clusters=4

In [None]:
# Cluster the four iris measurements into four groups.
df = pd.DataFrame(iris['data'])

# n_init and random_state are pinned so the figure is the same on every run
# and does not depend on the library's default for n_init.
kmeanModel = KMeans(n_clusters=4, n_init=10, random_state=0)
kmeanModel.fit(df)

# predict() is evaluated before the new columns are attached, so it only
# sees the feature columns the model was fitted on.
df['k_means'] = kmeanModel.predict(df)
df['target'] = iris['target']

# Left: true species, right: K-means labels. Only columns 0 and 1 are drawn.
fig, axes = plt.subplots(1, 2, figsize=(16, 8))
axes[0].scatter(df[0], df[1], c=df['target'])
axes[1].scatter(df[0], df[1], c=df['k_means'])
axes[0].set_title('Actual', fontsize=18)
# The trailing ';' keeps the notebook from echoing the returned Text object.
axes[1].set_title('K-Means N=4', fontsize=18);

---
### Hierarchical Clustering

In [None]:
import numpy as np

from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import dendrogram
from sklearn.datasets import load_iris
from sklearn.cluster import AgglomerativeClustering


def plot_dendrogram(model, **kwargs):
    """Draw a SciPy dendrogram for a fitted agglomerative clustering model.

    Builds a linkage matrix with one row per merge -- the two merged node ids,
    the merge distance and the number of original samples under the new
    node -- and passes it to ``scipy.cluster.hierarchy.dendrogram``.

    Args:
        model: Fitted estimator exposing ``children_``, ``labels_`` and
            ``distances_``.
        **kwargs: Forwarded unchanged to ``dendrogram``.

    Returns:
        Whatever ``dendrogram`` returns for the constructed linkage matrix.

    Raises:
        AttributeError: If ``model`` does not expose ``distances_``.
    """
    distances = getattr(model, "distances_", None)
    if distances is None:
        # Fail with an actionable message instead of a bare attribute error
        # raised halfway through building the matrix.
        raise AttributeError(
            "model has no 'distances_'; fit AgglomerativeClustering with "
            "distance_threshold set or compute_distances=True"
        )

    n_samples = len(model.labels_)

    # Number of original samples underneath each merge node.
    counts = np.zeros(model.children_.shape[0])
    for i, merge in enumerate(model.children_):
        # A child id below n_samples is a leaf (one sample); any other id
        # refers to the earlier merge stored at row (id - n_samples).
        counts[i] = sum(
            1 if child_idx < n_samples else counts[child_idx - n_samples]
            for child_idx in merge
        )

    linkage_matrix = np.column_stack(
        [model.children_, distances, counts]
    ).astype(float)

    # Plot the corresponding dendrogram and hand its result back to the caller.
    return dendrogram(linkage_matrix, **kwargs)


# Agglomerative clustering on the full iris feature matrix.
iris = load_iris()
X = iris.data

# distance_threshold=0 (with n_clusters=None) ensures the full tree is computed.
model = AgglomerativeClustering(n_clusters=None, distance_threshold=0).fit(X)

plt.title('Hierarchical Clustering Dendrogram')
# Only the top three levels of the dendrogram are drawn.
plot_dendrogram(model, p=3, truncate_mode='level')
plt.xlabel("Number of points in node (or index of point if no parenthesis).")
plt.show()

---
# Reguły asocjacyjne - algorytm Apriori

Tworzy reguły "jeśli A to B" pisane $A \rightarrow B$

__*Support*__ dla $X$ - jaka część zbioru transakcyjnego zawiera $X$

$$supp(X) = \frac{|\{t \in T : X \subseteq t\}|}{|T|}$$


__*Confidence*__ dla $X \rightarrow Y$ - jak często dana reguła jest prawdziwa

$$conf(X \rightarrow Y) = supp(X \cup Y) \div supp(X)$$
czyli jaka część transakcji zawierających $X$ zawiera również $Y$


__*lift*__ dla $X \rightarrow Y$ - jak zależne są $X$ i $Y$
$$lift(X \rightarrow Y) = \frac{supp(X \cup Y)}{supp(X) \times supp(Y)}$$

- $lift < 1$ - przedmioty są substytucyjne (skorelowane ujemnie)
- $lift > 1$ - przedmioty są komplementarne (skorelowane dodatnio)
- $lift = 1$ - przedmioty są niezależne


!pip install efficient-apriori

In [None]:
from efficient_apriori import apriori

# Three toy transactions, one list of purchased items per basket.
itemsetlist = [
    ['eggs', 'bacon', 'soup'],
    ['eggs', 'bacon', 'apple'],
    ['soup', 'bacon', 'banana'],
]

# Mine frequent itemsets and rules with both thresholds set to 0.5.
freqitemset, rules = apriori(
    itemsetlist,
    min_support=0.5,
    min_confidence=0.5,
)

print(rules)

## MovieLens

100,000 ratings and 3,600 tag applications applied to 9,000 movies by 600 users. Last updated 9/2018.

https://grouplens.org/datasets/movielens/

In [None]:
import pandas as pd

movies = pd.read_csv('data/ml-latest-small/movies.csv')
movies.head()

In [None]:
# Lookup table movieId -> title. Pairing the two columns directly avoids the
# per-row Series that iterrows() builds and leaves no loop variables behind.
id_title = dict(zip(movies['movieId'], movies['title']))

In [None]:
id_title[260]

In [None]:
ratings = pd.read_csv('data/ml-latest-small/ratings.csv')
ratings.head(10)

In [None]:
ratings['rating'].value_counts()

In [None]:
# Keep only the rows whose rating is strictly greater than 3.
ratings = ratings.loc[ratings['rating'] > 3]

In [None]:
ratings['rating'].value_counts()

In [None]:
def to_item_set_list(df):
    """Group movie ids into one transaction (list) per user.

    Args:
        df: DataFrame with ``userId`` and ``movieId`` columns; other columns
            are ignored.

    Returns:
        A list holding one list of ``int`` movie ids per user. Users appear in
        the order of their first row in ``df`` and each user's ids keep the
        row order. An empty frame yields an empty list.
    """
    usermap = {}
    # Iterate over the two needed columns only; iterrows() would build a
    # Series for every row, which is needlessly slow on the full ratings table.
    for user_id, movie_id in zip(df['userId'], df['movieId']):
        usermap.setdefault(user_id, []).append(int(movie_id))
    # Return a real list rather than a live dict view so the result can be
    # indexed and does not keep a reference to the internal dict.
    return list(usermap.values())

In [None]:
to_item_set_list(ratings.head(50))

In [None]:
itemsetlist = to_item_set_list(ratings)

In [None]:
len(itemsetlist)

In [None]:
# Mine association rules from the per-user movie lists, passing 0.1 as the
# minimum support and 0.8 as the minimum confidence.
freqitemset, rules = apriori(itemsetlist, min_support=0.1, min_confidence=0.8)
# Number of rules returned.
len(rules)

In [None]:
from collections import Counter


def get_recommendations(rules, movieid):
    """Count the consequents of rules whose antecedent starts with ``movieid``.

    Only the first item of each rule's ``lhs`` is compared with ``movieid``,
    and only the first item of a matching rule's ``rhs`` is counted.

    Args:
        rules: Iterable of rule objects exposing ``lhs`` and ``rhs`` sequences.
        movieid: Movie id to look for at the start of a rule's ``lhs``.

    Returns:
        A list of ``(movie_id, count)`` tuples sorted by descending count;
        ties keep the order in which the ids were first encountered.
    """
    first_consequents = (
        rule.rhs[0] for rule in rules if rule.lhs[0] == movieid
    )
    return Counter(first_consequents).most_common()

In [None]:
get_recommendations(rules, 260)

In [None]:
# Print the title and rule count of every movie recommended for movieId 260.
for recommended_id, rule_count in get_recommendations(rules, 260):
    print(id_title[recommended_id], rule_count)

## SPMF data-mining

- 202 algorytmy
- 41 w tej samej kategorii co Apriori

http://www.philippe-fournier-viger.com/spmf/index.php?link=algorithms.php