In [1]:
import time 
import warnings 

import numpy as np
import pandas as pd

from sklearn import cluster
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

import plotly.plotly as py
import plotly.graph_objs as go

In [28]:
def metricPlot(coords, dim, filename='simple-3d-scatter'):
    """Show a Plotly scatter of the clustering score over the swept parameters.

    Parameters
    ----------
    coords : sequence of tuples
        When ``dim == 3`` every row is ``(para1, para2, coefficient, time)``;
        otherwise every row is ``(para1, coefficient, time)``.
    dim : int
        ``3`` draws a 3-D scatter (para1, para2, coefficient). Any other
        value draws a 2-D scatter (para1, coefficient).
    filename : str, optional
        Plot name handed to ``py.iplot``. Defaults to the value that used to
        be hard-coded, so existing calls behave as before.

    Returns
    -------
    None
        The figure is displayed through ``py.iplot`` with
        ``world_readable=True``.
    """
    marker = dict(
        size=8,
        line=dict(
            color='rgb(0, 0, 0)',
            width=0.5
        )
    )

    if dim == 3:
        frame = pd.DataFrame(coords, columns=['para1', 'para2', 'coefficient', 'time'])

        trace = go.Scatter3d(
            x=frame['para1'],
            y=frame['para2'],
            z=frame['coefficient'],
            text=frame['time'],  # the timing column is attached as per-point text
            mode='markers',
            marker=marker
        )
        # 3-D axes are configured through `scene`.
        layout = go.Layout(
            margin=dict(
                l=0,
                r=0,
                b=0,
                t=0
            ),
            scene=dict(
                xaxis=dict(title='para1'),
                yaxis=dict(title='para2'),
                zaxis=dict(title='silhouette')
            )
        )
    else:
        frame = pd.DataFrame(coords, columns=['para1', 'coefficient', 'time'])

        trace = go.Scatter(
            x=frame['para1'],
            y=frame['coefficient'],
            text=frame['time'],
            mode='markers',
            marker=marker
        )
        # A 2-D trace ignores `scene`, so the axis titles go on the
        # top-level xaxis/yaxis. The default margins are kept here because
        # zero margins would leave no room for those titles and tick labels.
        layout = go.Layout(
            xaxis=dict(title='para1'),
            yaxis=dict(title='silhouette')
        )

    fig = go.Figure(data=[trace], layout=layout)
    py.iplot(fig, world_readable=True, filename=filename)

In [3]:
def clustering(userModel):
    """Sweep clustering parameters over the activity features of one theme.

    Rows of ``userModel`` whose ``theme_id`` matches the ``theme_id`` of the
    first row are selected, reduced to four activity columns, standardised
    with ``StandardScaler`` and passed to ``clusteringParams`` once for every
    (para1, para2) combination.

    Parameters
    ----------
    userModel : pandas.DataFrame
        Must contain ``theme_id``, ``problem_duration_graded``,
        ``problem_duration_ungraded``, ``messagiesNum`` and
        ``video_duration`` columns.

    Returns
    -------
    coords : list of tuple
        One ``(para1, para2, silhouette, elapsed)`` tuple per combination.
    dim : int
        ``3``, the plot dimensionality matching the tuple layout above.
    """
    # Positional access: take the first row's theme whatever the index
    # labels are (a label lookup with `[0]` fails when no row is labelled 0).
    first_theme = userModel['theme_id'].iloc[0]
    feature_columns = ['problem_duration_graded', 'problem_duration_ungraded',
                       'messagiesNum', 'video_duration']
    X = userModel.loc[userModel['theme_id'] == first_theme, feature_columns]
    X = StandardScaler().fit_transform(X)

    floatArr = list(np.arange(0.1, 1.1, 0.1))
    intArr = list(range(2, 30))

    # eps float and min_samples int
    algorithm = cluster.DBSCAN()

    # eigen_tol float and n_clusters int (optional)
    #algorithm=cluster.SpectralClustering(eigen_solver='arpack',affinity='nearest_neighbors') # Experiment with the parameters

    # n_clusters int
    #algorithm=cluster.AgglomerativeClustering()

    # n_clusters int and batch_size int
    #algorithm=cluster.MiniBatchKMeans()

    dim = 3
    #dim=2

    coords = []

    for para1 in floatArr:
    #for para1 in intArr:
        for para2 in intArr:
            # `elapsed` rather than `time`, so the `time` module is not shadowed.
            silhouette, elapsed = clusteringParams(X, para1, para2, algorithm)
            coords.append((para1, para2, silhouette, elapsed))

    # Single-parameter variant (use together with dim=2):
    # for para1 in intArr:
    #     silhouette, elapsed = clusteringParams(X, para1, 0, algorithm)
    #     coords.append((para1, silhouette, elapsed))

    return coords, dim
    

In [17]:
def clusteringParams(X,para1,para2,algorithm):
    """Fit ``algorithm`` with the given parameters and score the clustering.

    Parameters
    ----------
    X : array-like
        Samples passed to ``algorithm.fit`` and to ``silhouette_score``.
    para1, para2
        Values applied through ``algorithm.set_params`` as ``eps`` and
        ``min_samples`` (the DBSCAN configuration that is currently active).
    algorithm : estimator
        Object exposing ``set_params``, ``fit`` and, after fitting,
        ``labels_``.

    Returns
    -------
    silhouette : float or int
        The silhouette score of the fitted labels, or ``-1`` when the score
        is undefined (a single label, or as many labels as samples).
    elapsed : float
        Seconds spent inside ``fit``.
    """
    algorithm.set_params(eps=para1, min_samples=para2) #dbscan
    #algorithm.set_params(eigen_tol=para1) #spectral
    #algorithm.set_params(n_clusters=para1) #agglomerative
    #algorithm.set_params(n_clusters=para1,batch_size=para2) #k-means

    # perf_counter is monotonic, so the interval cannot be skewed by
    # wall-clock adjustments the way time.time() differences can.
    t0 = time.perf_counter()

    with warnings.catch_warnings():
        # Silence connectivity warnings that the alternative (commented-out)
        # algorithms may emit; the filters are restored when the block exits.
        warnings.filterwarnings(
            "ignore",
            message="the number of connected components of the " +
            "connectivity matrix is [0-9]{1,2}" +
            " > 1. Completing it to avoid stopping the tree early.",
            category=UserWarning)
        warnings.filterwarnings(
            "ignore",
            message="Graph is not fully connected, spectral embedding" +
            " may not work as expected.",
            category=UserWarning)
        algorithm.fit(X)
    elapsed = time.perf_counter() - t0

    labels = algorithm.labels_
    n_labels = len(np.unique(labels))
    # silhouette_score is only defined for 2 <= n_labels <= n_samples - 1 and
    # raises ValueError outside that range, so both degenerate cases map to -1.
    if 1 < n_labels < len(labels):
        silhouette = silhouette_score(X, labels=labels)
    else:
        silhouette = -1
    return silhouette, elapsed
    
    
    

In [5]:
# Build the user study model with the project-local helper module.
from getUserStudyModel import getModel

userModel = getModel()

# Alternative: load a previously exported model from disk instead.
# userModel = pd.read_csv('user_model.csv', sep=',', encoding='utf-8')

TypeError: to_csv() got an unexpected keyword argument 'ignore_index'

In [6]:
# Persist the model as CSV without writing the index column.
userModel.to_csv('problems_starts_time.csv', index=False)

In [18]:
# Run the parameter sweep; `dim` records the plot dimensionality for `coords`.
coords, dim = clustering(userModel)

In [29]:
# Visualise the score obtained for every parameter combination.
metricPlot(coords, dim)