In [1]:
#DB
from questions.models import Problem, Solution, Cluster

# Helpers
import numpy as np

# Preprocessing
from analyzer import python_analyzer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from vectorizer import NCutVectorizer

# Learning
from clustering import Clustering

## Data

In [2]:
## Cleaning database
# Problems whose id is above this threshold are excluded from the analysis.
last_id = 132

# Flag every solution that belongs to an excluded problem so that the
# `ignore=False` filter below skips it. NOTE: this writes to the database.
ignored_problems = Problem.objects.filter(id__gt=last_id)
Solution.objects.filter(problem__in=ignored_problems).update(ignore=True)
print("Problems to be ignored: %d" % ignored_problems.count())

problems = Problem.objects.filter(id__lte=last_id)
print("Problems to be used: %d" % problems.count())

# select_related fetches each solution's problem in the same query, so reading
# `sol.problem.content` in the loop below does not hit the database per row.
solutions_obj = (Solution.objects
                 .filter(problem__in=problems, ignore=False)
                 .select_related('problem')
                 .order_by('id'))
print("Solutions to be used: %d" % solutions_obj.count())

# Parallel lists: position i of each list refers to the same solution.
docs_id = []
questions = []
solutions = []

# Fill separated structures
for sol in solutions_obj:
    docs_id.append(sol.id)
    questions.append(sol.problem.content)
    solutions.append(sol.content)

# Count what was actually loaded instead of issuing another COUNT query.
print("Got %d documents" % len(solutions))

Problems to be ignored: 591
Problems to be used: 132
Solutions to be user: 54
Got 54 documents


### Preprocessing solutions into bag of words ###

In [3]:
def create_bag_of_words(docs, vectorizer_method, binary=False, min_df=0.2):
    """Vectorize ``docs`` and return the resulting feature matrix.

    Parameters
    ----------
    docs : iterable
        Documents handed to the vectorizer's ``fit_transform``.
    vectorizer_method : callable
        Vectorizer class/factory accepting the ``analyzer``, ``binary`` and
        ``min_df`` keyword arguments.
    binary : bool, optional
        Forwarded to the vectorizer unchanged.
    min_df : float, optional
        Forwarded to the vectorizer unchanged.

    Returns
    -------
    The output of ``fit_transform``, converted with ``.toarray()`` when the
    result offers that method; otherwise the result is returned untouched.
    """
    options = {
        "analyzer": python_analyzer,
        "binary": binary,
        "min_df": min_df,
    }
    features = vectorizer_method(**options).fit_transform(docs)

    try:
        dense_features = features.toarray()
    except AttributeError:
        # No ``toarray`` available: the result is taken to be an array already.
        dense_features = features
    return dense_features

In [None]:
%%time

# NOTE: this cell uses `Gap`, which is defined in a cell further down the
# notebook; that cell has to be executed before this one.
vectorizers = [CountVectorizer, TfidfVectorizer, NCutVectorizer]
min_df = np.arange(0.05, 0.5, 0.05)
binary = [True, False]
cluster_methods = ['nmf', 'lda']
#, 'hierarchical', 'gaussian_mixture', 'spectral_clustering']

# One record per (vectorizer, min_df, binary, k, method) combination, so the
# scores survive the loop and can be compared afterwards.
grid_results = []

# Grid search
for v in vectorizers:
    for m in min_df:
        for b in binary:
            train_data_features = create_bag_of_words(solutions, v, binary=b, min_df=m)

            # Remove rows containing only zeros (weird exercises)
            solution_sample = train_data_features[~(train_data_features==0).all(1)]
            if solution_sample.shape != train_data_features.shape:
                # Only reported: the clustering below still runs on the
                # unfiltered matrix, all-zero rows included.
                error = {
                    "vectorizer": v,
                    "min_df": m,
                    "binary": b
                }
                print("ERROR: %s" % error)

            # Candidate numbers of clusters: 2 up to the smaller matrix dimension.
            clusters = range(2, min(train_data_features.shape)+1)
            for k in clusters:
                clustering = Clustering(train_data_features, k)
                gap = Gap(train_data_features, k)
                for method in cluster_methods:
                    model, document_topic, word_topic = getattr(clustering, method)()
                    # calculate_gap returns the pair (gap value, error term).
                    gap_k, gap_error = gap.calculate_gap(clustering, method)
                    grid_results.append({
                        "vectorizer": v,
                        "min_df": m,
                        "binary": b,
                        "k": k,
                        "method": method,
                        "gap": gap_k,
                        "gap_error": gap_error,
                    })
                    # TODO: elbow and silhouette scores are not computed yet.

In [18]:
from scipy.spatial.distance import pdist
import scipy
import numpy as np

class Gap(object):
    """Gap statistic for a clustering of ``data`` into ``k`` clusters.

    The within-cluster dispersion of the observed data is compared, on a log
    scale, with the dispersion obtained on ``nrefs`` reference data sets drawn
    uniformly from the bounding box of the data.

    Parameters
    ----------
    data : array-like, shape (n_samples, n_features)
        Observed feature matrix.
    k : int
        Number of clusters used when clustering the reference data sets.
    nrefs : int, optional
        Number of reference data sets to generate (must be >= 1).
    random_state : int or None, optional
        Seed for the reference data sets. ``None`` draws from NumPy's global
        random state, as the class did before this argument existed.

    Raises
    ------
    ValueError
        If ``nrefs`` is smaller than 1.
    """

    def __init__(self, data, k, nrefs=20, random_state=None):
        if nrefs < 1:
            # With no reference set the gap would need a division by zero.
            raise ValueError("nrefs must be at least 1, got %r" % (nrefs,))
        self.k = k
        self.X = np.asarray(data)
        self._prepare_ref_dataset(self.X, nrefs, random_state)

    def _prepare_ref_dataset(self, data, nrefs=20, random_state=None):
        """Draw ``nrefs`` uniform samples inside the bounding box of ``data``.

        The result is stored in ``self.refs`` and returned; it has shape
        ``(n_samples, n_features, nrefs)``.
        """
        data = np.asarray(data)
        n_samples, n_features = data.shape

        # Set bounding box
        bots = data.min(axis=0)
        spans = data.max(axis=0) - bots

        # Generate distribution in [0, 1)
        rng = np.random if random_state is None else np.random.RandomState(random_state)
        unit = rng.random_sample(size=(n_samples, n_features, nrefs))

        # Scale and shift every feature so the samples fall inside the box.
        self.refs = unit * spans[np.newaxis, :, np.newaxis] + bots[np.newaxis, :, np.newaxis]
        return self.refs

    def calculate_wk(self, X, y, distance='cosine'):
        """Return the pooled within-cluster dispersion of ``X`` under labels ``y``.

        For every label present in ``y`` the pairwise distances (as returned
        by ``scipy.spatial.distance.pdist``) between its members are summed
        and divided by ``2 * n_r``, where ``n_r`` is the cluster size; the
        per-cluster values are then added up.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
        y : array-like of cluster labels, one per row of ``X``
        distance : str, optional
            Metric name forwarded to ``pdist``.

        Raises
        ------
        ValueError
            If ``y`` is empty.
        """
        X = np.asarray(X)
        # Accept lists and column vectors as well as 1-D arrays.
        labels = np.asarray(y).ravel()
        if labels.size == 0:
            raise ValueError("y must contain at least one cluster label")

        wk = 0.0
        # Iterating over the labels actually present skips empty clusters,
        # which would otherwise contribute 0/0.
        for label in np.unique(labels):
            members = X[labels == label]
            nr = members.shape[0]

            # Get pairwise distance
            dist = pdist(members, distance).sum()

            # Calculate Wk
            wk += dist / (2.0 * nr)

        return wk

    def calculate_ref_wk(self, method, k):
        """Cluster every reference set and return the list of ``log(W_k)`` values.

        ``method`` is the name of the ``Clustering`` method to call. After the
        call the cluster of each row is read from the ``document_topic``
        attribute of the ``Clustering`` instance (row-wise argmax). The list
        is also stored in ``self.wk_refs``.
        """
        self.wk_refs = []

        for idx in range(self.refs.shape[2]):
            ref_data = self.refs[:, :, idx]
            ref_clustering = Clustering(ref_data, k)
            getattr(ref_clustering, method)()
            clusters = ref_clustering.document_topic.argmax(axis=1)
            self.wk_refs.append(np.log(self.calculate_wk(ref_data, clusters)))

        return self.wk_refs

    def calculate_gap(self, clustering, method):
        """Return ``(gap_k, error)`` for an already fitted ``clustering``.

        ``gap_k`` is the mean of ``log(W_k)`` over the reference sets minus
        ``log(W_k)`` of the observed data. ``error`` is
        ``sd * sqrt(1 + 1/B)``, with ``sd`` the standard deviation of the
        reference ``log(W_k)`` values and ``B`` the number of reference sets.
        """
        # Calculate Wk for original data
        clusters = clustering.document_topic.argmax(axis=1)
        log_wk = np.log(self.calculate_wk(self.X, clusters))

        # Calculate Wk for reference distribution
        log_wk_refs = np.asarray(self.calculate_ref_wk(method, self.k), dtype=float)
        B = log_wk_refs.shape[0]

        # Calculate Gap; float arithmetic throughout, so no integer division.
        gap_k = log_wk_refs.mean() - log_wk
        error = log_wk_refs.std() * np.sqrt(1.0 + 1.0 / B)

        return gap_k, error