In [1]:
# Helpers
import numpy as np

# Preprocessing (third-party)
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# DB
from questions.models import Cluster, Problem, Solution

# Preprocessing (project-local)
from analyzer import python_analyzer
from vectorizer import NCutVectorizer

## Data

In [2]:
## Cleaning database
# Problems with an id above this cutoff are excluded from the analysis.
last_id = 132

# Flag every solution that belongs to an out-of-range problem as ignored.
# NOTE: this is a database write; it is issued again every time the cell runs.
ignored_problems = Problem.objects.filter(id__gt=last_id)
Solution.objects.filter(problem__in=ignored_problems).update(ignore=True)
print("Problems to be ignored: %d" % ignored_problems.count())

problems = Problem.objects.filter(id__lte=last_id)
print("Problems to be used: %d" % problems.count())

# select_related('problem') loads each solution's problem in the same query,
# so reading sol.problem.content in the loop below does not trigger one
# extra query per solution.
solutions_obj = (Solution.objects
                 .filter(problem__in=problems, ignore=False)
                 .select_related('problem')
                 .order_by('id'))
print("Solutions to be user: %d" % solutions_obj.count())

# Parallel lists: index i of each list describes the same solution.
docs_id = []
questions = []
solutions = []

# Fill separated structures
for sol in solutions_obj:
    docs_id.append(sol.id)
    questions.append(sol.problem.content)
    solutions.append(sol.content)

# Count what was actually collected instead of asking the queryset again.
print("Got %d documents" % len(docs_id))

Problems to be ignored: 591
Problems to be used: 132
Solutions to be user: 54
Got 54 documents


### Preprocessing solutions into bag of words ###

In [3]:
def create_bag_of_words(docs, vectorizer_method, binary=False, min_df=0.2):
    """Vectorize ``docs`` and return the resulting feature matrix.

    Parameters
    ----------
    docs : iterable
        Documents handed to the vectorizer's ``fit_transform``.
    vectorizer_method : callable
        Vectorizer class (or factory) that accepts the keyword arguments
        ``analyzer``, ``binary`` and ``min_df`` and returns an object with a
        ``fit_transform`` method.
    binary : bool, optional
        Forwarded unchanged to the vectorizer.
    min_df : float or int, optional
        Forwarded unchanged to the vectorizer.

    Returns
    -------
    The value of ``fit_transform(docs)``. If that value exposes a
    ``toarray`` method, the result of calling it is returned instead;
    otherwise the value is returned as-is.
    """
    vectorizer = vectorizer_method(analyzer=python_analyzer,
                                   binary=binary,
                                   min_df=min_df)
    train_data_features = vectorizer.fit_transform(docs)

    # Only the attribute lookup is guarded: results without `toarray` are
    # passed through, while an AttributeError raised *inside* toarray() is no
    # longer swallowed silently.
    to_dense = getattr(train_data_features, "toarray", None)
    if to_dense is None:
        return train_data_features
    return to_dense()

In [4]:
# Search space for the grid.
vectorizers = [CountVectorizer, TfidfVectorizer, NCutVectorizer]
min_df = np.arange(0.05, 0.5, 0.05)
binary = [True, False]
# Clustering methods intended for the grid; the loop below does not use them yet.
clustering = ['nmf', 'lda', 'hierarchical', 'gm', 'spectral']

# Grid search
for v in vectorizers:
    for m in min_df:
        for b in binary:
            train_data_features = create_bag_of_words(solutions, v, binary=b, min_df=m)
            # Candidate cluster counts, bounded by the smaller dimension of
            # the feature matrix built for this configuration.
            # NOTE(review): this range used to be computed once from
            # `solution_sample`, a name no visible cell defines, so the cell
            # failed on a fresh kernel. Confirm that bounding by the current
            # feature matrix is the intended behaviour.
            k = range(2, min(train_data_features.shape))
            print(train_data_features.shape)

(54, 236)
(54, 236)
(54, 83)
(54, 83)
(54, 54)
(54, 54)
(54, 44)
(54, 44)
(54, 39)
(54, 39)
(54, 31)
(54, 31)
(54, 23)
(54, 23)
(54, 22)
(54, 22)
(54, 18)
(54, 18)
(54, 236)
(54, 236)
(54, 83)
(54, 83)
(54, 54)
(54, 54)
(54, 44)
(54, 44)
(54, 39)
(54, 39)
(54, 31)
(54, 31)
(54, 23)
(54, 23)
(54, 22)
(54, 22)
(54, 18)
(54, 18)
(54, 236)
(54, 236)
(54, 83)
(54, 83)
(54, 54)
(54, 54)
(54, 44)
(54, 44)
(54, 39)
(54, 39)
(54, 31)
(54, 31)
(54, 23)
(54, 23)
(54, 22)
(54, 22)
(54, 18)
(54, 18)
