In [57]:
# Input
from db import PythonProblems
import io
from scipy import io as sio

# Helpers
import numpy as np
from collections import Counter
from tqdm import tqdm
import random

# Preprocessing
import tokenize
from sklearn.feature_extraction.text import CountVectorizer

### Connecting to database

In [4]:
# Open the SQLite database that stores the problems and their solutions.
DB_PATH = 'python.sqlite'
db = PythonProblems(DB_PATH)

In [5]:
# Token types (names from tokenize.tok_name) that are silently discarded, and
# the ones whose text is kept. Any other token type is printed so that it can
# be reviewed and added to one of the two lists.
removed_itens = ['NEWLINE', 'STRING', 'ENDMARKER', 'NUMBER', 'INDENT', 'DEDENT', "NL", 'COMMENT', 'ERRORTOKEN']
allowed_itens = ['NAME', 'OP']
cursor = db.conn.cursor()
docs = []           # one space-joined token string per solution
docs_id = []        # solution.id, aligned with docs
docs_category = []  # problem.category, aligned with docs
errors = []         # messages for solutions that could not be tokenized

# Read every solution together with the category of the problem it solves.
cursor.execute("""
SELECT solution.id, solution.content, problem.category FROM solution, problem where solution.problem_id = problem.id;
""")

for solution_id, content, category in cursor.fetchall():
    source = io.StringIO(content)
    doc = []
    try:
        for token in tokenize.generate_tokens(source.readline):
            token_type = tokenize.tok_name[token[0]]
            if token_type in allowed_itens:
                doc.append(token[1])
            elif token_type not in removed_itens:
                print("%s %s" % (token_type, token[1]))
    except (IndentationError, tokenize.TokenError):
        # Report the database id rather than the fetch position: the query has
        # no ORDER BY, so the position does not reliably identify a solution.
        errors.append("Please, fix solution %s before continuing" % solution_id)

    # Tokens collected before a failure are kept so the three lists stay aligned.
    docs.append(' '.join(doc))
    docs_id.append(solution_id)
    docs_category.append(category)

# len(docs) is also valid when the query returns no rows, where a loop
# counter would be left undefined.
print("Got %d documents" % len(docs))

if errors:
    for message in errors:
        print(message)
else:
    print("Success in parsing all documents! You may go on!")

Got 758 documents
Success in parsing all documents! You may go on!


### Preprocessing solutions into a bag of words ###

In [23]:
# scikit-learn's bag-of-words tool: counts word occurrences (binary=False),
# ignores the word 'print', and with min_df=0.05 drops terms that appear in
# fewer than 5% of the documents.
# NOTE(review): with the default token_pattern, one-character names and
# operator tokens are not counted — confirm this is intended.
vectorizer = CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=['print'],
    binary=False,
    min_df=0.05,
)

# fit_transform() learns the vocabulary from the list of document strings and
# returns the document-term matrix; toarray() turns that sparse result into a
# plain numpy array, which is easier to slice in the following cells.
train_data_features = vectorizer.fit_transform(docs).toarray()
train_data_features.shape

(758, 27)

In [53]:
# Words of the learned vocabulary, in the column order of the feature matrix.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    vocab = vectorizer.get_feature_names_out().tolist()
else:
    vocab = vectorizer.get_feature_names()
# Object-dtype copy of the vocabulary, used as 'marc_label' in the .mat export.
vocab_cell = np.asarray(vocab).astype(object)

In [52]:
# Distinct category labels. A missing category is None, which cannot be
# ordered against strings, so it is represented by the label "None" while
# sorting. Unlike list.index(None), this also works when every problem has a
# category.
categories = ["None" if label is None else label for label in set(docs_category)]
categories.sort(reverse=True)

# One cell per category; each cell will hold that category's feature matrix.
data_array = np.empty((len(categories), 1), dtype=object)

# Built once instead of on every iteration of the loop below.
category_labels = np.array(docs_category)

for i, label in enumerate(categories):
    # Map the placeholder back to the value actually stored in docs_category.
    category = None if label == "None" else label
    rows = np.where(category_labels == category)
    print("Train set with class %s has %d observations." % (category, len(rows[0])))
    train = train_data_features[rows]
    # Rows that are entirely zero contain none of the vocabulary words.
    to_remove = np.where(~train.any(axis=1))[0]
    train = np.delete(train, to_remove, 0).astype(np.double)
    print("%d obs were removed. Total: %d" % (len(to_remove), len(train)))
    # Index the single cell directly so the whole matrix is stored as one
    # object, without relying on how numpy unpacks a nested sequence.
    data_array[i, 0] = train

Train set with class string has 121 observations.
17 obs were removed. Total: 104
Train set with class math has 82 observations.
2 obs were removed. Total: 80
Train set with class loop has 13 observations.
0 obs were removed. Total: 13
Train set with class list has 48 observations.
11 obs were removed. Total: 37
Train set with class function has 29 observations.
1 obs were removed. Total: 28
Train set with class file has 4 observations.
0 obs were removed. Total: 4
Train set with class dict has 38 observations.
6 obs were removed. Total: 32
Train set with class conditional has 13 observations.
0 obs were removed. Total: 13
Train set with class None has 410 observations.
33 obs were removed. Total: 377


In [60]:
# Export the per-category matrices and the vocabulary to a MATLAB file.
# The same transposed array is written under both 'train_set' and 'test_set'.
mat_contents = {
    'train_set': data_array.T,
    'marc_label': vocab_cell,
    'test_set': data_array.T,
}
sio.savemat('python.mat', mdict=mat_contents)

In [74]:
# Copy of data_array without the cell of the uncategorised ("None") problems.
# The row is looked up by label instead of being hard-coded as 8, which is
# only valid for the current set of categories.
data_array_none = np.delete(data_array, categories.index("None"), axis=0)

In [82]:
# Same export as for python.mat, using the array without the "None" category.
mat_contents_none = {
    'train_set': data_array_none.T,
    'marc_label': vocab_cell,
    'test_set': data_array_none.T,
}
sio.savemat('python_none.mat', mdict=mat_contents_none)