In [0]:
!mkdir -p local_modules/db

In [43]:
%%writefile local_modules/db/__init__.py
# Write the package init file for the local `db` module. It defines the
# PythonProblems class that later cells load with `from db import PythonProblems`.

""" Class to create database with problems and solutions in Python """
import sqlite3
from sqlite3 import Error


class PythonProblems(object):
    """SQLite-backed store for Python problems and their solutions.

    Rows are staged in memory with :meth:`insert_rows` and written to the
    ``problem`` and ``solution`` tables with :meth:`populate`.
    """

    def __init__(self, db_file, sql_file=None):
        """Open the database and optionally run a schema script.

        :param db_file: path handed to ``sqlite3.connect``
        :param sql_file: optional path of a file with SQL statements that is
            executed right after connecting
        """
        self.db_file = db_file
        # Defined up front so append mode and populate() work on a fresh object.
        self.rows = []
        self.repeated_problems = []
        self.create_connection()
        if sql_file:
            self.create_tables(sql_file)

    def create_connection(self):
        """Create a database connection to the SQLite database specified by
        ``self.db_file``.

        :return: the connection, or ``None`` when connecting failed (the
            error is printed, not raised)
        """
        # Always define the attribute, even when the connection attempt fails.
        self.conn = None
        try:
            self.conn = sqlite3.connect(self.db_file)
        except Error as e:
            print(e)
        return self.conn

    def close_connection(self):
        """Close the connection if one is open."""
        if getattr(self, "conn", None) is not None:
            self.conn.close()

    def create_tables(self, create_table_sql):
        """Run every SQL statement found in a script file.

        :param create_table_sql: path of a file containing SQL statements
        :return: None; SQLite errors are printed, not raised
        """
        try:
            with open(create_table_sql, 'r') as sql_file:
                sql_script = sql_file.read()
            # executescript lets SQLite split the statements itself, so a ';'
            # inside a string literal or trigger body no longer breaks the script.
            self.conn.executescript(sql_script)
            self.conn.commit()
        except Error as e:
            print(e)

    def insert_rows(self, problems, solutions, mode='i'):
        """Stage (problem, solution) pairs for a later :meth:`populate` call.

        :param problems: sequence of problem dicts
        :param solutions: iterable of solution dicts; each has an ``"idx"``
            key that indexes into ``problems``
        :param mode: ``'i'`` (insert) resets the staged rows first, any other
            value (e.g. ``'a'``) appends to the rows already staged
        """
        # Also start from an empty list when nothing has been staged yet, so
        # append mode works as the very first call.
        if mode == 'i' or not hasattr(self, 'rows'):
            self.rows = []

        for item in solutions:
            self.rows.append({
                "problem": problems[item["idx"]],
                "solution": item,
            })

    def _insert_record(self, table, record):
        """Insert one dict as a row of ``table`` and return its row id."""
        # Table and column names cannot be bound as parameters; they come from
        # the dict keys, so only pass dicts whose keys are trusted. Values are
        # always bound with '?' placeholders.
        sql = ''' INSERT INTO %s(%s) VALUES(%s) ''' % (
            table, ','.join(record.keys()), ','.join('?' * len(record)))
        cur = self.conn.cursor()
        cur.execute(sql, list(record.values()))
        return cur.lastrowid

    def populate(self):
        """Write the staged rows to the ``problem`` and ``solution`` tables.

        Each distinct problem is inserted once, and every one of its solutions
        is linked to it through ``problem_id``. A problem rejected with
        ``sqlite3.IntegrityError`` (e.g. already stored by a previous crawler
        run) is skipped together with its solutions. A solution rejected with
        ``sqlite3.IntegrityError`` is only counted.

        :return: tuple ``(problems inserted, solutions inserted,
            rows skipped because their problem was repeated,
            repeated solutions)``
        """
        total_problems = 0
        total_solutions = 0
        repeated_problems = []
        repeated_solutions = 0

        # Problems are tracked by object identity: insert_rows stores the same
        # dict object for every solution of a problem, and self.rows keeps
        # those objects alive, so id() is stable while this loop runs. Unlike
        # the "idx" value, identity stays unique when several batches were
        # appended and when the rows are not grouped by problem.
        problem_ids = {}
        skipped = set()

        for item in getattr(self, "rows", []):
            problem = item["problem"]
            key = id(problem)

            if key not in problem_ids and key not in skipped:
                try:
                    problem_ids[key] = self._insert_record("problem", problem)
                    total_problems += 1
                except sqlite3.IntegrityError:
                    # Repeated problem from another crawler run.
                    skipped.add(key)

            if key in skipped:
                repeated_problems.append(item)
                continue

            # Work on a copy so the caller's solution dict keeps its "idx" key
            # and populate() can safely be called again.
            solution = {name: value
                        for name, value in item["solution"].items()
                        if name != "idx"}
            solution["problem_id"] = problem_ids[key]
            try:
                self._insert_record("solution", solution)
                total_solutions += 1
            except sqlite3.IntegrityError:
                repeated_solutions += 1

        # Persist everything in one commit.
        self.conn.commit()
        self.repeated_problems = repeated_problems
        return total_problems, total_solutions, len(repeated_problems), repeated_solutions

Overwriting local_modules/db/__init__.py


In [1]:
#import sys
#sys.path.append('local_modules')

from db import PythonProblems
import io
import tokenize
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from sklearn.decomposition import NMF
#from sklearn.decomposition import ProjectedGradientNMF
from sklearn.preprocessing import normalize
import time
from itertools import product

In [45]:
!pip install bokeh



In [2]:
from bokeh.models import ColumnDataSource
import bokeh.plotting

In [0]:
# Install a Drive FUSE wrapper.
# https://github.com/astrada/google-drive-ocamlfuse
#!apt-get install -y -qq software-properties-common python-software-properties module-init-tools
#!add-apt-repository -y ppa:alessandro-strada/ppa 2>&1 > /dev/null
!apt-get update -qq 2>&1 > /dev/null
!apt-get -y install -qq google-drive-ocamlfuse fuse

In [0]:
# Generate auth tokens for Colab
from google.colab import auth
auth.authenticate_user()

In [49]:
# Generate creds for the Drive FUSE library.
from oauth2client.client import GoogleCredentials
creds = GoogleCredentials.get_application_default()
import getpass
!google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret} < /dev/null 2>&1 | grep URL
vcode = getpass.getpass()
!echo {vcode} | google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret}

··········


In [50]:
# Create a directory and mount Google Drive using that directory.
!mkdir -p drive
!google-drive-ocamlfuse drive

fuse: mountpoint is not empty
fuse: if you are sure this is safe, use the 'nonempty' mount option


### Connecting to database

In [3]:
# Connect to the SQLite file 'python.sqlite'. No sql_file is passed, so the
# constructor does not run any schema script.
db = PythonProblems('python.sqlite')

In [4]:
# Token categories (names from tokenize.tok_name) that are dropped silently.
removed_itens = ['NEWLINE', 'STRING', 'ENDMARKER', 'NUMBER', 'INDENT', 'DEDENT', "NL", 'COMMENT', 'ERRORTOKEN']
# Token categories whose text is kept as a word of the document.
allowed_itens = ['NAME', 'OP']
cursor = db.conn.cursor()
docs = []
errors = []

# Read the data
cursor.execute("""
SELECT * FROM solution;
""")

# Build one space-separated token "document" per row of the solution table.
for idx, linha in enumerate(cursor.fetchall()):
    # Column 1 of the row is handed to the tokenizer as Python source text.
    file = io.StringIO(linha[1])
    doc = []
    try:
        for item in tokenize.generate_tokens(file.readline):
            if tokenize.tok_name[item[0]] in removed_itens:
                continue
            if tokenize.tok_name[item[0]] in allowed_itens:
                doc.append(item[1])
            else:
                # Neither dropped nor kept: report the unexpected category.
                print("%s %s" % (tokenize.tok_name[item[0]], item[1]))
    except (IndentationError, tokenize.TokenError):
        errors.append("Please, fix solution %d before continuing" % (idx+1))

    # Appended even after a tokenizer error, so docs keeps one entry per row.
    docs.append(' '.join(doc))

if errors:
    for item in errors:
        print(item)
else:
    print("Success in parsing all documents! You may go on!")

Success in parsing all documents! You may go on!


In [5]:
# Initialize the "CountVectorizer" object, which is scikit-learn's
# bag of words tool.  
vectorizer = CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,   \
                             #max_features = 26
                            ) 

# fit_transform() does two functions: First, it fits the model
# and learns the vocabulary; second, it transforms our training data
# into feature vectors. The input to fit_transform should be a list of 
# strings.
train_data_features = vectorizer.fit_transform(docs)

# Numpy arrays are easy to work with, so convert the result to an 
# array
train_data_features = train_data_features.toarray()

### NCut weights

In [6]:
# Scale every document (row) to unit L2 norm; these are normalize()'s defaults, spelled out.
doc_mat_norm = normalize(train_data_features, norm="l2", axis=1)

In [7]:
# Term-by-term similarity matrix; the 0.001 offset is added to every entry.
S = doc_mat_norm.T @ doc_mat_norm + 0.001

In [8]:
S.shape

(1508, 1508)

In [9]:
# Diagonal matrix holding (row sum of S) ** -0.5 for each term.
D = np.eye(S.shape[0]) * np.power(S.sum(axis=1), -0.5)

In [10]:
# Weight the term-by-document count matrix row-wise with D.
Y = D @ train_data_features.T

In [11]:
Y.shape

(1508, 758)

# Cálculo da NMF

In [12]:
# Fitted models and their NCut scores, keyed by the number of components k
# (filled in by the training loop further down).
models = {}

# Data source shared by both glyphs; it starts with a single placeholder
# point at (0, 0).
source = ColumnDataSource(data={"x": [0], "y": [0]})

# NOTE(review): plot_height/plot_width appear to have been renamed to
# height/width in newer Bokeh releases — confirm against the installed version.
plot = bokeh.plotting.figure(title="Ncut", plot_height=500, plot_width=900)
# The line is added first and the point markers second, both reading `source`.
plot.line(x='x', y='y', color="#2222aa", line_width=2, source=source)
plot.circle(x='x', y='y', color="#2222aa", fill_color="white", size=8, source=source)
# NOTE(review): no output_notebook() call is visible in this notebook — confirm
# where show() renders the figure.
bokeh.plotting.show(plot)

In [13]:
def edges_sum(cond_1, cond_2, S):
    """Sum the edge weights S[i, j] over all i selected by cond_1 and j by cond_2.

    :param cond_1: boolean mask selecting the first group of nodes
    :param cond_2: boolean mask selecting the second group of nodes
    :param S: 2-D NumPy array of edge weights
    :return: the sum of the selected sub-block of S (0.0 when either group is
        empty)
    """
    nodes_1 = np.where(cond_1)[0]
    nodes_2 = np.where(cond_2)[0]

    # The broadcast index picks the |nodes_1| x |nodes_2| sub-block in one
    # step. This replaces np.sum() over a Python generator of index pairs,
    # which is deprecated in NumPy and loops over every pair in Python.
    return S[nodes_1[:, None], nodes_2].sum()


def n_cut(mat_U, mat_S):
    """Normalized-cut value of the clustering encoded by mat_U.

    Every row of mat_U is assigned to the column holding its largest value.
    For each cluster the weight of the edges leaving it is divided by the
    weight of all edges starting in it; the result is half the sum of these
    ratios.

    :param mat_U: 2-D array, one row per node and one column per cluster
    :param mat_S: 2-D array of edge weights between nodes
    :return: 0.5 * sum over clusters of cut / (association + cut)
    """
    term_clusters = mat_U.argmax(axis=1)

    n_k = mat_U.shape[1]

    result = 0.0
    for idx_k in range(n_k):

        cond_g_k = term_clusters == idx_k
        num = edges_sum(cond_g_k, ~cond_g_k, mat_S)
        den = edges_sum(cond_g_k,  cond_g_k, mat_S) + num

        # A cluster that received no node (or only zero-weight edges) has
        # nothing to cut; skip it instead of dividing 0 by 0.
        if den == 0:
            continue

        result += num/den

    return 0.5 * result

In [14]:
import random

In [15]:
# Candidate numbers of NMF components (5..49) in random order. A single
# shuffle already yields a uniformly random order, so the repeated calls
# were dropped.
num_topics = list(range(5, 50))
random.shuffle(num_topics)

In [16]:
# Fit one NMF per candidate number of components k, score it with the
# normalized cut and refresh the plotted curve.
for k in [10,]:

    start_time = time.time()
    model = NMF(n_components=k, init="nndsvda", max_iter=1000)
    # Y.T has one row per document, so V_T holds the document factors and
    # model.components_ (U_T) has one column per term.
    V_T = model.fit_transform(Y.T)
    U_T = model.components_
    n_cut_val = n_cut(U_T.T, S)

    models[k] = {
        "model": model,
        "V_T": V_T,
        "n_cut": n_cut_val,
    }

    end_time = time.time()

    # Replace both columns in ONE assignment. The previous code assigned the
    # columns one at a time and then called source.stream(source.data), which
    # appends the source's own columns to themselves and duplicates every point.
    ks = sorted(models.keys())
    source.data = {
        "x": ks,
        "y": [models[x]["n_cut"]/x for x in ks],
    }

In [0]:
# Take a look at the words in the vocabulary
vocab = vectorizer.get_feature_names()
vocab_cell = np.asarray(vocab).astype(object)

In [0]:
# Fit a second NMF, this time directly on the dense term-count matrix and with
# the library's default settings. This rebinds `model`, replacing the model
# left over from the training loop above.
# NOTE(review): n_components is not set, so the library default applies —
# confirm that this is the intended number of topics.
model = NMF()
# W: result of fit_transform on the count matrix; H: the fitted components.
W = model.fit_transform(train_data_features)
H = model.components_

In [0]:
def print_top_words(model, feature_names, n_top_words):
    """Print, for each topic, its n_top_words highest-weighted feature names.

    :param model: fitted object exposing ``components_`` (one row per topic)
    :param feature_names: sequence mapping a feature index to its name
    :param n_top_words: number of names to list per topic
    """
    for topic_idx, topic in enumerate(model.components_):
        # Indices sorted by weight, heaviest first, cut to the requested count.
        top_indices = topic.argsort()[::-1][:n_top_words]
        top_words = [feature_names[i] for i in top_indices]
        print("Topic #%d: " % topic_idx + " ".join(top_words))
    # Blank line after the last topic.
    print()

In [0]:
print("\nTopics in NMF model (Frobenius norm):")
# get_feature_names() was removed from newer scikit-learn releases in favour
# of get_feature_names_out(); use whichever the installed version provides.
if hasattr(vectorizer, "get_feature_names_out"):
    tfidf_feature_names = list(vectorizer.get_feature_names_out())
else:
    tfidf_feature_names = vectorizer.get_feature_names()
# List the 26 highest-weighted vocabulary words of every topic.
print_top_words(model, tfidf_feature_names, 26)


Topics in NMF model (Frobenius norm):
Topic #0: true false while if format append column def elif else float for import in input int len num print random range result_str return row str and
Topic #1: format print while and input import if str append column def elif else false float for return result_str true int len num row random range in
Topic #2: if while true append column def elif else false float for format import in input int len num print random range result_str return row str and
Topic #3: print append else def random float len while if column elif false for format in import true input int num range result_str return row str and
Topic #4: result_str column and else row range for elif input if append def false float format while import true int len num print random return str in
Topic #5: column row for range and in false else while format import str return len int input true if random float num print elif def append result_str
Topic #6: print input format true while if append