In [58]:
from db import PythonProblems
import io
import tokenize
from sklearn.feature_extraction.text import (CountVectorizer, HashingVectorizer, 
                                             TfidfTransformer, TfidfVectorizer)
import numpy as np
from sklearn.decomposition import NMF
from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize
import random
import time
from itertools import product
from collections import Counter
from sklearn.pipeline import make_pipeline
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_samples, silhouette_score
import matplotlib.cm as cm


In [2]:
from bokeh.io import output_notebook, push_notebook, show
from bokeh.layouts import row
from bokeh.models import (ColumnDataSource, HoverTool, LinearAxis, PanTool,
                          Range1d, TapTool, WheelZoomTool)
from bokeh.plotting import figure
# Configure Bokeh to render its plots inline in this notebook.
output_notebook()

### Connecting to database

In [3]:
# Instantiate PythonProblems (imported from the `db` module) with the
# 'python.sqlite' file name; later cells query it through its `conn` attribute.
db = PythonProblems('python.sqlite')

In [4]:
# Token type names that are dropped silently, and the ones kept as document terms.
# Any token type in neither list is printed so it can be inspected.
removed_itens = ['NEWLINE', 'STRING', 'ENDMARKER', 'NUMBER', 'INDENT', 'DEDENT', "NL", 'COMMENT', 'ERRORTOKEN']
allowed_itens = ['NAME', 'OP']
cursor = db.conn.cursor()
docs = []
errors = []

# Read the data: fetch every row of the `solution` table.
cursor.execute("""
SELECT * FROM solution;
""")

# Turn each solution into one space-separated string of its NAME/OP tokens.
# NOTE(review): column 1 of each row is assumed to hold the source text -- confirm
# against the `solution` table schema.
for position, row_data in enumerate(cursor.fetchall(), start=1):
    reader = io.StringIO(row_data[1])
    kept_tokens = []
    try:
        for token in tokenize.generate_tokens(reader.readline):
            kind = tokenize.tok_name[token[0]]
            if kind in removed_itens:
                continue
            if kind in allowed_itens:
                kept_tokens.append(token[1])
            else:
                print("%s %s" % (kind, token[1]))
    except (IndentationError, tokenize.TokenError):
        # The tokens collected before the failure are still kept below, so `docs`
        # stays aligned with the rows; the problem is only reported.
        errors.append("Please, fix solution %d before continuing" % position)

    docs.append(' '.join(kept_tokens))

if errors:
    for message in errors:
        print(message)
else:
    print("Success in parsing all documents! You may go on!")

Success in parsing all documents! You may go on!


In [29]:
# Build the TF-IDF document-term matrix from the tokenized solutions.
# min_df=2 discards terms that appear in fewer than two documents.
# (Alternatives previously tried in this cell: a binary CountVectorizer and a
# HashingVectorizer, optionally piped through a TfidfTransformer.)
vectorizer = TfidfVectorizer(min_df=2)

# fit_transform() learns the vocabulary from `docs` (a list of strings) and
# returns the feature matrix; toarray() converts it to a dense NumPy array.
train_data_features = vectorizer.fit_transform(docs).toarray()

In [30]:
# Display the dimensions of the dense document-term matrix.
train_data_features.shape

(758, 499)

# Cálculo de K-means

In [69]:
# Registry of per-k results (reset here so the plot and the registry start empty together).
models = {}

# The data source starts empty; points are streamed into it one at a time so the
# figure below updates live while the models are being fitted.
data = dict(x_values=[], y_values=[])
source = ColumnDataSource(data=data)

# Tooltip for the hovered point: x formatted without decimals, y unformatted.
hover = HoverTool(tooltips=[("(x,y)", "($x{0}, $y)")])

opts = dict(plot_width=900, plot_height=500, min_border=0, title="Silhouette")
plot = figure(**opts)
plot.add_tools(hover)

# A line plus hollow circle markers, both bound to the same streaming source.
l = plot.line(x='x_values', y='y_values', source=source,
              color="#2222aa", line_width=2)
c = plot.circle(x='x_values', y='y_values', source=source,
                color="#2222aa", fill_color="white", size=8)

# Keep the notebook handle: it is needed to push updates to this rendered plot.
h = show(plot, notebook_handle=True)

In [70]:
# Candidate cluster counts to evaluate: every integer from 2 to 199 inclusive.
num_topics = list(range(2, 200))
# Disabled: randomizing the order in which the candidates are evaluated.
#random.shuffle(num_topics)
#random.shuffle(num_topics)
#random.shuffle(num_topics)

In [71]:
# Fit K-means for every candidate k, keep the fitted model together with its mean
# silhouette score, and stream each (k, score) point to the live Bokeh plot.
KMEANS_SEED = 42  # fixed seed so Restart & Run All reproduces the same clusterings
models = {}

for k in num_topics:
    model = KMeans(n_clusters=k, init='k-means++', max_iter=1000, n_init=100,
                   random_state=KMEANS_SEED)
    cluster_labels = model.fit_predict(train_data_features)
    silhouette_avg = silhouette_score(train_data_features, cluster_labels)

    # The score is stored in a one-element list, as before, so existing readers
    # of models[k]["silhouette"] keep working.
    models[k] = {
        "model": model,
        "silhouette": [silhouette_avg]
    }

    # Stream exactly one scalar x and one scalar y per point. The stored list must
    # not be wrapped again: that would put a list, not a number, in the y column.
    source.stream({'x_values': [k],
                   'y_values': [float(silhouette_avg)]})
    # Pass the handle by keyword: push_notebook's first parameter is `document`.
    push_notebook(handle=h)

In [None]:
# Draw one silhouette plot per candidate k.
for k in num_topics:
    # Names kept from the scikit-learn silhouette example this cell is based on.
    X = train_data_features
    n_clusters = k

    # Use the labelling fitted for THIS k. Reusing whatever `cluster_labels` was
    # left over from the fitting loop would plot the last model for every k.
    cluster_labels = models[k]["model"].labels_

    # Per-sample silhouette values do not depend on the cluster being drawn,
    # so compute them once per k rather than once per cluster.
    sample_silhouette_values = silhouette_samples(X, cluster_labels)
    # The average silhouette score is the mean of the per-sample values.
    silhouette_avg = sample_silhouette_values.mean()

    fig, ax1 = plt.subplots(1, 1)
    fig.set_size_inches(18, 7)

    # The silhouette coefficient can range from -1 to 1; the axis is restricted
    # to [-0.1, 1].
    ax1.set_xlim([-0.1, 1])
    # The (n_clusters+1)*10 is for inserting blank space between silhouette
    # plots of individual clusters, to demarcate them clearly.
    ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])

    y_lower = 10
    for i in range(n_clusters):
        # Silhouette values of the samples belonging to cluster i, sorted ascending.
        # Boolean indexing returns a copy, so the in-place sort is safe.
        ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]
        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        # `cm.spectral` no longer exists in current Matplotlib; `nipy_spectral`
        # is the same colormap under its current name.
        color = cm.nipy_spectral(float(i) / n_clusters)
        ax1.fill_betweenx(np.arange(y_lower, y_upper),
                          0, ith_cluster_silhouette_values,
                          facecolor=color, edgecolor=color, alpha=0.7)

        # Label the silhouette plots with their cluster numbers at the middle
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

        # Leave a 10-row gap before the next cluster's band
        y_lower = y_upper + 10

    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")

    # The vertical line for average silhouette score of all the values
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

    ax1.set_yticks([])  # Clear the yaxis labels / ticks
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
    plt.show()
    # Release the figure: this loop creates one per k and would otherwise keep
    # all of them in memory.
    plt.close(fig)

In [291]:
def coherence_norm(data, tf_idf_norm):
    """Score cluster coherence as the L2 norm of each cluster's mean row vector.

    Each row is assigned to the column of ``data['V_T']`` holding its largest
    value. For every column index, the rows of ``tf_idf_norm`` assigned to it
    are averaged and the Euclidean norm of that average is recorded.

    Args:
        data: dict with a 2-D array under ``'V_T'`` (one row per row of
            ``tf_idf_norm``, one column per cluster). It is updated in place
            with ``'avg_clusters_norm'`` (list of per-cluster norms),
            ``'clusters_size'`` (list of per-cluster row counts) and
            ``'coherence_norm'`` (mean of all per-cluster norms).
        tf_idf_norm: 2-D array indexed by the same rows as ``data['V_T']``.
            NOTE(review): presumably the L2-normalized TF-IDF matrix -- confirm.

    Returns:
        Tuple ``(mean norm over clusters with more than 3 rows,
        standard deviation of the norms of all clusters)``.
    """
    assignments = np.argmax(data['V_T'], axis=1)
    n_clusters = data['V_T'].shape[1]

    norms, sizes = [], []
    for cluster_id in range(n_clusters):
        members = tf_idf_norm[assignments == cluster_id]
        centroid = np.average(members, axis=0)
        norms.append(np.sqrt(np.power(centroid, 2).sum()))
        sizes.append(members.shape[0])

    # Only clusters with more than 3 members contribute to the returned average.
    large_cluster_norms = [norm for norm, size in zip(norms, sizes) if size > 3]

    data['avg_clusters_norm'] = norms
    data['clusters_size'] = sizes
    data['coherence_norm'] = np.average(norms)
    return np.average(large_cluster_norms), np.std(norms)

In [292]:
# Annotate every stored model, in ascending key order, with its coherence statistics
# (coherence_norm writes its results into the model's dict).
# NOTE(review): `doc_mat_norm` is not defined in the visible part of this notebook,
# and coherence_norm reads a 'V_T' entry from each model -- confirm both exist
# on a fresh kernel.
for n_topics in sorted(models):
    coherence_norm(models[n_topics], doc_mat_norm)

In [293]:
# Plot, for every stored model key, "n_cut" divided by the key (left axis) against
# the model's "coherence_norm" value (right axis, fixed to the range [0, 1]).
# NOTE(review): no visible cell writes models[k]["n_cut"] -- confirm where it is
# computed before relying on this cell after a kernel restart.
data_x = sorted(models.keys())
data_ncut = [models[x]["n_cut"]/x for x in data_x]
data_coh = [models[x]["coherence_norm"] for x in data_x]

# The Bokeh classes are referenced by their imported names: the notebook never
# binds the bare name `bokeh`, so `bokeh.models.<X>` raised NameError.
hover = HoverTool(
    tooltips=[
        ("index", "$index"),
        ("(x, y)", "($x{0}, $y)"),
    ]
)

plot = figure(title="Ncut vs Coherence", plot_height=500, plot_width=900,
              tools=[hover, PanTool(), WheelZoomTool()])

plot.line(data_x, data_ncut, color="#2222aa", line_width=2)

# Second y-axis on the right for the coherence curve.
plot.extra_y_ranges = {"coherence": Range1d(start=0.0, end=1.0)}
plot.line(data_x, data_coh, line_width=2, color="red", y_range_name="coherence")
plot.add_layout(LinearAxis(y_range_name="coherence"), 'right')

show(plot)

In [28]:
# Take a look at the words in the vocabulary
vocab = vectorizer.get_feature_names()
vocab_cell = np.asarray(vocab).astype(object)
vocab_cell

array(['append', 'def', 'elif', 'else', 'for', 'format', 'if', 'import',
       'in', 'input', 'int', 'len', 'print', 'range', 'return', 'while'],
      dtype=object)

In [304]:
# Count how many rows of models[90]['V_T'] have their largest value in each column,
# i.e. the number of rows assigned to each column index by argmax.
# NOTE(review): the models built by the K-means cell only store "model" and
# "silhouette"; confirm where 'V_T' is populated.
Counter(np.argmax(models[90]['V_T'], axis=1))

Counter({0: 7,
         1: 2,
         2: 1,
         3: 10,
         4: 1,
         5: 8,
         6: 5,
         7: 2,
         8: 2,
         9: 11,
         10: 4,
         11: 2,
         12: 2,
         13: 2,
         14: 4,
         15: 4,
         16: 1,
         17: 3,
         18: 2,
         19: 1,
         20: 2,
         21: 9,
         22: 3,
         23: 3,
         24: 2,
         25: 1,
         26: 5,
         27: 2,
         28: 11,
         29: 4,
         30: 7,
         31: 2,
         32: 4,
         33: 5,
         34: 2,
         35: 6,
         36: 5,
         37: 6,
         38: 19,
         39: 3,
         40: 4,
         41: 2,
         42: 5,
         43: 5,
         44: 6,
         45: 5,
         46: 6,
         47: 3,
         48: 4,
         49: 29,
         50: 1,
         51: 14,
         52: 19,
         53: 1,
         54: 3,
         55: 7,
         56: 3,
         57: 1,
         58: 6,
         59: 2,
         60: 1,
         61: 3,
         62

In [7]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted feature names of every row in ``model.components_``.

    Args:
        model: object exposing ``components_``, an iterable of 1-D weight arrays
            (one per topic) that support ``argsort()``.
        feature_names: sequence mapping a feature index to its name.
        n_top_words: how many names to print per topic, in descending weight order.

    Prints one ``Topic #<index>: <names>`` line per topic, then a blank line.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort() is ascending, so read it backwards to get the largest weights first.
        top_indices = weights.argsort()[:-n_top_words - 1:-1]
        top_terms = [feature_names[i] for i in top_indices]
        print("Topic #%d: " % topic_idx + " ".join(top_terms))
    print()

In [8]:
print("\nTopics in NMF model (Frobenius norm):")
# get_feature_names() was removed from recent scikit-learn releases in favour of
# get_feature_names_out(); use the new method when present and fall back to the
# old one on older installations.
_feature_name_getter = (getattr(vectorizer, "get_feature_names_out", None)
                        or vectorizer.get_feature_names)
tfidf_feature_names = list(_feature_name_getter())
# NOTE(review): `model` must be a fitted estimator exposing `components_`; the
# only `model` assigned in the visible cells is a KMeans instance -- confirm the
# fitting cell this call depends on.
print_top_words(model, tfidf_feature_names, 26)


Topics in NMF model (Frobenius norm):
Topic #0: true false while if format append column def elif else float for import in input int len num print random range result_str return row str and
Topic #1: format print while and input import if str append column def elif else false float for return result_str true int len num row random range in
Topic #2: if while true append column def elif else false float for format import in input int len num print random range result_str return row str and
Topic #3: print append else def random float len while if column elif false for format in import true input int num range result_str return row str and
Topic #4: result_str column and else row range for elif input if append def false float format while import true int len num print random return str in
Topic #5: column row for range and in false else while format import str return len int input true if random float num print elif def append result_str
Topic #6: print input format true while if append