## 1. Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import nltk

import pyLDAvis.sklearn

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC 
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier 
from sklearn.metrics import accuracy_score

from utils import make_dataset, preprocess_txt, create_corpus, create_vocab, show_most_freq_n

%reload_ext autoreload
%autoreload 2

## 2. Data Preprocessing

In [2]:
# read all datasets
df_appceleratorstudio = pd.read_csv('./data/appceleratorstudio.csv')
df_aptanastudio = pd.read_csv('./data/aptanastudio.csv')
df_bamboo = pd.read_csv('./data/bamboo.csv')
df_clover = pd.read_csv('./data/clover.csv')
df_datamanagement = pd.read_csv('./data/datamanagement.csv')
df_duracloud = pd.read_csv('./data/duracloud.csv')
df_jirasoftware = pd.read_csv('./data/jirasoftware.csv')
df_mesos = pd.read_csv('./data/mesos.csv')
df_moodle = pd.read_csv('./data/moodle.csv')
df_mule = pd.read_csv('./data/mule.csv')
df_mulestudio = pd.read_csv('./data/mulestudio.csv')
df_springxd = pd.read_csv('./data/springxd.csv')
df_talenddataquality = pd.read_csv('./data/talenddataquality.csv')
df_talendesb = pd.read_csv('./data/talendesb.csv')
df_titanium = pd.read_csv('./data/titanium.csv')
df_usergrid = pd.read_csv('./data/usergrid.csv')

In [3]:
dataset_arr = [df_appceleratorstudio,
        df_aptanastudio,
        df_bamboo,
        df_clover,
        df_datamanagement,
        df_duracloud,
        df_jirasoftware,
        df_mesos,
        df_moodle,
        df_mule,
        df_mulestudio,
        df_springxd,
        df_talenddataquality,
        df_talendesb,
        df_titanium,
        df_usergrid]

In [4]:
#get union of datasets
text, labels = make_dataset(dataset_arr)

#create corpus
corpus = create_corpus(text)
len(corpus)

20761

In [5]:
vocab = create_vocab(corpus)
len(vocab)

22458

### 3. Word Frequencies as Features (BOW)


In [6]:
def doc_word_frequencies(doc):
    word_freqs = nltk.FreqDist(doc)
    
    return word_freqs

In [7]:
doc_word_freqs = doc_word_frequencies(corpus[0])
print(len(doc_word_freqs))
doc_word_freqs

14


FreqDist({'function': 3, 'arg': 2, 'literal': 2, 'object': 2, 'invocation': 2, 'type': 2, 'instance': 1, 'idea': 1, 'add': 1, 'metadata': 1, ...})

In [8]:
def vocab_frequencies(corpus):
    corpus_flat = [w for doc in corpus for w in doc]
    fdist = nltk.FreqDist(corpus_flat)

    return fdist

In [9]:
vocab_freqs = vocab_frequencies(corpus)
vocab_freqs.most_common(15)

[('code', 9877),
 ('error', 8391),
 ('file', 6569),
 ('create', 6527),
 ('add', 5689),
 ('test', 5017),
 ('project', 4985),
 ('studio', 4860),
 ('new', 4833),
 ('run', 4819),
 ('build', 4713),
 ('user', 4403),
 ('result', 4334),
 ('use', 4320),
 ('need', 4235)]

In [10]:
def doc_feature_extraction(doc, vocab):
    freqs = doc_word_frequencies(doc)
    doc_freqs = {token:freqs[token] for token in vocab}
    doc_vector = list(doc_freqs.values())
        
    return  doc_vector, doc_freqs

In [11]:
doc_vector, doc_freqs = doc_feature_extraction(corpus[0], vocab)
print(len(doc_vector))
print(doc_freqs["literal"])
print(doc_vector)

22458
2
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [12]:
def corpus_feature_extraction(corpus, vocab):
    corpus_features = [doc_feature_extraction(doc, vocab)[0] for doc in corpus]
    
    return corpus_features

In [13]:
corpus_features = corpus_feature_extraction(corpus, vocab)
len(corpus_features)

20761

In [14]:
from sklearn.manifold import TSNE

model = TSNE(n_components=2, random_state=42)

tsne_data = model.fit_transform(corpus_features)
# creating a new data frame which help us in ploting the result data
tsne_data = np.vstack((tsne_data.T, labels)).T
tsne_df = pd.DataFrame(data=tsne_data, columns=('dim_1','dim_2', 'label'))
# Ploting the result of tsne
sns.FacetGrid(tsne_df, hue='label', size=6).map(plt.scatter, 'dim_1','dim_2').add_legend()
plt.show()

MemoryError: 

# 4. Classification and prediction

In [15]:
features_train, features_test, labels_train, labels_test = train_test_split(corpus_features, labels, test_size=0.3, random_state=111)

In [24]:
svc = LinearSVC(multi_class='crammer_singer')
knc = KNeighborsClassifier(n_neighbors=50)
mnb = MultinomialNB(alpha=0.2)
dtc = DecisionTreeClassifier(min_samples_split=7, random_state=111)
lrc = LogisticRegression(solver='lbfgs', penalty='l2', multi_class='multinomial', max_iter=10000)
rfc = RandomForestClassifier(n_estimators=31, random_state=111)
abc = AdaBoostClassifier(n_estimators=62, random_state=111)
bc = BaggingClassifier(n_estimators=9, random_state=111)
etc = ExtraTreesClassifier(n_estimators=9, random_state=111)

In [23]:
clfs = {'SVC' : svc,'KN' : knc, 'NB': mnb, 'DT': dtc, 'LR': lrc, 'RF': rfc, 'AdaBoost': abc, 'BgC': bc, 'ETC': etc}

In [18]:
def train_clf(clf, train_x, train_y):
    clf.fit(train_x, train_y)

In [19]:
def predict_labels(clf, features):
    return (clf.predict(features))

In [21]:
pred_scores = []
for k,v in clfs.items():
    train_clf(v, features_train, labels_train)
    pred = predict_labels(v,features_test)
    pred_scores.append((k, [accuracy_score(labels_test,pred)]))

MemoryError: 

In [None]:
df = pd.DataFrame.from_items(pred_scores, orient='index', columns=['Score'])
df.sort_values(by=['Score'])

In [None]:
df.sort_values(by=['Score']).plot(kind='bar', figsize=(11,4), ylim=(0.1,1.0));