In [60]:
import os
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
import matplotlib.pyplot as plt
import spacy 
from spacy.matcher import Matcher
from spacy.scorer import Scorer
from spacy.training import Example
from spacy.tokens import Doc
from spacy.util import filter_spans
import pandas as pd
import re
from collections import Counter
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import f1_score
import numpy as np
import copy
import csv

# configuration
# n_tag_posts: the first n_tag_posts rows of the loaded CSV are used to collect
# question tags (stack_posts[0:n_tag_posts]); it is also the start of the row
# range whose AnswerBody is scanned for technology terms.
n_tag_posts = 500
# n_answer_posts: exclusive end of the AnswerBody row range
# (stack_posts["AnswerBody"][n_tag_posts:n_answer_posts]).
n_answer_posts = 2000

In [None]:
# Read the query-result sample that lives next to the notebook
# (resolved against the current working directory).
filepath = os.path.join(os.getcwd(), 'QueryResults_sample.csv')
stack_posts = pd.read_csv(filepath, sep=",")
print("loaded csv data")

In [61]:
# Take the first n_tag_posts rows, keep a single row per question and
# order the result by question id.
df = (
    pd.DataFrame(stack_posts[0:n_tag_posts])
    .drop_duplicates(["QuestionId"])
    .sort_values(by=["QuestionId"])
)

# Flatten the per-question tag strings into one list of individual tags.
tag_list = []
for raw_tags in df["Tags"]:
    # '><' separates two tags: turn it into a space, then strip the
    # remaining leading '<' and trailing '>' occurrences
    spaced_tags = re.sub('><', ' ', raw_tags)
    bare_tags = re.sub('<|>', '', spaced_tags)
    tag_list.extend(bare_tags.split())

# 'machine-learning' is left out of the set because the sql export already
# filters for this term; the counter still counts every tag occurrence.
tag_set = {tag_name for tag_name in tag_list if tag_name != 'machine-learning'}
tag_Counter = Counter(tag_list)

print(tag_set)

{'supervised-learning', 'tic-tac-toe', 'apriori', 'algorithmic-trading', 'sampling', 'projection', 'cascade-classifier', 'signal-processing', 'ruby', 'computer-science', 'cryptography', 'fuzzy-search', 'mallet', 'statistics', 'kernel-density', 'gearman', 'azure-machine-learning-studio', 'r', 'treemodel', 'authentication', 'neural-network', 'tf-idf', 'reinforcement-learning', 'pos-tagger', 'dlib', 'black-box', 'word2vec', 'data-analysis', 'matrix', 'test-data', 'vgg-net', 'cuda', 'vectorization', 'deep-learning', 'pylearn', 'python-3.x', 'liblinear', 'metrics', 'distributed-computing', 'time-series', 'cvx', 'ensemble-learning', 'large-files', 'mahout', 'lstm', 'gnuplot', 'matplotlib', 'probability-density', 'python', 'confusion-matrix', 'precision-recall', 'security', 'api', 'pam', 'tokenize', 'bayesian', 'knn', 'audio', 'sift', 'object-recognition', 'text', 'scikit-learn', 'mysql', 'algorithm', 'kernel', 'cloudera', 'differentiation', 'e-commerce', 'genetic-programming', 'implementatio

In [62]:
# Clean the answer bodies and collect candidate technology terms.
#
# Each answer body is stripped of HTML markup, parsed with spaCy and run
# through a token Matcher; the longest non-overlapping matches of every
# answer end up in `word_set` as spaCy Span objects.
nlp = spacy.load("en_core_web_lg")
matcher = Matcher(nlp.vocab)

# one or more proper nouns, optionally followed by a number token
technology_pattern1 = [{'POS': 'PROPN', 'OP': '+'},
                       {'POS': 'NUM', 'OP': '?'}
                      ]

# proper noun(s), one or more '-' tokens, then verb(s)
technology_pattern2 = [{'OP': '+', 'POS': 'PROPN'},
                       {'TEXT': '-', 'OP': '+'},
                       {'POS': 'VERB', 'OP': '+'}
                      ]

# noun(s), an optional '-' token, then proper noun(s)
technology_pattern3 = [{'OP': '+', 'POS': 'NOUN'},
                       {'TEXT': '-', 'OP': '?'},
                       {'POS': 'PROPN', 'OP': '+'}
                      ]

# negated pattern for "machine learning"; defined but not registered
# with the matcher below
ml_pattern1 = [{'LOWER': 'machine', 'OP': '!'},
                       {'LOWER': 'learning', 'OP': '!'}
                      ]


word_set = set()

# Raw string literals: the pattern text is unchanged, but '\/', '\(' and '\)'
# are invalid escape sequences in a normal string literal and trigger a
# warning on current Python versions. '\n' is now resolved by the regex
# engine instead of the string parser and still matches a newline.
regex_pattern = r'(<(pre|code|blockquote|a|strike)(.|\n)*?\/(pre|code|blockquote|a|strike)>)*?|<(p|b|br|br(.|\n)*?\/|sub|sup|em|strong|hr|s|i|ol|ul|li|code)*?>|<\/(p|b|br|sub|sup|em|strong|s|i|ol|ul|li|div|pre|blockquote|a|code)>|<h(.|\n)*?>(.|\n)*?<\/h(.|\n)*?>*?|(<(img|div|ol|ul|li)(.|\n)*?\/*?>)|\n'

# compile once instead of handing the pattern strings to re.sub per answer
markup_regex = re.compile(regex_pattern, flags=re.I)
parenthesis_regex = re.compile(r'\(|\)', flags=re.I)

matcher.add("match_technology1", [technology_pattern1])
matcher.add("match_technology2", [technology_pattern2])
matcher.add("match_technology3", [technology_pattern3])

for text in stack_posts["AnswerBody"][n_tag_posts:n_answer_posts]:
    if not isinstance(text, str):
        # pandas represents an empty CSV field as NaN (a float); the regex
        # substitution below would raise TypeError on it, so skip the row
        continue

    text = markup_regex.sub('', text)
    text = parenthesis_regex.sub(' ', text)
    doc = nlp(text)

    matches = matcher(doc)
    # a set removes identical spans that several patterns produced
    match_set = {doc[start:end] for _match_id, start, end in matches}
    # filter_spans resolves overlapping matches before they are stored
    word_set.update(filter_spans(match_set))

print("finished")

finished


In [63]:
# Term similarity
#
# Compare every question tag with every matched span and keep the spans whose
# vector similarity to at least one tag reaches the threshold.
#   technology_list        - kept spans, first occurrence of each text only
#   technology_string_list - the texts of those spans, in the same order
#   technology_counter     - per text, number of (tag, span) pairs that passed
technology_list = []
technology_string_list = []
technology_counter = Counter()

min_similarity = 0.8

# set used for the "already collected" test; a membership test on the
# growing technology_string_list would rescan the list for every hit
seen_texts = set()

# Spans with a zero vector norm are left out up front: similarity() returns 0.0
# for them (below the threshold anyway) and emits a warning on each call.
# NOTE(review): presumably this is the warning recorded in this cell's output;
# the captured output is truncated, so confirm after a re-run.
comparable_spans = [span for span in word_set if span.vector_norm]

for tag in tag_set:
    tag_doc = nlp(str(tag))
    if not tag_doc.vector_norm:
        # no usable vector for this tag, nothing can reach the threshold
        continue

    for span in comparable_spans:
        if tag_doc.similarity(span) >= min_similarity:
            if span.text not in seen_texts:
                seen_texts.add(span.text)
                technology_list.append(span)
                technology_string_list.append(span.text)
            technology_counter[span.text] += 1

print("finished")

  if tag_doc.similarity(span) >= 0.8:


finished


In [75]:

# Export the collected technologies, one per CSV row.
# Mode 'w' truncates an existing file, so the file is opened exactly once
# for the whole list; newline='' lets the csv writer control line endings.
with open('technology_list.txt', 'w', newline='') as myfile:
    wr = csv.writer(myfile)
    for technology in technology_list:
        # csv.writer stringifies non-string values, so the entry is written as str(technology)
        wr.writerow([technology])

In [68]:
# Bare last expression: displays the collected technology spans as the cell output
technology_list

[Ruby,
 Statistics,
 Kernel Density Estimation,
 R,
 R.The R,
 r,
 Convolutional Neural Networks,
 Neural Networks,
 Neural Network,
 Convolutional Neural Network,
 modelsArtificial Neural Networks,
 Recurrent Neural Network,
 word2vec,
 Data,
 analysis Kibana,
 Analysis ofStructured Data,
 data scienceA,
 Singular Matrix,
 Confusion Matrix,
 Matrix,
 test-setModel fittingOverall accuracyBetter,
 CUDA,
 Deep Learning,
 Mahout,
 Python,
 Execute Python,
 0.My Python,
 Python API,
 bit Python,
 Information Security,
 System Security,
 API,
 URLGetRankedNamedEntities API,
 Java API,
 PAM,
 KNN,
 knn clusfier;normalize,
 SIFT,
 text categorization Largeron,
 Scikit-learn,
 findNeighbors - get,
 SQL,
 Algorithms,
 kernel SVC.If,
 Scala,
 scala,
 MapReduce,
 Node.js,
 PCA,
 example PCA,
 Decision Trees,
 Decision Tree,
 Hadoop,
 Unsupervised Deep Learning,
 Facebook,
 Web Searchand Data Mining,
 Data Mining,
 Gaussian,
 Nearest Neighbors,
 Nearest Neighbour,
 Regex,
 GLM,
 PHP,
 PhP SDK,
 El