In [1]:
import os
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
import matplotlib.pyplot as plt
import spacy 
from spacy.matcher import Matcher
import pandas as pd
import re
from collections import Counter
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import copy
from bokeh.plotting import figure, show
from bokeh.transform import jitter
import math
import csv

# configuration
n_tag_posts = 500
n_answer_posts = 2000

In [2]:
# Load csv data
filepath = os.path.join(os.getcwd(), 'QueryResults_sample_42000.csv')
stack_posts = pd.read_csv(filepath, sep = ",")

print("loaded csv data")

loaded csv data


In [3]:
stack_posts

Unnamed: 0,QuestionUserId,QuestionUserReputation,QuestionUserDN,Tags,QuestionId,QuestionScore,title,QuestionBody,AcceptedAnswer,AnswerUserId,AnswerUserReputation,AnswerUserDN,AnswerScore,AnswerId,AnswerBody
0,3625340,33,user3625340,<image-processing><machine-learning><svm><feat...,27729199,0,How to find Relevent Features for Comparing Di...,<p>Currently we are doing a project on diagram...,27733517.0,1056563,45835,StephenBoesch,0,27733517,<p>In regard solely to the difference in scale...
1,4409773,788,Avis,<java><machine-learning><svm><encog>,27729238,1,SVM using Encog in Java for beginners,<p>I am beginner in SVM. Could someone please ...,27808712.0,173355,3162,JeffHeaton,1,27808712,<p>In Encog SVM is just a classification or re...
2,4408281,715,datavinci,<python-2.7><machine-learning>,27730775,1,Why does not the following code snippet run su...,<p>I was reading Programming Collective Intell...,27730829.0,367273,436215,NPE,1,27730829,<blockquote>\n <p>NameError: global name 'lin...
3,3512217,119,Shlomi,<machine-learning><svm>,27730870,-1,division of two proper kernels,"<p>Let <img src=""https://i.stack.imgur.com/Z1G...",,1060350,70512,Has QUIT--Anony-Mousse,0,27742921,"<p>K2(x,z) can be 0.</p>\n\n<p>Then this value..."
4,4405757,14440,user7,<machine-learning><classification><weka><libsv...,27732503,0,One class SVM to detect outliers,<p>My problem is</p>\n\n<blockquote>\n <p>I w...,,1060350,70512,Has QUIT--Anony-Mousse,5,27739848,<p>Your data is not formatted appropriately fo...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42539,15787388,9,Max,<python><machine-learning><deep-learning><pred...,67347475,0,Making Predictions Based on 2 Sets of Data in ...,"<p>Imagine you have two sources. For example, ...",,9346942,21,Parsa Abbasi,0,67350681,"<p>It's kind of a time series problem, therefo..."
42540,13853726,3,Azzam Radman,<python><machine-learning><random-forest>,67350118,-1,GPU for Random Forest Regressor,<p>I am still new to Machine Learning and have...,67350263.0,14333379,40,Al_P,0,67350263,<p>If you use sklearn random forest implementa...
42541,13248351,47,gumecf,<python><tensorflow><machine-learning><artific...,67351628,1,loading data into X_train and Y_train,<p>If this is the organisation of my data how ...,67351909.0,9984384,54,Orbital,1,67351909,"<p>You can use a <a href=""https://www.tensorfl..."
42542,15763070,13,Lburris12,<tensorflow><machine-learning><keras><shapes><...,67351796,0,Keras incompatible shapes NN,<p>So I have this neural network and I am feed...,67351849.0,9215780,6990,M.Innat,0,67351849,<p>Your last layer uses <code>linear</code> ac...


In [4]:
# cleaning functions

# clean all angle brackets from string
def clean_tags(tags):    
    tag_list = []
    # clean tags from '>' and '<' occurences
    tags = re.sub('><', ' ', tags) 
    tags = re.sub('<|>', '', tags)
    # add single tag of tags and add it to lists and sets
    for tag in tags.split():
        if tag != 'machine-learning':
            tag_list.append(tag)
    return tag_list



In [5]:
n_posts = 42500

# drop all duplicates in posts
df_w_dupes = pd.DataFrame(stack_posts[0:n_posts])
df_wo_dupes = df_w_dupes.drop_duplicates(["QuestionId"])
df_sorted_wo_dupes = df_wo_dupes.sort_values(by=["QuestionId"])

In [6]:
# extact user stats (reputation in question/answers, question-/answers counts, amount of accepted answers etc.)
users = dict()

def create_user_stats(id, display_name):
    # add user if userid does not exist
    if id not in users:
        users[id] = dict()    
        users[id]["display_name"] = display_name
        
# customize stats which every user has
def customize_user_base_stats(id, score, post_type):    
    users[id][post_type + "_reputation"] = users[id].get(post_type + "_reputation", 0) + score * 10   
    users[id][post_type + '_count'] = users[id].get(post_type + '_count', 0) + 1


# iterate over dataframe tuples
for i, post in enumerate(df_w_dupes.itertuples()):  
    # create DataFrame from pandas.core.frame so column names can be used instead of indexes
    post = pd.DataFrame(post).transpose().drop(0, axis=1)
    post.columns = stack_posts.columns
    
    # customize question user stats if question is not a duplicate of previous question
    if post["QuestionId"][0] != df_w_dupes.iloc[i - 1]["QuestionId"]:
        question_user_id = post["QuestionUserId"][0]
        create_user_stats(question_user_id, post["QuestionUserDN"][0])
        
        # customize question user stats        
        customize_user_base_stats(question_user_id,post["QuestionScore"][0], "question")
        cleaned_tags = clean_tags(post["Tags"][0])
        users[question_user_id]["question_tags"] = set(users[question_user_id].get("question_tags", set()).union(cleaned_tags))
                    
                
    # create answer user stats
    answer_user_id = post["AnswerUserId"][0]
    create_user_stats(answer_user_id, post["AnswerUserDN"][0])
        
    customize_user_base_stats(answer_user_id,post["AnswerScore"][0], "answer")
    if post["AcceptedAnswer"][0] == post["AnswerId"][0]:
        users[answer_user_id]["accepted_answer_count"] = users[answer_user_id].get("accepted_answer_count", 0) + 1
        users[answer_user_id]["answer_reputation"] = users[answer_user_id].get("answer_reputation", 0) + 15
    
    
        

In [None]:
# read technology list
technology_set = set()
with open('technology_list.txt', 'r', newline='') as myfile:        
    reader = csv.reader(myfile)
    for row in reader:        
        technology_set.add(row[0].lower())

print("import successful")


In [7]:
# clean posts and match words
nlp = spacy.load("en_core_web_lg")
matcher = Matcher(nlp.vocab)

technology_pattern1 = [{'POS': 'PROPN', 'OP': '+'},
                       {'POS': 'NUM', 'OP': '?'}
                      ]

technology_pattern2 = [{'OP': '+', 'POS': 'PROPN'},
                       {'TEXT': '-', 'OP': '+'},
                       {'POS': 'VERB', 'OP': '+'}
                      ]

technology_pattern3 = [{'OP': '+', 'POS': 'NOUN'},
                       {'TEXT': '-', 'OP': '?'},
                       {'POS': 'PROPN', 'OP': '+'}
                      ]

ml_pattern1 = [{'LOWER': 'machine', 'OP': '!'},
                       #{'TEXT': '-', 'OP': '!'},
                       {'LOWER': 'learning', 'OP': '!'}
                      ]


word_set = set()
regex_pattern = '(<(pre|code|blockquote|a|strike)(.|\n)*?\/(pre|code|blockquote|a|strike)>)*?|<(p|b|br|br(.|\n)*?\/|sub|sup|em|strong|hr|s|i|ol|ul|li|code)*?>|<\/(p|b|br|sub|sup|em|strong|s|i|ol|ul|li|div|pre|blockquote|a|code)>|<h(.|\n)*?>(.|\n)*?<\/h(.|\n)*?>*?|(<(img|div|ol|ul|li)(.|\n)*?\/*?>)|\n'

matcher.add("match_technology1", [technology_pattern1])
matcher.add("match_technology2", [technology_pattern2])
matcher.add("match_technology3", [technology_pattern3])
#matcher.add("unmatch_ml_pattern", [ml_pattern1])

for text, id in zip(df_w_dupes[["AnswerBody","AnswerUserId"]][n_tag_posts:n_answer_posts]):
    text = re.sub(regex_pattern, '', text, flags=re.I)
    text = re.sub('\(|\)', ' ', text, flags=re.I)    
    doc = nlp(text)    
    
    matches = matcher(doc)    
    match_set = set()
    users[id]["topic_counter"] = Counter()
    for match_id, start, end in matches:
        #match_set.add(doc[start:end])
        users[id]["topic_counter"][doc[start:end].text] = users[id]["topic_counter"][doc[start:end].text] + 1
    
    
    #[word_set.add(filtered_span) for filtered_span in filter_spans(match_set)]
    
print("finished")

ValueError: too many values to unpack (expected 2)

In [24]:
user_df = pd.DataFrame(users).transpose()
sorted_user_df = user_df.sort_values(by=['answer_reputation'], ascending=False)
#sorted_user_df = sorted_user_df[0:100]

In [29]:
sorted_user_df[0:20]

Unnamed: 0,display_name,question_reputation,question_count,question_tags,answer_reputation,answer_count,accepted_answer_count
4685471,desertnaut,10.0,1.0,"{shap, scikit-learn}",31545,577,457.0
2658050,lejlot,,,,24935,617,361.0
5974433,Marcin Możejko,70.0,3.0,"{recurrent-neural-network, cntk, scikit-learn,...",22255,238,159.0
712995,Maxim,40.0,1.0,"{python, deep-learning, neural-network, tensor...",20900,378,244.0
1714410,Shai,1020.0,9.0,"{python, neural-network, normalization, batch-...",16575,309,195.0
2099607,today,,,,14030,318,250.0
3374996,Vivek Kumar,,,,9155,182,129.0
5545260,dga,,,,8435,14,9.0
2097240,Daniel Möller,,,,8215,186,105.0
562769,Martin Thoma,480.0,6.0,"{reproducible-research, neural-network, numpy,...",7810,92,32.0


In [None]:
# top Users
top_answer_users = sorted_user_df[0:20]
indexes = stack_posts[top_answer_users.index in stack_posts["AnswerUserId"]].index

#top_answer_users_posts = stack_posts.drop(top_answer_users.index[:], columns="QuestionUserId") #[stack_posts.AnswerUserId in top_answer_users.index]
indexes


In [31]:
# top users diagram
p = figure(plot_width=500, plot_height=500)
p.hex_dot(top_users["answer_count"], sorted_user_df["answer_reputation"], size=10, fill_color="#fcc203", fill_alpha=0.5)
show(p)

In [33]:
top_users

Unnamed: 0,display_name,question_reputation,question_count,question_tags,answer_reputation,answer_count,accepted_answer_count
4685471,desertnaut,10.0,1.0,"{shap, scikit-learn}",31545,577,457.0
2658050,lejlot,,,,24935,617,361.0
5974433,Marcin Możejko,70.0,3.0,"{recurrent-neural-network, cntk, scikit-learn,...",22255,238,159.0
712995,Maxim,40.0,1.0,"{python, deep-learning, neural-network, tensor...",20900,378,244.0
1714410,Shai,1020.0,9.0,"{python, neural-network, normalization, batch-...",16575,309,195.0
2099607,today,,,,14030,318,250.0
3374996,Vivek Kumar,,,,9155,182,129.0
5545260,dga,,,,8435,14,9.0
2097240,Daniel Möller,,,,8215,186,105.0
562769,Martin Thoma,480.0,6.0,"{reproducible-research, neural-network, numpy,...",7810,92,32.0
