In [12]:
import os
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
import matplotlib.pyplot as plt
import spacy 
from spacy.matcher import Matcher, PhraseMatcher
import pandas as pd
import re
from collections import Counter
import sklearn
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from bokeh.plotting import figure, show
from bokeh.transform import jitter
from bokeh.io import curdoc
import csv
import math

# configuration
n_tag_posts = 500
n_answer_posts = 42636

In [2]:
# Load csv data
filepath = os.path.join(os.getcwd(), 'QueryResults_sample_42636_14_05_21.csv')
stack_posts = pd.read_csv(filepath, sep = ",")

print("loaded csv data")

loaded csv data


In [4]:
stack_posts

Unnamed: 0,QuestionUserId,QuestionUserReputation,QuestionUserDN,Tags,QuestionId,QuestionScore,title,QuestionBody,QuestionDate,AcceptedAnswer,AnswerUserId,AnswerUserReputation,AnswerUserDN,AnswerScore,AnswerId,AnswerBody,AnswerDate
0,3625340,33,user3625340,<image-processing><machine-learning><svm><feat...,27729199,0,How to find Relevent Features for Comparing Di...,<p>Currently we are doing a project on diagram...,2015-01-01 08:03:13,27733517.0,1056563,45925,StephenBoesch,0,27733517,<p>In regard solely to the difference in scale...,2015-01-01 18:39:02
1,4409773,788,Avis,<java><machine-learning><svm><encog>,27729238,1,SVM using Encog in Java for beginners,<p>I am beginner in SVM. Could someone please ...,2015-01-01 08:10:29,27808712.0,173355,3162,JeffHeaton,1,27808712,<p>In Encog SVM is just a classification or re...,2015-01-06 22:58:03
2,4408281,715,datavinci,<python-2.7><machine-learning>,27730775,1,Why does not the following code snippet run su...,<p>I was reading Programming Collective Intell...,2015-01-01 12:22:49,27730829.0,367273,436785,NPE,1,27730829,<blockquote>\n <p>NameError: global name 'lin...,2015-01-01 12:32:19
3,3512217,119,Shlomi,<machine-learning><svm>,27730870,-1,division of two proper kernels,"<p>Let <img src=""https://i.stack.imgur.com/Z1G...",2015-01-01 12:37:59,,1060350,70610,Has QUIT--Anony-Mousse,0,27742921,"<p>K2(x,z) can be 0.</p>\n\n<p>Then this value...",2015-01-02 13:22:27
4,4405757,14440,user7,<machine-learning><classification><weka><libsv...,27732503,0,One class SVM to detect outliers,<p>My problem is</p>\n\n<blockquote>\n <p>I w...,2015-01-01 16:26:06,,1060350,70610,Has QUIT--Anony-Mousse,5,27739848,<p>Your data is not formatted appropriately fo...,2015-01-02 09:20:50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42631,14926602,21,Waqar Kaleem Khan,<tensorflow><machine-learning><keras><deep-lea...,67441958,0,LSTM input layer shape in Keras using function...,<p>I am trying to implement LSTMs on drug data...,2021-05-07 21:38:47,,5927701,2972,data_person,1,67442769,<p>Try adding an <code>Embedding Layer</code> ...,2021-05-07 23:36:34
42632,3973175,4261,con,<python><python-3.x><machine-learning><shap>,67443411,2,How to get feature names of shap_values from T...,"<p>I am doing a shap tutorial, and attempting ...",2021-05-08 01:56:51,67444552.0,3954379,5202,Lucas,2,67444552,"<p>The features are indeed in the same order, ...",2021-05-08 06:02:39
42633,15847767,1,Pearl,<python><machine-learning><keras><neural-network>,67443744,-1,"What does hidden_layer = layers.Dense(100, act...","<p>I saw the following in <a href=""https://www...",2021-05-08 03:14:14,,15166370,45,Vibhav Surve,0,67443785,<p>See input layer is nothing but how many neu...,2021-05-08 03:22:03
42634,14551464,61,Onkar Chougule,<machine-learning><deep-learning><pytorch><con...,67449276,0,Initalize using previous .pth and train for fu...,<p>How do I initialize a UNet model from its p...,2021-05-08 15:36:01,,14551464,61,Onkar Chougule,0,67451326,<pre><code>state= torch.load(&quot;/content/mo...,2021-05-08 19:21:01


In [3]:
# cleaning functions

# clean all angle brackets from string
def clean_tags(tags):    
    tag_list = []
    # clean tags from '>' and '<' occurences
    tags = re.sub('><', ' ', tags) 
    tags = re.sub('<|>', '', tags)
    # add single tag of tags and add it to lists and sets
    for tag in tags.split():
        if tag != 'machine-learning':
            tag_list.append(tag)
    return tag_list



In [4]:
# drop all duplicates in posts
df_w_dupes = pd.DataFrame(stack_posts[0:n_answer_posts])
df_wo_dupes = df_w_dupes.drop_duplicates(["QuestionId"])
df_sorted_wo_dupes = df_wo_dupes.sort_values(by=["QuestionId"])

In [5]:
# extact user stats (reputation in question/answers, question-/answers counts, amount of accepted answers etc.)
users = dict()

def create_user_stats(id, display_name):
    # add user if userid does not exist
    if id not in users:
        users[id] = dict()    
        users[id]["display_name"] = display_name
        
# customize stats which every user has
def customize_user_base_stats(id, score, post_type):    
    users[id][post_type + "_reputation"] = users[id].get(post_type + "_reputation", 0) + score * 10   
    users[id][post_type + '_count'] = users[id].get(post_type + '_count', 0) + 1


# iterate over dataframe tuples
for i, post in enumerate(df_w_dupes.itertuples()):  
    # create DataFrame from pandas.core.frame so column names can be used instead of indexes
    post = pd.DataFrame(post).transpose().drop(0, axis=1)
    post.columns = stack_posts.columns
    
    # customize question user stats if question is not a duplicate of previous question
    if post["QuestionId"][0] != df_w_dupes.iloc[i - 1]["QuestionId"]:
        question_user_id = post["QuestionUserId"][0]
        create_user_stats(question_user_id, post["QuestionUserDN"][0])
        
        # customize question user stats        
        customize_user_base_stats(question_user_id,post["QuestionScore"][0], "question")
        cleaned_tags = clean_tags(post["Tags"][0])
        users[question_user_id]["question_tags"] = set(users[question_user_id].get("question_tags", set()).union(cleaned_tags))
                    
                
    # create answer user stats
    answer_user_id = post["AnswerUserId"][0]
    create_user_stats(answer_user_id, post["AnswerUserDN"][0])
        
    customize_user_base_stats(answer_user_id,post["AnswerScore"][0], "answer")
    users[answer_user_id]["answer_tags"] = set(users[answer_user_id].get("answer_tags", set()).union(cleaned_tags))
    if post["AcceptedAnswer"][0] == post["AnswerId"][0]:
        users[answer_user_id]["accepted_answer_count"] = users[answer_user_id].get("accepted_answer_count", 0) + 1
        users[answer_user_id]["answer_reputation"] = users[answer_user_id].get("answer_reputation", 0) + 15        

    
        

In [8]:
# read technology list
technology_set = set()
with open('technology_list_customized.txt', 'r', newline='') as myfile:        
    reader = csv.reader(myfile)
    for row in reader:        
        technology_set.add(row[0].lower())

print("import successful")


IndexError: list index out of range

In [6]:
user_df = pd.DataFrame(users).transpose()
sorted_user_df = user_df.sort_values(by=['answer_reputation'], ascending=False)
#sorted_user_df = sorted_user_df[0:100]

In [8]:
sorted_user_df[0:20]

Unnamed: 0,display_name,question_reputation,question_count,question_tags,answer_reputation,answer_count,answer_tags,accepted_answer_count
4685471,desertnaut,10.0,1.0,"{scikit-learn, shap}",31680,579,"{unsupervised-learning, tensorflow-datasets, l...",458.0
2658050,lejlot,,,,25015,618,"{prediction, object, reinforcement-learning, s...",363.0
5974433,Marcin Możejko,70.0,3.0,"{grid-search, nlp, keras, neural-network, cntk...",22335,238,"{unsupervised-learning, python-3.x, image, num...",159.0
712995,Maxim,40.0,1.0,"{deep-learning, tensorflow, neural-network, py...",20950,378,"{unsupervised-learning, machine-translation, r...",244.0
1714410,Shai,1020.0,9.0,"{normalization, computer-vision, neural-networ...",16625,309,"{unsupervised-learning, python-3.x, semantic-s...",195.0
2099607,today,,,,14080,318,"{unsupervised-learning, python-3.x, semantic-s...",250.0
3374996,Vivek Kumar,,,,9175,182,"{python-3.x, unsupervised-learning, numpy, pip...",129.0
5545260,dga,,,,8445,14,"{computer-vision, python-2.7, image-recognitio...",9.0
2097240,Daniel Möller,,,,8265,186,"{python-3.x, numpy, image-processing, batch-no...",105.0
562769,Martin Thoma,490.0,6.0,"{python-3.x, keras, numpy, image-processing, s...",7830,92,"{python-3.x, image, reinforcement-learning, pr...",32.0


In [9]:
# select posts from top users
top_answer_users = sorted_user_df[0:20]
top_users_posts = stack_posts[stack_posts["AnswerUserId"].isin(top_answer_users.index)]

In [10]:
# clean answers
regex_pattern = '(<(pre|code|blockquote|a|strike)(.|\n)*?\/(pre|code|blockquote|a|strike)>)*?|<(p|b|br|br(.|\n)*?\/|sub|sup|em|strong|hr|s|i|ol|ul|li|code)*?>|<\/(p|b|br|sub|sup|em|strong|s|i|ol|ul|li|div|pre|blockquote|a|code)>|<h(.|\n)*?>(.|\n)*?<\/h(.|\n)*?>*?|(<(img|div|ol|ul|li)(.|\n)*?\/*?>)|\n'
def clean_bodys(text):
    text = re.sub(regex_pattern, '', text, flags=re.I)
    text = re.sub('\(|\)', ' ', text, flags=re.I)
    return text

In [13]:
# check for technologies
nlp = spacy.load("en_core_web_lg")
matcher = PhraseMatcher(nlp.vocab)

technology_pattern = [nlp(text) for text in technology_set]
matcher.add("TECHNOLOGIES", technology_pattern)


for answer in top_users_posts["AnswerBody"][1:2]:
    cleaned_text= clean_bodys(answer)
    
    doc = nlp(cleaned_text)
    print(doc)    
    matches = matcher(doc)
    print(matches)
    break


Your data is not formatted appropriately for this problem.If you putpairs into a SVM, what you are really putting into the SVM are sparse vectors that consist of a single one, corresponding to your word, i.e.Anything a classifier can do on such data is overfit and memorize. On unknown new words, the result will be useless.If you want your classifier to be able to abstract and generalize, then you need to carefully extract features from your words.Possible features would be n-grams. So the word "example" could be represented asNow your classifier/SVM could learn that having the n-gram "ple" is typical for nouns.Results will likely be better if you add "beginning-of-word" and "end-of-word" symbol,and maybe also use more than one n-gram length, e.g.but of course, the more you add the larger your data set and search space grows, which again may lead to overfitting.
[(11244915669818549327, 27, 28)]


In [53]:
# top users diagram
p = figure(plot_width=500, plot_height=500)

curdoc().theme = 'light_minimal'
p.circle_dot(top_answer_users["answer_count"], top_answer_users["answer_reputation"], size=15, fill_color="#348abd", fill_alpha=0.4, line_color="black")
p.xaxis.axis_label = 'Anzahl Posts'
p.yaxis.axis_label = 'Reputation'
show(p)

# Antworten - Akzeptierte Antworten

In [54]:
p = figure(plot_width=500, plot_height=500)

curdoc().theme = 'light_minimal'
p.circle(top_answer_users["answer_count"], top_answer_users["accepted_answer_count"], size=15, fill_color="#348abd", fill_alpha=0.4, line_color="black")
p.xaxis.axis_label = 'Anzahl Antworten'
p.yaxis.axis_label = 'Anzahl akzeptierte Anwtorten'
show(p)

# Metrik - Antworten/akzeptierte Antworten
Rate an akzeptierten Antworten auf Grundlagen von Gesamtanzahl der Antworten unter Experten

In [64]:
total_answers = 0

for answer_amount in top_answer_users["answer_count"]:
    total_answers += answer_amount

total_acc_answers = 0
for acc_answer_amount in top_answer_users["accepted_answer_count"]:
    if math.isnan(acc_answer_amount):
        continue
    total_acc_answers += acc_answer_amount

acc_answer_answer_ratios = []

for user in top_answer_users.itertuples():
    acc_answer_answer_ratio = user[8]/user[6]
    acc_answer_answer_ratios.append(acc_answer_answer_ratio)

acc_answer_answer_ratios_df = pd.DataFrame(acc_answer_answer_ratios, index=top_answer_users["display_name"])
acc_answer_answer_ratios_df.columns = ["Anzahl akzeptierte Antworten/Antwortanzahl"]
acc_answer_answer_ratios_df.fillna(0, inplace=True)
acc_answer_answer_ratios_df = acc_answer_answer_ratios_df.sort_values(by=["Anzahl akzeptierte Antworten/Antwortanzahl"], ascending=False)
pd.options.display.float_format = "{:,.3f}".format

acc_answer_answer_ratio = total_acc_answers/total_answers
print(acc_answer_answer_ratio)
display(acc_answer_answer_ratios_df)


0.5920076573342905


Unnamed: 0_level_0,Anzahl akzeptierte Antworten/Antwortanzahl
display_name,Unnamed: 1_level_1
desertnaut,0.791
today,0.786
Vivek Kumar,0.709
Marcin Możejko,0.668
mrry,0.646
Maxim,0.646
dga,0.643
Shai,0.631
MaxU,0.588
lejlot,0.587


# Metrik - Reputation pro Antwort

In [61]:
rep_per_answer_list = []
for user in top_answer_users.itertuples():
    rep_per_answer = user[5]/user[6]
    rep_per_answer_list.append(rep_per_answer)
    
rep_per_answer_df = pd.DataFrame(rep_per_answer_list, index=top_answer_users["display_name"])
rep_per_answer_df.columns = ["Reputation/Antwort"]
rep_per_answer_df.fillna(0, inplace=True)
pd.options.display.float_format = "{:,.3f}".format
rep_per_answer_df = rep_per_answer_df.sort_values(by=["Reputation/Antwort"], ascending=False)
display(rep_per_answer_df)

rep_per_answer_df["Reputation/Antwort"].sum()/20


Unnamed: 0_level_0,Reputation/Antwort
display_name,Unnamed: 1_level_1
runhani,4840.0
dga,603.214
Salvador Dali,188.857
stackoverflowuser2010,164.022
Marcin Możejko,93.845
mrry,92.215
Martin Thoma,85.109
MaxU,61.438
Maxim,55.423
desertnaut,54.715


332.05791736784363

# Anzahl Tag pro Experte

In [57]:
tags_per_user = []
for tags in top_answer_users["answer_tags"]:
    tags_per_user.append(len(tags))

tags_per_user_df = pd.DataFrame(tags_per_user, index=top_answer_users["display_name"])
tags_per_user_df.columns = ["Anzahl Tags"]
tags_per_user_df.fillna(0, inplace=True)
pd.options.display.float_format = "{:,.3f}".format
display(tags_per_user_df)

Unnamed: 0_level_0,Anzahl Tags
display_name,Unnamed: 1_level_1
desertnaut,190
lejlot,256
Marcin Możejko,89
Maxim,174
Shai,83
today,102
Vivek Kumar,99
dga,15
Daniel Möller,67
Martin Thoma,75


In [None]:
answer_tags = top_answer_users["answer_tags"]
for tag in answer_tags:
    print(tag)

In [66]:
top_users = sorted_user_df[0:20]
sorted_user_tag_df = pd.DataFrame(top_users, columns=["answer_tags"], index=top_users.index)
sorted_user_tag_df["user"] = top_users.index
sorted_user_tag_df.fillna("0", inplace=True)

# Metrik Reputation in "machine-learning"/Gesamtreputation

In [138]:
top_users_total_rep = top_users_posts.drop_duplicates(subset=["AnswerUserId"])
top_users_total_rep
expert_reputations = []
for expert in top_users.itertuples():
    
    user_reputation = top_users_total_rep.loc[top_users_total_rep['AnswerUserId'] == expert[0]]["AnswerUserReputation"].values[0]          
    display_name = expert[1]
    calculated_rep = expert[5]                    
    #expert_reputations.append([display_name, user_reputation, calculated_rep])
    expert_reputations.append(calculated_rep/user_reputation)
    
expert_reputations_df = pd.DataFrame(expert_reputations)
expert_reputations_df.columns = ["display_name", "user_reputation", "calculated_rep"]



Unnamed: 0,display_name,user_reputation,calculated_rep
0,desertnaut,45566,31680
1,lejlot,56698,25015
2,Marcin Możejko,34861,22335
3,Maxim,47758,20950
4,Shai,92508,16625
5,today,26890,14080
6,Vivek Kumar,28954,9175
7,dga,20741,8445
8,Daniel Möller,74002,8265
9,Martin Thoma,90914,7830


In [132]:
expert_reputations_df

# Tag Similarity

In [79]:
user_tag_list = []
for i,tags in enumerate(sorted_user_tag_df["answer_tags"]):
        
    if tags != "0":    
        for tag in tags:        
            user_tag = [sorted_user_tag_df.iloc[i]["user"], tag]
            user_tag_list.append(user_tag)
        


user_tag_list_df = pd.DataFrame(user_tag_list, columns=["userid", "tag"])
user_tag_list_df["count"] = 1

user_tags_matrix = user_tag_list_df.pivot(index="userid", columns="tag", values="count")
user_tags_matrix.fillna(0, inplace=True)
user_tag_similarity = cosine_similarity(user_tags_matrix)

sim_df = pd.DataFrame(user_tag_similarity, index=user_tags_matrix.index, columns=user_tags_matrix.index)
display(user_tags_matrix)
display(sim_df)

tag,.net,accord.net,activation-function,adaboost,adam,algorithm,amazon-sagemaker,anaconda,analysis,analytics,...,word-embedding,word2vec,xgbclassifier,xgboost,xls,xor,yellowbrick,yelp,yolo,zero-padding
userid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
349130,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
562769,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
712995,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
1060350,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1090562,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1714410,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2097240,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2099607,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2658050,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3374996,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


userid,349130,562769,712995,1060350,1090562,1714410,2097240,2099607,2658050,3374996,3574081,4561314,4685471,4785185,5025009,5545260,5741205,5974433,6730309,10908375
userid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
349130,1.0,0.431,0.361,0.233,0.296,0.398,0.443,0.404,0.326,0.296,0.376,0.397,0.312,0.38,0.282,0.292,0.303,0.444,0.113,0.424
562769,0.431,1.0,0.376,0.336,0.322,0.368,0.324,0.377,0.39,0.406,0.347,0.405,0.377,0.422,0.37,0.268,0.376,0.392,0.115,0.388
712995,0.361,0.376,1.0,0.296,0.33,0.349,0.37,0.435,0.403,0.358,0.324,0.317,0.352,0.418,0.279,0.215,0.3,0.41,0.114,0.323
1060350,0.233,0.336,0.296,1.0,0.173,0.202,0.191,0.204,0.416,0.392,0.179,0.296,0.375,0.433,0.362,0.165,0.338,0.263,0.071,0.21
1090562,0.296,0.322,0.33,0.173,1.0,0.363,0.319,0.31,0.294,0.297,0.385,0.329,0.29,0.285,0.289,0.405,0.304,0.351,0.261,0.315
1714410,0.398,0.368,0.349,0.202,0.363,1.0,0.469,0.402,0.322,0.243,0.33,0.311,0.295,0.385,0.261,0.312,0.23,0.384,0.165,0.397
2097240,0.443,0.324,0.37,0.191,0.319,0.469,1.0,0.508,0.305,0.307,0.386,0.346,0.355,0.319,0.203,0.347,0.256,0.505,0.183,0.41
2099607,0.404,0.377,0.435,0.204,0.31,0.402,0.508,1.0,0.322,0.308,0.329,0.347,0.417,0.369,0.27,0.23,0.299,0.462,0.198,0.422
2658050,0.326,0.39,0.403,0.416,0.294,0.322,0.305,0.322,1.0,0.408,0.227,0.346,0.44,0.508,0.349,0.21,0.356,0.431,0.094,0.282
3374996,0.296,0.406,0.358,0.392,0.297,0.243,0.307,0.308,0.408,1.0,0.27,0.366,0.489,0.442,0.525,0.234,0.479,0.362,0.101,0.324


In [80]:
threshold = 0.3

tag_similarity_filtered = sim_df
tag_similarity_filtered.values[tag_similarity_filtered <= threshold] = 0
tag_similarity_filtered

userid,349130,562769,712995,1060350,1090562,1714410,2097240,2099607,2658050,3374996,3574081,4561314,4685471,4785185,5025009,5545260,5741205,5974433,6730309,10908375
userid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
349130,1.0,0.431,0.361,0.0,0.0,0.398,0.443,0.404,0.326,0.0,0.376,0.397,0.312,0.38,0.0,0.0,0.303,0.444,0.0,0.424
562769,0.431,1.0,0.376,0.336,0.322,0.368,0.324,0.377,0.39,0.406,0.347,0.405,0.377,0.422,0.37,0.0,0.376,0.392,0.0,0.388
712995,0.361,0.376,1.0,0.0,0.33,0.349,0.37,0.435,0.403,0.358,0.324,0.317,0.352,0.418,0.0,0.0,0.0,0.41,0.0,0.323
1060350,0.0,0.336,0.0,1.0,0.0,0.0,0.0,0.0,0.416,0.392,0.0,0.0,0.375,0.433,0.362,0.0,0.338,0.0,0.0,0.0
1090562,0.0,0.322,0.33,0.0,1.0,0.363,0.319,0.31,0.0,0.0,0.385,0.329,0.0,0.0,0.0,0.405,0.304,0.351,0.0,0.315
1714410,0.398,0.368,0.349,0.0,0.363,1.0,0.469,0.402,0.322,0.0,0.33,0.311,0.0,0.385,0.0,0.312,0.0,0.384,0.0,0.397
2097240,0.443,0.324,0.37,0.0,0.319,0.469,1.0,0.508,0.305,0.307,0.386,0.346,0.355,0.319,0.0,0.347,0.0,0.505,0.0,0.41
2099607,0.404,0.377,0.435,0.0,0.31,0.402,0.508,1.0,0.322,0.308,0.329,0.347,0.417,0.369,0.0,0.0,0.0,0.462,0.0,0.422
2658050,0.326,0.39,0.403,0.416,0.0,0.322,0.305,0.322,1.0,0.408,0.0,0.346,0.44,0.508,0.349,0.0,0.356,0.431,0.0,0.0
3374996,0.0,0.406,0.358,0.392,0.0,0.0,0.307,0.308,0.408,1.0,0.0,0.366,0.489,0.442,0.525,0.0,0.479,0.362,0.0,0.324


# Same tags in all taglists

In [86]:
intersect_set = sorted_user_tag_df.iloc[0]["answer_tags"]
#print(intersect_set)
for i,tags in enumerate(sorted_user_tag_df["answer_tags"][0:20]):
    intersect_set = intersect_set.intersection(tags)
intersect_set

{'deep-learning'}

# Count tags

In [85]:
tag_counter = Counter()
for i,tags in enumerate(sorted_user_tag_df["answer_tags"][0:20]):
    for tag in tags:
        tag_counter[tag] += 1

tag_counter = Counter({k: c for k,c in sorted(tag_counter.items(), key=lambda item: item[1], reverse=True)})

tag_counter

Counter({'deep-learning': 20,
         'neural-network': 19,
         'python': 19,
         'numpy': 18,
         'tensorflow': 18,
         'computer-vision': 17,
         'python-3.x': 17,
         'keras': 17,
         'scikit-learn': 16,
         'nlp': 16,
         'conv-neural-network': 16,
         'classification': 15,
         'artificial-intelligence': 15,
         'data-science': 14,
         'regression': 14,
         'recurrent-neural-network': 14,
         'pandas': 14,
         'lstm': 14,
         'image-processing': 13,
         'one-hot-encoding': 13,
         'mnist': 12,
         'svm': 12,
         'feature-extraction': 12,
         'text-classification': 12,
         'linear-regression': 12,
         'logistic-regression': 12,
         'cross-validation': 12,
         'optimization': 12,
         'training-data': 11,
         'unsupervised-learning': 10,
         'backpropagation': 10,
         'arrays': 10,
         'cluster-analysis': 10,
         'naivebayes':

# Gesamtanzahl der Tags

In [76]:
len(tag_counter)

689

# Textähnlichkeiten der Experten

In [None]:
doc_dict = {}

for i in range(0, len(top_users_posts)):        
    doc_dict[top_users_posts.iloc[i]["AnswerUserId"]] = doc_dict.get(top_users_posts.iloc[i]["AnswerUserId"], '') + ' ' + clean_bodys(top_users_posts.iloc[i]["AnswerBody"])


In [73]:
doc_df = pd.DataFrame.from_dict(doc_dict, orient="index", columns=["Doc"])

Unnamed: 0,Doc
1060350,"K2 x,z can be 0.Then this value is not well-..."
2658050,"In short yes, you should include each class. ..."
4561314,Here are the original input variables:A is a ...
1714410,If you got from git you should find in fold...
562769,Value iteration is used when you have transit...
4685471,"In documentation, it is mentioned: What you ..."
5025009,I see 3 possible ways to solve this:1 try to...
5974433,To understand how backpropagation is even pos...
4785185,You have already split on weather and gender....
1090562,Majority of machine learning algorithms work ...


In [92]:
for post in doc_df.itertuples():
    print(len(post[1]))

146651
457832
42612
145811
54218
478490
27474
144713
220743
17415
9961
40127
242731
37994
58081
15306
114026
100284
2030
196788


In [None]:
i = 0
col = 0

doc_sim_matrix = []

while i < len(doc_df) - 1:
    print(i)
    doc1 = nlp(doc_df.iloc[i]["Doc"])
    doc_sim_vec = []
    
    j = col + 1
    while j < len(doc_df):
        print(j)
        doc2 = nlp(doc_df.iloc[j]["Doc"])
        sim = doc1.similarity(doc2)
        doc_sim_vec.append(sim)                
        j += 1
    doc_sim_matrix.append(doc_sim_vec)
    col += 1
    i += 1
    print('---')
doc_sim_matrix

In [77]:
len(doc_df)

20

In [90]:
pd.DataFrame(doc_sim_matrix)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,0.998271,0.992634,0.993036,0.99699,0.994728,0.989189,0.993858,0.996933,0.995932,0.994589,0.989918,0.996549,0.994795,0.995843,0.985585,0.995958,0.993855,0.766346,0.991891
1,0.994438,0.993793,0.996806,0.996435,0.989719,0.995312,0.997449,0.995539,0.994862,0.991263,0.997294,0.994282,0.996877,0.986199,0.996003,0.993757,0.76981,0.993845,
2,0.997096,0.994884,0.993524,0.994794,0.99439,0.996388,0.991792,0.99276,0.996216,0.996506,0.993954,0.997076,0.98879,0.994016,0.99484,0.779966,0.997224,,
3,0.995506,0.992427,0.994934,0.99555,0.995713,0.99345,0.993611,0.996629,0.995825,0.996234,0.997647,0.991585,0.996316,0.996049,0.776746,0.997984,,,
4,0.995417,0.994614,0.996582,0.998064,0.99719,0.995996,0.991776,0.997432,0.997465,0.996675,0.99004,0.998123,0.996326,0.773945,0.994609,,,,
5,0.991593,0.993703,0.997437,0.99397,0.99575,0.991873,0.997488,0.995015,0.995732,0.985639,0.995633,0.995586,0.75701,0.994193,,,,,
6,0.993106,0.993233,0.992416,0.990921,0.993404,0.992842,0.995177,0.993195,0.995065,0.993865,0.996795,0.779525,0.99458,,,,,,
7,0.995966,0.99509,0.994158,0.993664,0.995988,0.99475,0.995447,0.988966,0.995094,0.995113,0.753082,0.995524,,,,,,,
8,0.995245,0.996245,0.993385,0.998221,0.996658,0.997135,0.987684,0.997485,0.996049,0.768711,0.99535,,,,,,,,
9,0.995769,0.991023,0.996259,0.997098,0.995174,0.988153,0.996704,0.995996,0.76057,0.992043,,,,,,,,,


In [None]:
doc_df.iloc[1]["Doc"]