In [1]:
# Load csv data

import os
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
import matplotlib.pyplot as plt
import json
import spacy 
from spacy.matcher import Matcher
import pandas as pd
import re
from collections import Counter
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer 
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

filepath = os.path.join(os.getcwd(), 'QueryResults_sample.csv')

stack_posts = pd.read_csv(filepath, sep = ",")
    
print("loaded csv data")

loaded csv data


In [4]:
# clean posts and match nouns, adjectives with spacy
nlp = spacy.load("en_core_web_md")
matcher = Matcher(nlp.vocab)

pattern = [{'POS': 'ADJ', 'OP': '?'},
           {'OP': '?', 'POS': 'NOUN'}
          ]

wordlist = []

for text in stack_posts["AnswerBody"][:50]:                   
    text = re.sub('<pre(.|\n)*?/pre>|<code>(.|\n)*?</code>|(<a(.*|\n)a>)|<blockquote(.|\n)*?\/blockquote>|(<(p|\/p|br|sub|\/sub|em|\/em)>)', '', text)        
    
    doc = nlp(text)
    
    matcher.add("match_nouns", [pattern])
    matches = matcher(doc)
    for match_id, start, end in matches:
        wordlist.append(doc[start:end].text)       

c = Counter(wordlist)
print("finished")

finished


In [6]:
# get n relevant words from single document
n = 3 # top n features

stack_answers = stack_posts["AnswerBody"][:50]

text_list = []
for reg_text in stack_answers:    
    reg_text = re.sub('<pre(.|\n)*?/pre>|<code>(.|\n)*?</code>|(<a(.*|\n)a>)|<blockquote(.|\n)*?\/blockquote>|(<(p|\/p|br|sub|\/sub|em|\/em|li|\/li|ul|\/ul|strong|\/strong)>|\n)', '', reg_text)  
    text_list.append(reg_text)


wordlist_vocab = set(wordlist)

top_n = []
for i, text in enumerate(text_list):
    vectorizer = TfidfVectorizer(stop_words = 'english', min_df = 0.01, vocabulary=wordlist_vocab) #, max_features=3
    text_vectorizer = vectorizer.fit_transform([text_list[i]])
    feature_names = vectorizer.get_feature_names()
    
    # sort features and add top n features to list
    feature_array = np.array(vectorizer.get_feature_names())
    tfidf_sorting = np.argsort(text_vectorizer.toarray()).flatten()[::-1]
    top_n.append(feature_array[tfidf_sorting][:n])  

# create vocab set from top_n numpy array    
vocab = set()
for array in top_n:
    for item in array:
        vocab.add(item)

        
print("finished")

finished


In [9]:
# Document similarity
vector_array = np.empty((len(text_list), nlp.vocab.vectors_length))


for i, text in enumerate(text_list):
    doc = nlp(text)
    vector_array[i] = doc.vector


cosine_similarity([vector_array[0]], [vector_array[3]])[0][0]

0.8868453828976748

In [28]:
# Term similarity
# first try
document_term_vectors = []



for i, document in enumerate(text_list):
    doc = nlp(document)
    noun_adj_count = 0
    for token in doc:
        if token.pos_ in ("NOUN", "ADJ"):
            noun_adj_count += 1
    vector_array = np.empty((noun_adj_count, nlp.vocab.vectors_length))    
    index = 0
    for token in doc:        
        if token.pos_ in ("NOUN", "ADJ"):
            vector_array[index] = token.vector
            index += 1
    document_term_vectors.append(vector_array)
#cosine_similarity([vector_array[0]], [vector_array[3]])

200

In [11]:
# Term similarity
# second try

term_term_matrix = []
# loop through relavant words and get their vectors

for i, term in enumerate(vocab):
    term_vector = []    
    doc1 = nlp(str(term))
    vector_base = doc1.vector    
    for j, term in enumerate(vocab):
        doc2 = nlp(str(term))
        vector = doc2.vector
        # add these vectors to term_vector of this word
        term_vector.append(cosine_similarity([vector_base], [vector])[0][0])
    # add vectors to matrix of vectors of all words
    term_term_matrix.append(term_vector)



In [27]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

ttm = pd.DataFrame(term_term_matrix, columns = vocab, index = vocab)
ttm

Unnamed: 0,kernel,string,numbers,metrics,answers,updates,distribution,client,feature,range,mean,case,features,hyperplane,continuous,model,value,alignment,workaround,characters,westons,test,steps,problem,parantheses,bit,instances,objects,sparse,dummy,reducer,new,mind,validation,values,setup,module,thing,svm,items,port,use,line,default,matrix,small,distance,online,frequent,means,complicated,pixel,orthogonal,directory,sklearn,imputation,code,syntax,data,optimization,fit,list,situations,key,labels,image,mapping,counts,proper,training,size,targets,xmean,server,variable,way,algorithm,sure,word,single,quantile,error,projection,example,underestimations,order,iris,classifier,arrangements,grid,file,good,set,classifiers,intro,prior,non,gradient,class,structure,uppercase,learning,loss,number,pipeline,noise,step
kernel,1.0,0.219056,0.12796,0.204862,0.119206,0.329871,0.274471,0.28289,0.16545,0.05939,0.164372,0.15028,0.144871,0.213102,0.151586,0.131482,0.178345,0.178219,0.50644,0.137043,0.0,0.208171,0.156651,0.363449,0.054235,0.27065,0.264394,0.20981,0.176661,0.282625,0.097837,0.114344,0.176805,0.252752,0.198009,0.367043,0.417641,0.194141,0.139204,0.055279,0.3164,0.229196,0.151866,0.431096,0.2899,0.16265,0.014608,-0.051019,0.027153,0.160272,0.125053,0.23605,0.251843,0.282069,0.0,0.234714,0.35358,0.342302,0.288416,0.282523,0.082096,0.205216,0.121343,0.170452,0.063389,0.211419,0.301759,0.088205,0.197732,0.064975,0.213593,0.121784,0.0,0.411581,0.292266,0.151926,0.384729,0.210526,0.100401,0.15411,0.251843,0.437522,0.140186,0.199312,0.000337,0.100036,0.062104,0.384729,0.001115,0.157647,0.357147,0.132957,0.157684,0.338209,0.082848,0.14765,0.148877,0.099857,0.082508,0.218876,-0.051359,0.093538,0.077024,0.141406,0.152641,0.159747,0.174704
string,0.219056,1.0,0.381008,0.156219,0.148439,0.124292,0.11132,0.217729,0.252759,0.221656,0.184927,0.292127,0.20244,0.230339,0.250967,0.215282,0.379406,0.247045,0.258707,0.341464,0.0,0.211772,0.191344,0.273571,0.335723,0.318553,0.345902,0.398396,0.226195,0.35173,0.272904,0.23694,0.185917,0.277746,0.332598,0.280964,0.282966,0.262907,0.004497,0.233439,0.225159,0.303962,0.37672,0.384764,0.268694,0.2806,0.195206,0.077969,0.180026,0.213001,0.230796,0.215275,0.250649,0.277297,0.0,0.154364,0.345809,0.468132,0.30188,0.156402,0.23214,0.317033,0.182884,0.388761,0.255963,0.210028,0.234865,0.214794,0.255361,0.089852,0.313932,0.183433,0.0,0.251034,0.46449,0.27185,0.286756,0.248341,0.291767,0.300834,0.250649,0.341313,0.216254,0.36707,-0.067083,0.23584,0.117637,0.286756,0.207433,0.29881,0.364419,0.197062,0.441965,0.22293,0.225154,0.155201,0.200448,0.169762,0.304909,0.326386,0.169818,0.183201,0.137626,0.403231,0.186357,0.221699,0.247251
numbers,0.12796,0.381008,1.0,0.371299,0.334221,0.264689,0.330841,0.212055,0.266593,0.340347,0.407491,0.288777,0.219008,0.187244,0.209873,0.229281,0.424413,0.210846,0.18539,0.332397,0.0,0.271317,0.261301,0.398845,0.277297,0.332516,0.347396,0.267312,0.175868,0.237811,0.099772,0.245134,0.291718,0.232659,0.430184,0.264733,0.193908,0.33222,-0.074191,0.288631,0.190365,0.339926,0.429218,0.297232,0.283706,0.337496,0.259944,0.271126,0.278253,0.364019,0.297484,0.239449,0.240004,0.241203,0.0,0.292869,0.349709,0.214066,0.480681,0.191717,0.251421,0.430705,0.31288,0.431769,0.300315,0.215313,0.250334,0.477911,0.288574,0.164513,0.360916,0.321726,0.0,0.212008,0.312876,0.357593,0.297131,0.406112,0.36363,0.339247,0.240004,0.33441,0.247397,0.44804,0.180783,0.368393,0.080959,0.297131,0.238481,0.308307,0.301228,0.307129,0.343471,0.271521,0.108433,0.306834,0.215188,0.163524,0.241673,0.273922,0.177364,0.211502,0.248378,0.820055,0.193078,0.246207,0.290087
metrics,0.204862,0.156219,0.371299,1.0,0.225132,0.30788,0.350007,0.362006,0.188041,0.201633,0.29525,0.14501,0.175025,0.213002,0.326642,0.278037,0.45312,0.35838,0.258988,0.121691,0.0,0.277839,0.258687,0.218011,0.13381,0.137096,0.33253,0.254885,0.114338,0.178227,0.113705,0.145619,0.171323,0.445407,0.427729,0.270325,0.309359,0.161991,-0.038304,0.148755,0.116704,0.245707,0.140198,0.278637,0.308565,0.156199,0.215919,0.189188,0.185951,0.277536,0.256214,0.273395,0.300458,0.115997,0.0,0.408396,0.272391,0.268091,0.534252,0.533588,0.181611,0.262154,0.294978,0.356108,0.22273,0.181229,0.462647,0.33627,0.280768,0.249283,0.205726,0.364311,0.0,0.278209,0.318678,0.224198,0.442528,0.200307,0.188936,0.170494,0.300458,0.302873,0.293319,0.289338,0.186669,0.20865,0.011642,0.442528,0.163647,0.303389,0.197221,0.214206,0.240601,0.494352,0.017436,0.1914,0.107139,0.197198,0.118962,0.327865,0.074481,0.292947,0.174022,0.29348,0.271854,0.199166,0.212963
answers,0.119206,0.148439,0.334221,0.225132,1.0,0.335392,0.112415,0.26609,0.161052,0.179057,0.328743,0.282336,0.134284,0.186943,0.180985,0.129869,0.218741,0.158917,0.246126,0.267909,0.0,0.420077,0.378741,0.430371,0.143647,0.266431,0.247855,0.178389,0.147321,0.202825,0.021799,0.245197,0.424179,0.251193,0.217313,0.212363,0.133816,0.371814,-0.076073,0.226967,0.033041,0.269062,0.238619,0.183795,0.155754,0.173358,0.154328,0.344381,0.266288,0.277456,0.387564,0.059182,0.108959,0.193559,0.0,0.156297,0.231195,0.226183,0.250101,0.232438,0.254547,0.354945,0.396061,0.33037,0.146072,0.135447,0.17249,0.222532,0.384394,0.283902,0.087391,0.152639,0.0,0.209999,0.158252,0.436512,0.22365,0.462741,0.382448,0.180395,0.108959,0.304477,0.077809,0.306178,0.018796,0.336466,0.000471,0.22365,0.144521,0.179799,0.222779,0.380049,0.224477,0.243562,0.245367,0.263011,0.184743,0.046071,0.207666,0.177893,0.133402,0.377884,0.217352,0.33041,0.090139,0.139679,0.376247
updates,0.329871,0.124292,0.264689,0.30788,0.335392,1.0,0.243474,0.359551,0.39931,0.170024,0.161708,0.178764,0.360781,0.040727,0.239129,0.181627,0.140732,0.12084,0.534109,0.153975,0.0,0.216332,0.243375,0.271281,-0.001401,0.224881,0.239123,0.173982,0.0187,0.048195,0.00754,0.444028,0.224983,0.285005,0.13854,0.377647,0.298297,0.214061,-0.066034,0.328101,0.159808,0.285378,0.187253,0.309984,0.099422,0.152762,0.006648,0.316516,0.29378,0.19047,0.181934,0.095107,-0.006797,0.336946,0.0,0.093955,0.335881,0.182638,0.368367,0.263739,0.138852,0.455114,0.184796,0.267158,0.135174,0.205155,0.255456,0.138513,0.227349,0.223311,0.12203,0.178169,0.0,0.387989,0.144801,0.245593,0.185922,0.380996,0.146263,0.157498,-0.006797,0.330209,0.040335,0.190301,-0.071629,0.259368,-0.021357,0.185922,0.135778,0.197122,0.33159,0.206726,0.277707,0.198894,0.181535,0.284048,0.091979,0.004626,0.126579,0.102124,-0.044588,0.188333,0.099818,0.272893,0.180952,0.135519,0.215761
distribution,0.274471,0.11132,0.330841,0.350007,0.112415,0.243474,1.0,0.364013,0.272072,0.345311,0.302557,0.280003,0.23875,0.190118,0.409724,0.294894,0.362719,0.311928,0.219584,0.09696,0.0,0.293629,0.20622,0.321481,-0.023615,0.12996,0.284254,0.246193,0.258252,0.157291,0.079619,0.273473,0.128478,0.263878,0.326726,0.339985,0.336223,0.170402,-0.055159,0.250252,0.261,0.352252,0.310824,0.271919,0.380635,0.347074,0.250726,0.287718,0.239291,0.395655,0.253929,0.18492,0.316975,0.296288,0.0,0.421089,0.301195,0.134153,0.470959,0.362109,0.184551,0.237654,0.249516,0.298028,0.328016,0.26789,0.390532,0.32514,0.37616,0.213191,0.315355,0.280013,0.0,0.321417,0.431676,0.26486,0.315857,0.205141,0.144027,0.34261,0.316975,0.25849,0.308601,0.352929,0.060799,0.380671,0.008523,0.315857,0.323342,0.355056,0.303324,0.230342,0.244549,0.307597,-0.015479,0.352986,0.258741,0.30001,0.169248,0.440262,-0.035212,0.22999,0.290213,0.362605,0.351217,0.217955,0.223665
client,0.28289,0.217729,0.212055,0.362006,0.26609,0.359551,0.364013,1.0,0.242707,0.301935,0.228182,0.356842,0.227762,0.014288,0.261929,0.248168,0.357388,0.250351,0.341455,0.181515,0.0,0.2866,0.302594,0.375146,0.022156,0.230775,0.455633,0.311297,0.035311,0.209256,0.048182,0.283542,0.316906,0.420983,0.289546,0.459411,0.378549,0.246568,-0.000624,0.206075,0.320082,0.332547,0.257778,0.472434,0.15808,0.259041,0.195079,0.268381,0.259565,0.34024,0.289306,0.162795,0.097399,0.420921,0.0,0.169427,0.319259,0.253977,0.427273,0.352313,0.194166,0.297063,0.359279,0.389585,0.186229,0.239714,0.328703,0.245928,0.349189,0.329145,0.202938,0.294097,0.0,0.632458,0.281655,0.321247,0.251261,0.337298,0.205504,0.297647,0.097399,0.4017,0.152436,0.391895,0.029838,0.321164,-0.029798,0.251261,0.252703,0.202753,0.426856,0.275431,0.329372,0.228271,0.031392,0.363883,0.215838,0.078637,0.266331,0.27837,-0.010403,0.288923,0.215522,0.344955,0.287387,0.071712,0.300327
feature,0.16545,0.252759,0.266593,0.188041,0.161052,0.39931,0.272072,0.242707,1.0,0.37217,0.20893,0.293754,0.841719,0.065625,0.296761,0.30263,0.256707,0.246126,0.286289,0.387936,0.0,0.224981,0.275054,0.310567,0.09357,0.262576,0.300236,0.302961,0.176268,0.138434,0.104004,0.441673,0.257772,0.243064,0.206218,0.385438,0.336409,0.342232,-0.166057,0.311029,0.246832,0.399462,0.324612,0.340489,0.194211,0.347621,0.207711,0.267421,0.282607,0.324293,0.239355,0.204562,0.127039,0.265468,0.0,0.116327,0.259861,0.244133,0.289324,0.225344,0.343804,0.410741,0.233173,0.343887,0.251665,0.328102,0.289309,0.160386,0.277239,0.151888,0.280971,0.171657,0.0,0.279344,0.323528,0.349827,0.262069,0.397431,0.218232,0.397311,0.127039,0.238686,0.239287,0.417548,-0.077971,0.235059,0.048482,0.262069,0.236765,0.285656,0.23129,0.324236,0.443875,0.257529,0.202014,0.203901,0.18782,0.173818,0.260107,0.345586,0.024969,0.205338,0.116212,0.365718,0.141433,0.240623,0.281075
range,0.05939,0.221656,0.340347,0.201633,0.179057,0.170024,0.345311,0.301935,0.37217,1.0,0.280882,0.242906,0.41248,0.116613,0.376705,0.319366,0.487067,0.211166,0.083367,0.237846,0.0,0.312843,0.26251,0.218968,-0.001813,0.259104,0.267718,0.296261,0.249581,0.1278,0.131705,0.331302,0.252964,0.16815,0.393209,0.236087,0.208755,0.181626,-0.135995,0.374438,0.17761,0.364935,0.426552,0.223939,0.171952,0.388521,0.398478,0.263364,0.250455,0.321826,0.235331,0.253851,0.14815,0.199842,0.0,0.21796,0.163147,0.096547,0.31144,0.159546,0.380928,0.267481,0.336022,0.299044,0.272955,0.239332,0.234915,0.261152,0.264612,0.290285,0.417932,0.394726,0.0,0.156597,0.380554,0.296649,0.133945,0.266949,0.145219,0.326571,0.14815,0.160983,0.205157,0.386864,0.011839,0.328356,0.01907,0.133945,0.242162,0.236533,0.115477,0.317851,0.409671,0.194929,0.041331,0.204828,0.222055,0.274744,0.272599,0.259408,-0.042663,0.284685,0.216977,0.408087,0.161442,0.313825,0.237902
