In [1]:
#Needed libraries:
#Regex for text cleaning
import re

#NLP library
import nltk

#Helper for creating regex 
import string

# Lemmatisation is the algorithmic process of determining the lemma of a word based on its intended meaning.
# Lemmatisation depends on correctly identifying the intended part of speech and meaning
#of a word in a sentence, as well as within the larger context surrounding that sentence
from nltk.stem import WordNetLemmatizer
wnl=WordNetLemmatizer()

#pattern.en module contains a fast part-of-speech tagger for English (CLiPS)
from pattern.en import tag

#WordNet is a lexical database for the English language.[1] It groups English words into sets of synonyms called synsets,#
#provides short definitions and usage examples, and records a number of relations among these synonym sets or their members. 
from nltk.corpus import wordnet as wn

import numpy as np

import pandas as pd
from pandas import DataFrame, Series


from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer


from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# To delete stop words from the text
from nltk.corpus import stopwords
stopword_list=stopwords.words("english")

stopword_list.extend(['www','mail','edu','athttps'])

#For tokenizing
from nltk.tokenize import sent_tokenize, word_tokenize

#remove special characters
remove_characters=re.compile('[^a-zA-Z ]')

In [3]:
def remove_special_characters(text):
    text = text.decode('utf-8')
    text=text.strip()
    filtered_sentence=re.sub(remove_characters, r' ', text)
    return filtered_sentence

In [4]:
def lemmatize_text(text):
#pos_tagged_text is lower case and has WordNet tags, ready to lemmatize    
    pos_tagged_text = pos_tag_text(text)
    lemmatized_tokens = [wnl.lemmatize(word, pos_tag) if pos_tag
                         else word #if word has a tag lemmatize it and add to the list, otherwise just add the word                    
                         for word, pos_tag in pos_tagged_text]
    lemmatized_text = ' '.join(lemmatized_tokens)
    return lemmatized_text

In [5]:
# Annotate text tokens with POS tags
def pos_tag_text(text):
#Converts Penn Treebank POS tags to WordNet tags    
    def penn_to_wn_tags(pos_tag):
        if pos_tag.startswith('J'):
            return wn.ADJ
        elif pos_tag.startswith('V'):
            return wn.VERB
        elif pos_tag.startswith('N'):
            return wn.NOUN
        elif pos_tag.startswith('R'):
            return wn.ADV
        else:
            return None
    #Use pattern library tagging functions (Penn Treebank syntax)
    tagged_text = tag(text)# Result: list of tuples for each sentence
    #In order to use lemmatizer we need to change POS tags to WordNet tags and make all words lowercase
    tagged_lower_text = [(word.lower(), penn_to_wn_tags(pos_tag))
                         for word, pos_tag in
                         tagged_text]
    return tagged_lower_text

In [6]:
#This function removes stopwords
def remove_stopwords(text):
    tokens=tokenize_text(text)
    filtered_tokens=[token for token in tokens if token not in stopword_list]
    filtered_text=" ".join(filtered_tokens)
    return filtered_text

In [7]:
#This fucntion tokenize words in a sentence
def tokenize_text(text):
    text = text.decode('utf-8')
    tokens=nltk.word_tokenize(text)
    tokens=[token.strip() for token in tokens]
    return tokens

In [8]:
def normalize_abstract(abstracts):
    normalized_abstracts=[]
    for abstract in abstracts:
        normalized_abstract=[]
        #First clean data from any special characters
        text=remove_special_characters(abstract)
        #Split abstract into sentences
        sentences=sent_tokenize(text)
        for text in sentences:
            text=lemmatize_text(text)
            text=remove_stopwords(text)
            normalized_abstract.append(text)
        normalized_abstract_string=" ".join(normalized_abstract)
        normalized_abstracts.append(normalized_abstract_string)
    return normalized_abstracts

In [9]:
#GETTING THE FEACURES AND VECTORIZER

In [10]:
def build_feature_matrix(abstracts, feature_type='frequency',
                         ngram_range=(1, 1), min_df=0.00, max_df=1.0):

    feature_type = feature_type.lower().strip()  
    
    if feature_type == 'frequency':
        vectorizer = CountVectorizer(binary=False, min_df=min_df,
                                     max_df=max_df, ngram_range=ngram_range)
    elif feature_type == 'tfidf':
        vectorizer = TfidfVectorizer(min_df=min_df, max_df=max_df, 
                                     ngram_range=ngram_range,use_idf=True)
    else:
        raise Exception("Wrong feature type entered. Possible values:'frequency', 'tfidf'")

    feature_matrix = vectorizer.fit_transform(abstracts).astype(float)

    
    return vectorizer, feature_matrix

In [11]:
#Read im the data, delete additional columns

all_articles_tools = pd.read_table('ALL_DATA/NAR/TOOLS/nar_active_links_00_17.txt', keep_default_na=False)
all_articles_tools.drop('Unnamed: 0', axis=1, inplace=True)
all_articles_tools.drop('Unnamed: 0.1', axis=1, inplace=True)
all_articles_tools.drop('Unnamed: 0.1.1', axis=1, inplace=True)
#all_articles_tools.drop('Unnamed: 0.1.1.1', axis=1, inplace=True)
# create a column for storing related articles






In [12]:
#this column is going to be used to store names of tools without any integers inside
# this is helpful to find different versions of the same tools
all_articles_tools["main_name"]="NULL"

first_word=re.compile('[^\s]+')
for i in range(len(all_articles_tools)):
    tool_name=all_articles_tools["name_tool"][i]
    tool_name=re.match(first_word, tool_name).group(0)
    tool_name = ''.join([j for j in tool_name if not j.isdigit()])
    all_articles_tools["main_name"][i]=tool_name
    i=i+1
    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


In [13]:
all_articles_tools["related_articles"]="NULL"

#Give each data point an unique id
id_list=range(len(all_articles_tools)+1)[1:]
all_articles_tools["id"] = id_list

names_of_tools=all_articles_tools.main_name.tolist()
# create a list of names of tools that are duplicates
import collections
duplicated_tools=[item for item, count in collections.Counter(names_of_tools).items() if count > 1]





In [14]:
#list of id of articles which are not "main articles associated with a tool"- to be dropped before calculating cosine similarity
articles_to_drop_by_ids=[]

for name in duplicated_tools:
    #ascending = False most recent on the top of the df
    related_articles=all_articles_tools.loc[all_articles_tools['main_name'] == name].sort_values(by='date', ascending=False)
    ids=[]
    ids=related_articles.id.tolist()
    index=ids[0]-1
    all_articles_tools["related_articles"][index]=ids[1:]
    articles_to_drop_by_ids.append(ids[1:])


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


In [15]:
year_pattern=re.compile('^20[0-9]{2}')

all_articles_tools["year"]="NULL"
for i in range(len(all_articles_tools)):
    try:
        year_matched=re.match( year_pattern, all_articles_tools["date"][i] ).group(0)
        all_articles_tools["year"][i] = year_matched

    except:
        all_articles_tools["year"][i]="NULL"


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [16]:
# MAKE SURE VIEWS, CITATIONS AND ALTMETRIC SCORE ARE NP INTS
all_articles_tools.altmetric_score = all_articles_tools.altmetric_score.astype(np.int64)
all_articles_tools.views = all_articles_tools.views.astype(np.int64)
all_articles_tools.citations_amount = all_articles_tools.citations_amount.astype(np.int64)

In [17]:
all_articles_tools["citations_per_year"]=0.0
#Set relative amount of citations- citations per year
for i in range(len(all_articles_tools)):
    if all_articles_tools["citations_amount"][i] is None:
        pass
    else:
        years=2017 - int(all_articles_tools["year"][i]) +1
        yearly_citations=float(all_articles_tools["citations_amount"][i])/float(years) 
        all_articles_tools["citations_per_year"][i]=int(round(yearly_citations))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [18]:
all_articles_tools["views_per_year"]=0
for i in range(len(all_articles_tools)):
    years=2017 - int(all_articles_tools["year"][i])+1
    yearly_views= float(all_articles_tools["views"][i]/years)
    all_articles_tools["views_per_year"][i]=yearly_views

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [19]:
#SET RELATIVE AMOUNT OF VIEWS AND SCALE (relative views are in range 4 to 34 so they can be use as sizes of balls
# on scatter plots)
all_articles_tools["relative_views"]=0
max_views=all_articles_tools["views_per_year"].max()


length=len(all_articles_tools)       
for i in range(length):
    yearly_views=float(all_articles_tools["views_per_year"][i])
    relative_views=(yearly_views/float(max_views))*30+4
    all_articles_tools["relative_views"][i]=int(round(relative_views))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


In [20]:
#change a list of lists into a regular list
articles_to_drop_by_ids = [item for sublist in articles_to_drop_by_ids for item in sublist]

# list of ids of main articles

id_main_articles=range(len(all_articles_tools)+1)[1:]
for item in articles_to_drop_by_ids:
    id_main_articles.remove(item)
    
# I need to create a df only with TOOLS
only_tools=DataFrame()
only_tools=all_articles_tools[["homepage", "id", "related_articles", "name_tool", "abstract", "citations_per_year","relative_views", "topics" ]]

# drop the values which are not main articles
for item in articles_to_drop_by_ids:
    only_tools = only_tools[only_tools.id != item]

In [21]:
only_tools = only_tools.rename(index=str, columns={"id": "id_article", "related_articles": "ids_related_articles"})

In [22]:
#Give each data point an unique id
id_list=range(len(only_tools)+1)[1:]
only_tools["id"] = id_list

In [23]:
all_abstracts=only_tools.abstract.tolist()
names_of_tools=only_tools.name_tool.tolist()

In [24]:
# At this point I need to work more on my abstracts and names_of tools to get better results

In [25]:
# Add normalized names of tools to stop words
# Delete email addresses from the abstracts

In [26]:
names_copy=only_tools.name_tool.tolist()
all_normalized_names=normalize_abstract(names_copy)

In [27]:
stopword_list.extend(['www','mail','edu','athttps'])
stopword_list.extend(all_normalized_names)

In [28]:
len(stopword_list)

2164

In [29]:
##REGEX FOR GETTING RID OF EMIALS AND WEBSITES
website_pattern=re.compile(r'[\w\.-]+@[\w\.-]+')
email_pattern=re.compile(r'http[^\s]+')

In [30]:
i=0
for i in range(len(all_abstracts)):
    result=re.sub(website_pattern,"",  all_abstracts[i])
    all_abstracts[i]=re.sub(email_pattern,"",  result)
    i=i+1

In [31]:
# Step 1: NORMALIZE YOUR DATA
all_normalized_abstracts=normalize_abstract(all_abstracts)

In [32]:
# Step 2: EXTRACT FEATURES
tfidf_vectorizer, tfidf_matrix=build_feature_matrix(all_normalized_abstracts, feature_type="tfidf")

In [33]:
feature_names=tfidf_vectorizer.get_feature_names()

In [34]:
adj_matrix=cosine_similarity(tfidf_matrix, tfidf_matrix)

In [1]:
#len(feature_names)

In [None]:
# Extract the most important features ~ keywords that define the tool, abstract
# In order to identify words which were given most weight by tf-idf we need to extract wieights from tfidf_matrix

In [36]:
from scipy.sparse import csr_matrix

In [37]:
#sparse matrix to dense matrix
matrix_dense=tfidf_matrix.todense() 

In [38]:
matrix_dense #This is a very sparse matrix

matrix([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        ..., 
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [39]:
#Add columns to store info about most importat features=words, and their assosiated weights 
only_tools["feature_list"]="NULL"
only_tools["feature_scores"]="NULL"

In [None]:
#I chose weight of word to be more than 0.10 for word to be considered meaningful
#Here we accually collect only index of that word in the list of all the features
#In the next step we will change index to word

In [40]:
i=0
for i in range(len(only_tools)):
    matrix_dense_row=matrix_dense[i]
    A = np.squeeze(np.asarray(matrix_dense_row))
    important_features=[(d, x) for d, x in enumerate(A) if x > 0.10]
    feature_tuples=zip(*important_features)
    feature_lists=map(list,feature_tuples )
    only_tools["feature_list"][i]=feature_lists[0]
    only_tools["feature_scores"][i]=feature_lists[1]
    i=i+1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [None]:
#This column will store actual words

In [41]:
only_tools["feature_names"]="NULL"

In [42]:
# feature indexes to words
for i in range(len(only_tools)):
    tool_features=[]
    for item in only_tools["feature_list"][i]:
        tool_features.append(feature_names[item])
    only_tools["feature_names"][i]=tool_features
    i=i+1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [43]:
# Add all the list of feature words into one big list check frequency of accurence of the words
# choose only globally relevant words

In [44]:
big_list_of_words=[]
for i in range(len(only_tools)):
    big_list_of_words.append(only_tools["feature_names"][i])

In [45]:
# list of lists into a list
features_list = [item for sublist in big_list_of_words for item in sublist]


In [None]:
# add to feature list keywords from the journal itself
#check if the journal provides keywords
#change the string into list of words

In [46]:
import collections
counter=collections.Counter(features_list)

In [47]:
only_tools["topics_list"]="NULL"

In [48]:
big_list_of_topics=[]
for i in range(len(only_tools)):
    topics=""
    topics= only_tools["topics"][i]
    topics1=topics.split(",")
    only_tools["topics_list"][i]=topics1
    big_list_of_topics.append(only_tools["topics_list"][i])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [64]:
# currently big_list_of_topics is a list of lists

In [None]:
# list of lists into a list
topics_list = [item for sublist in big_list_of_topics for item in sublist]

In [68]:
len(topics_list)

13712

In [67]:
len(features_list)

45932

In [None]:
# We also want to add list of topics which were provided in same of the journals

In [132]:
features_topics_list=features_list + topics_list

In [133]:
len(features_topics_list)

59644

In [134]:
leading_space=re.compile("^\s")

In [None]:
#Some of the features have leading space-we want to get rid of it

In [135]:
for i in range(len(features_topics_list)):
    word=re.sub(leading_space, "",features_topics_list[i])
    features_topics_list[i]=word
    

In [136]:
import collections
counter=collections.Counter(features_topics_list)

In [None]:
#To get all words=features with weight at least 0.10 is not enough, we want to only consider words that were popular and meaningful
#We can manulally decided how many distinct keywords we want to include and remove words which have high weight
#but are not meaningful for our purposes.

In [137]:
counter.most_common(319)

[(u'genome', 876),
 ('genes', 694),
 (u'rna', 499),
 (u'protein', 403),
 (u'datasets', 371),
 (u'dna', 298),
 ('', 293),
 (u'data', 259),
 (u'sequence', 255),
 (u'structure', 235),
 (u'database', 227),
 (u'software', 207),
 (u'community', 189),
 (u'internet', 179),
 (u'molecule', 168),
 (u'server', 167),
 (u'bioinformatics', 164),
 ('protein structure', 152),
 (u'mutation', 142),
 (u'human', 139),
 ('mice', 129),
 (u'interaction', 128),
 (u'secondary', 127),
 (u'complex', 127),
 ('gene expression', 125),
 (u'phenotype', 122),
 (u'genetic', 121),
 (u'transcription', 119),
 (u'genetics', 116),
 ('binding sites', 116),
 (u'prediction', 115),
 (u'structural', 114),
 (u'site', 114),
 (u'analysis', 114),
 (u'annotation', 111),
 (u'network', 109),
 (u'genomics', 107),
 (u'model', 105),
 (u'expression', 103),
 (u'alignment', 101),
 (u'mining', 99),
 (u'information', 98),
 (u'tool', 97),
 (u'method', 93),
 (u'nucleotides', 88),
 ('ligands', 87),
 (u'bind', 87),
 ('plants', 86),
 (u'web', 85),
 

In [138]:
my_keywords_tuples=counter.most_common(319)

In [139]:
my_keywords = zip(*my_keywords_tuples)
my_keywords =my_keywords[0]

In [None]:
#This is an example of featured keywords for one of the abstracts before cleaning it.
#Here we see all the words with weight more than 0.1

In [140]:
only_tools["feature_names"][1]

[u'activate',
 u'activation',
 u'apparent',
 u'bacillus',
 u'bind',
 u'box',
 u'competence',
 u'consensus',
 u'factor',
 u'macroarrays',
 u'ofbacillus',
 u'presence',
 u'promoter',
 u'putative',
 u'regulation',
 u'site',
 u'subtilis',
 u'subtiliscontains',
 u'supposition',
 u'take',
 u'transcription',
 u'unreliable',
 u'valid']

In [None]:
#Here are keywords associated to that abstract by journal maintainer 
#(ONLY 2 out of 4 journal provide keywords, topic )

In [141]:
only_tools["topics_list"][1]

['bacillus subtilis',
 'binding sites',
 'consensus sequence',
 'dna',
 'genes',
 'genome',
 'operon',
 'promoter regions (genetics)',
 'transcription factor',
 'competence',
 'rna']

In [142]:
only_tools["keywords_dirty"]="NULL"

In [143]:
for i in range(len(only_tools)):
    only_tools["keywords_dirty"][i]= only_tools["feature_names"][i] + only_tools["topics_list"][i]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [145]:
my_keywords_list=list(my_keywords)

In [146]:
# Decided to delete some items manually....I actually removed many more words not shown here

In [147]:
my_keywords_list.remove("")

In [148]:
my_keywords_list.remove("co")

In [149]:
my_keywords_list.remove("non")

In [150]:
my_keywords_list.remove("id")

In [151]:
my_keywords_list.remove("go")

In [152]:
my_keywords_list.remove("may")

In [153]:
my_keywords_list.remove("tf")

In [154]:
my_keywords_list.remove("xml")

In [155]:
my_keywords_list.remove("two")

In [156]:
my_keywords_list.remove("tfs")

In [None]:
#HERE IS MY FINAL LIST OF RELEVANT KEYWORDS, ONLY THESE WORDS CAN BE FEATURE AS A TOOL'S
#KEYWORDS ON THE TOOL'S LANDING PAGE, THIS WORDS ALSO COULD HELP WITH SEARCH RESULTS
#ABSTRACTS WITH WORD=

In [4]:
#my_keywords_list 
#this list should be improved many more words to delete
# we should create a black list of words that score high but should not be in the final keywords list

In [158]:
only_tools["tool_keywords"]="NULL"

In [159]:
#Saving only relevant keywords
for i in range(len(only_tools)):
    all_features=only_tools["keywords_dirty"][i]
    only_relevant = [word for word in all_features if word  in my_keywords_list]
    only_tools["tool_keywords"][i]=only_relevant

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [3]:
#CheCk how the keywords for each abstract look like
#only_tools["tool_keywords"]

In [174]:
# delete words that are not your keywords from only_tools["feature_names"][i]=
#[rowData for index, rowData in journal_data.iterrows() if rowData['tag'] not in tag_blacklist]

In [49]:
#This would add 9 closets neighbors index and similarity level for recommandaction system purposes
only_tools["closets_neighbor_1"]="NULL"
only_tools["closets_neighbor_2"]="NULL"
only_tools["closets_neighbor_3"]="NULL"
only_tools["closets_neighbor_4"]="NULL"
only_tools["closets_neighbor_5"]="NULL"
only_tools["closets_neighbor_6"]="NULL"
only_tools["closets_neighbor_7"]="NULL"
only_tools["closets_neighbor_8"]="NULL"
only_tools["closets_neighbor_9"]="NULL"
only_tools["similarity_neighbor_1"]="NULL"
only_tools["similarity_neighbor_2"]="NULL"
only_tools["similarity_neighbor_3"]="NULL"
only_tools["similarity_neighbor_4"]="NULL"
only_tools["similarity_neighbor_5"]="NULL"
only_tools["similarity_neighbor_6"]="NULL"
only_tools["similarity_neighbor_7"]="NULL"
only_tools["similarity_neighbor_8"]="NULL"
only_tools["similarity_neighbor_9"]="NULL"

In [50]:
i=0
data_length=len(only_tools)
for i in range (data_length):
    sorted_similarity=sorted(((value, index) for index, value in enumerate(adj_matrix[i])), reverse=True)
    closest_list=[]
    closest_list=sorted_similarity[1:14]
#set up the values of the closest neighboors
    only_tools["closets_neighbor_1"][i] = (closest_list[0][1] + 1)# index to id
    only_tools["similarity_neighbor_1"][i] = closest_list[0][0]
#    
    only_tools["closets_neighbor_2"][i] =(closest_list[1][1] + 1)
    only_tools["similarity_neighbor_2"][i] = closest_list[1][0]
#    
    only_tools["closets_neighbor_3"][i] =(closest_list[2][1] + 1)
    only_tools["similarity_neighbor_3"][i] = closest_list[2][0]
#    
    only_tools["closets_neighbor_4"][i] =(closest_list[3][1] + 1)
    only_tools["similarity_neighbor_4"][i] = closest_list[3][0]
    
    only_tools["closets_neighbor_5"][i] = (closest_list[4][1] + 1)# index to id
    only_tools["similarity_neighbor_5"][i] = closest_list[4][0]
#    
    only_tools["closets_neighbor_5"][i] =(closest_list[5][1] + 1)
    only_tools["similarity_neighbor_5"][i] = closest_list[5][0]
#    
    only_tools["closets_neighbor_6"][i] =(closest_list[6][1] + 1)
    only_tools["similarity_neighbor_6"][i] = closest_list[6][0]
#    
    only_tools["closets_neighbor_7"][i] =(closest_list[7][1] + 1)
    only_tools["similarity_neighbor_7"][i] = closest_list[7][0]
    
    only_tools["closets_neighbor_8"][i] =(closest_list[8][1] + 1)
    only_tools["similarity_neighbor_8"][i] = closest_list[8][0]
    
    only_tools["closets_neighbor_9"][i] =(closest_list[9][1] + 1)
    only_tools["similarity_neighbor_9"][i] = closest_list[9][0]

    i=i+1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/

In [56]:
adj_matrix_nor =adj_matrix / adj_matrix.max()

In [None]:
# in order to use t-sne you need to change cosine similarity to cosine distance 
#cosine distance = 1 - cosine similarity
#REST OF THE CODE 

In [57]:
#Before you RUN it take a look at the parameters

import numpy as np
from sklearn.manifold import TSNE

#MODEL 3D
model3D=TSNE(n_components=3, perplexity=15.0, early_exaggeration=4.0, learning_rate=100.0, n_iter=8000, n_iter_without_progress=30, min_grad_norm=1e-07, metric='precomputed', init='random', verbose=0, random_state=None, method='barnes_hut', angle=0.5)
np.set_printoptions(suppress=True)
TSNE_data3D=model3D.fit_transform(1-adj_matrix_nor) 
transformed_TSNE_data3D=TSNE_data3D.transpose()

In [58]:
#MODEL 2D

model2D=TSNE(n_components=2, perplexity=10.0, early_exaggeration=4.0, learning_rate=100.0, n_iter=8000, n_iter_without_progress=30, min_grad_norm=1e-07, metric='precomputed', init='random', verbose=0, random_state=None, method='barnes_hut', angle=0.5)
np.set_printoptions(suppress=True)
TSNE_data2D=model2D.fit_transform(1-adj_matrix_nor) 
transformed_TSNE_data2D=TSNE_data2D.transpose()

In [59]:
# Manipulating data frame to add new information

#addig additional columns to data frame
only_tools["x"]=0
only_tools["y"]=0
only_tools["z"]=0
only_tools["closest_neighbors"]="NULL"
only_tools["x_2d"]=0
only_tools["y_2d"]=0

In [60]:
#assign the right values to the columns  3D case
x_coordinate = transformed_TSNE_data3D[0]
len(x_coordinate)
only_tools["x"]=x_coordinate

y_coordinate = transformed_TSNE_data3D[1]
only_tools["y"] = y_coordinate

z_coordinate = transformed_TSNE_data3D[2]
only_tools["z"]=z_coordinate

In [61]:
#assign the right values to the columns  2D case
x_coordinate_2d = transformed_TSNE_data2D[0]
only_tools["x_2d"]=x_coordinate_2d

y_coordinate_2d = transformed_TSNE_data2D[1]
only_tools["y_2d"] = y_coordinate_2d

In [161]:
only_tools.to_csv("BEST2_main_tools.txt",sep='\t', encoding='utf-8')
all_articles_tools.to_csv("BEST2_tools_articles.txt",sep='\t', encoding='utf-8')

In [None]:
#CODE FOR CANVAS VISUALIZATION
similar_tool_canvas=similar_tools[["main_tool", "similar_tool_fk", "similarity"]]

In [None]:
for i in range(len(similar_tool_canvas)):
    tool_id=similar_tool_canvas["main_tool"][i]
    similar_tool_canvas["main_tool"][i]= tools["name_tool"][tool_id-1]
    similar_tool_id=similar_tool_canvas["similar_tool_fk"][i]
    similar_tool_canvas["similar_tool_fk"][i]= tools["name_tool"][similar_tool_id-1]

In [None]:
similar_tool_canvas.to_csv("canvas.txt",sep='\t', encoding='utf-8')

In [None]:
similar_tool_canvas.to_csv("canvas2.txt", header=False, index=False,sep='\t', encoding='utf-8')