In [1]:
#Needed libraries:
#Regex for text cleaning
import re

#NLP library
import nltk

#Helper for creating regex 
import string

# Lemmatisation is the algorithmic process of determining the lemma of a word based on its intended meaning.
# Lemmatisation depends on correctly identifying the intended part of speech and meaning
#of a word in a sentence, as well as within the larger context surrounding that sentence
from nltk.stem import WordNetLemmatizer
wnl=WordNetLemmatizer()

#pattern.en module contains a fast part-of-speech tagger for English (CLiPS)
from pattern.en import tag

#WordNet is a lexical database for the English language.[1] It groups English words into sets of synonyms called synsets,#
#provides short definitions and usage examples, and records a number of relations among these synonym sets or their members. 
from nltk.corpus import wordnet as wn

import numpy as np

import pandas as pd
from pandas import DataFrame, Series


from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer


from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# To delete stop words from the text
from nltk.corpus import stopwords
stopword_list=stopwords.words("english")

stopword_list.extend(['www','mail','edu','athttps'])

#For tokenizing
from nltk.tokenize import sent_tokenize, word_tokenize

#remove special characters
remove_characters=re.compile('[^a-zA-Z ]')

In [3]:
def remove_special_characters(text):
    text = text.decode('utf-8')
    text=text.strip()
    filtered_sentence=re.sub(remove_characters, r' ', text)
    return filtered_sentence

In [4]:
def lemmatize_text(text):
#pos_tagged_text is lower case and has WordNet tags, ready to lemmatize    
    pos_tagged_text = pos_tag_text(text)
    lemmatized_tokens = [wnl.lemmatize(word, pos_tag) if pos_tag
                         else word #if word has a tag lemmatize it and add to the list, otherwise just add the word                    
                         for word, pos_tag in pos_tagged_text]
    lemmatized_text = ' '.join(lemmatized_tokens)
    return lemmatized_text

In [5]:
# Annotate text tokens with POS tags
def pos_tag_text(text):
#Converts Penn Treebank POS tags to WordNet tags    
    def penn_to_wn_tags(pos_tag):
        if pos_tag.startswith('J'):
            return wn.ADJ
        elif pos_tag.startswith('V'):
            return wn.VERB
        elif pos_tag.startswith('N'):
            return wn.NOUN
        elif pos_tag.startswith('R'):
            return wn.ADV
        else:
            return None
    #Use pattern library tagging functions (Penn Treebank syntax)
    tagged_text = tag(text)# Result: list of tuples for each sentence
    #In order to use lemmatizer we need to change POS tags to WordNet tags and make all words lowercase
    tagged_lower_text = [(word.lower(), penn_to_wn_tags(pos_tag))
                         for word, pos_tag in
                         tagged_text]
    return tagged_lower_text

In [6]:
#This function removes stopwords
def remove_stopwords(text):
    tokens=tokenize_text(text)
    filtered_tokens=[token for token in tokens if token not in stopword_list]
    filtered_text=" ".join(filtered_tokens)
    return filtered_text

In [7]:
#This fucntion tokenize words in a sentence
def tokenize_text(text):
    text = text.decode('utf-8')
    tokens=nltk.word_tokenize(text)
    tokens=[token.strip() for token in tokens]
    return tokens

In [8]:
def normalize_abstract(abstracts):
    normalized_abstracts=[]
    for abstract in abstracts:
        normalized_abstract=[]
        #First clean data from any special characters
        text=remove_special_characters(abstract)
        #Split abstract into sentences
        sentences=sent_tokenize(text)
        for text in sentences:
            text=lemmatize_text(text)
            text=remove_stopwords(text)
            normalized_abstract.append(text)
        normalized_abstract_string=" ".join(normalized_abstract)
        normalized_abstracts.append(normalized_abstract_string)
    return normalized_abstracts

In [9]:
#GETTING THE FEACURES AND VECTORIZER

In [10]:
def build_feature_matrix(abstracts, feature_type='frequency',
                         ngram_range=(1, 1), min_df=0.0, max_df=1.0):

    feature_type = feature_type.lower().strip()  
    
    if feature_type == 'frequency':
        vectorizer = CountVectorizer(binary=False, min_df=min_df,
                                     max_df=max_df, ngram_range=ngram_range)
    elif feature_type == 'tfidf':
        vectorizer = TfidfVectorizer(min_df=min_df, max_df=max_df, 
                                     ngram_range=ngram_range)
    else:
        raise Exception("Wrong feature type entered. Possible values:'frequency', 'tfidf'")

    feature_matrix = vectorizer.fit_transform(abstracts).astype(float)

    
    return vectorizer, feature_matrix

In [11]:
#Read im the data, delete additional columns

all_articles_tools = pd.read_table('ALL_DATA/ALL_journals.txt', keep_default_na=False)
all_articles_tools.drop('Unnamed: 0', axis=1, inplace=True)
all_articles_tools.drop('Unnamed: 0.1', axis=1, inplace=True)
all_articles_tools.drop('Unnamed: 0.1.1', axis=1, inplace=True)
#all_articles_tools.drop('Unnamed: 0.1.1.1', axis=1, inplace=True)
# create a column for storing related articles






In [12]:
#this column is going to be used to store names of tools without any integers inside
# this is helpful to find different versions of the same tools
all_articles_tools["main_name"]="NULL"

first_word=re.compile('[^\s]+')
for i in range(len(all_articles_tools)):
    tool_name=all_articles_tools["name_tool"][i]
    tool_name=re.match(first_word, tool_name).group(0)
    tool_name = ''.join([j for j in tool_name if not j.isdigit()])
    all_articles_tools["main_name"][i]=tool_name
    i=i+1
    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


In [13]:
all_articles_tools["related_articles"]="NULL"

#Give each data point an unique id
id_list=range(len(all_articles_tools)+1)[1:]
all_articles_tools["id"] = id_list

names_of_tools=all_articles_tools.main_name.tolist()
# create a list of names of tools that are duplicates
import collections
duplicated_tools=[item for item, count in collections.Counter(names_of_tools).items() if count > 1]





In [14]:
#list of id of articles which are not "main articles associated with a tool"- to be dropped before calculating cosine similarity
articles_to_drop_by_ids=[]

for name in duplicated_tools:
    #ascending = False most recent on the top of the df
    related_articles=all_articles_tools.loc[all_articles_tools['main_name'] == name].sort_values(by='date', ascending=False)
    ids=[]
    ids=related_articles.id.tolist()
    index=ids[0]-1
    all_articles_tools["related_articles"][index]=ids[1:]
    articles_to_drop_by_ids.append(ids[1:])


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


In [15]:
year_pattern=re.compile('^20[0-9]{2}')

all_articles_tools["year"]="NULL"
for i in range(len(all_articles_tools)):
    try:
        year_matched=re.match( year_pattern, all_articles_tools["date"][i] ).group(0)
        all_articles_tools["year"][i] = year_matched

    except:
        all_articles_tools["year"][i]="NULL"


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [16]:
all_articles_tools

Unnamed: 0,title,link_to_tools,tag,views,citations_amount,altmetric_score,journal,authors,topics,abstract,...,all_links,altmetric_link,name_tool,homepage,info,active,main_name,related_articles,id,year
0,MASS: multiple structural alignment by seconda...,http://musket.sourceforge.net,ORIGINAL PAPER,81,41.0,0,Bioinformatics Oxford,"O. Dror,H. Benyamini,R. Nussinov,H. Wolfson",,We present a novel method for multiple alignme...,...,"http://bioinfo3d.cs.tau.ac.il/MASS,http://oran...",,MASS,http://bioinfo3d.cs.tau.ac.il/MASS,,True,MASS,,1,2003
1,"PoPMuSiC, rationally designing point\nmutatio...",http://musket.sourceforge.net,APPLICATIONS NOTE,23,46.0,0,Bioinformatics Oxford,"J. M. Kwasigroch,D. Gilis,Y. Dehouck,M. Rooman",,Summary: PoPMuSiC is an efficient tool for rat...,...,"http://babylone.ulb.ac.be/popmusic,http://jean...",,PoPMuSiC,http://babylone.ulb.ac.be/popmusic,,True,PoPMuSiC,,2,2002
2,ModLoop: automated modeling of loops in protei...,http://musket.sourceforge.net,APPLICATIONS NOTES,54,282.0,3,Bioinformatics Oxford,"András Fiser,Andrej Sali",,Summary:ModLoop is a web server for automated ...,...,"http://andras@fiserlab.org,http://salilab.org/...",https://badges.altmetric.com/?size=128&score=3...,ModLoop,http://salilab.org/modloop,,True,ModLoop,,3,2003
3,SNOW: Standard NOmenclature Wizard to help sea...,http://musket.sourceforge.net,APPLICATIONS NOTES,10,1.0,0,Bioinformatics Oxford,"Daniel Aguilar,Baldomero Oliva,Francesc X. Avi...",,Summary:When developing bioinformatical tools ...,...,"http://ibfquer@blues.uab.es,http://ibb.uab.es/...",,SNOW,http://ibb.uab.es/snow/,,True,SNOW,,4,2003
4,gff2aplot: Plotting sequence comparisons,http://musket.sourceforge.net,APPLICATIONS NOTES,17,4.0,0,Bioinformatics Oxford,"Josep F. Abril,Roderic Guigó,Thomas Wiehe",,Summary:gff2aplotis a program to visualize the...,...,"http://jabril@imim.es,http://genome.imim.es/so...",,gff2aplot,http://genome.imim.es/software/gfftools/GFF2AP...,,True,gffaplot,,5,2003
5,Using credibility intervals instead of hypothe...,http://musket.sourceforge.net,ORIGINAL PAPERS,9,22.0,0,Bioinformatics Oxford,"Ricardo Z.N. Vêncio,Helena Brentani,Carlos A.B...",,Motivation:Statistical methods usually used to...,...,"http://rvencio@ime.usp.br,http://www.ime.usp.b...",,SAGE,http://www.ime.usp.br/~rvencio/SAGEci/,,True,SAGE,,6,2003
6,MEGA2: molecular evolutionary genetics analysi...,http://musket.sourceforge.net,APPLICATIONS NOTE,193,0.0,9,Bioinformatics Oxford,"Sudhir Kumar,Koichiro Tamura,Ingrid B. Jakobse...",,Summary: We have developed a new software pack...,...,"http://www.megasoftware.net,http://s.kumar@asu...",https://badges.altmetric.com/?size=128&score=9...,MEGA2,http://www.megasoftware.net,software,True,MEGA,,7,2001
7,Tricross: using dot-plots in sequence-id space...,http://musket.sourceforge.net,ORIGINAL PAPER,6,4.0,0,Bioinformatics Oxford,"William C. Ray,Robert S. Munson Jr,Charles J. ...",,Motivation: The process of determining the fun...,...,"http://micro-gen.ouhsc.edu/,http://www.biosci....",,Tricross,http://www.biosci.ohio-state.edu/~ray/bioinfor...,,True,Tricross,,8,2001
8,The systems biology markup language (SBML): a ...,http://musket.sourceforge.net,ORIGINAL PAPER,475,0.0,21,Bioinformatics Oxford,"M. Hucka,A. Finney,H. M. Sauro,H. Bolouri,J. C...",,Motivation: Molecular biotechnology now makes ...,...,"http://www.sbml.org/,http://sysbio-team@caltec...",https://badges.altmetric.com/?size=128&score=2...,SBML,http://www.sbml.org/,,True,SBML,,9,2003
9,TreeSAAP: Selection on Amino Acid Properties u...,http://musket.sourceforge.net,APPLICATIONS NOTE,167,142.0,1,Bioinformatics Oxford,"Steve Woolley,Justin Johnson,Matthew J. Smith,...",,Summary: The software program TreeSAAP measure...,...,"http://genome.cs.byu.edu/treesaap.htm,http://D...",https://badges.altmetric.com/?size=128&score=1...,TreeSAAP,http://genome.cs.byu.edu/treesaap.htm,,True,TreeSAAP,,10,2003


In [17]:
# MAKE SURE VIEWS, CITATIONS AND ALTMETRIC SCORE ARE NP INTS
all_articles_tools.altmetric_score = all_articles_tools.altmetric_score.astype(np.int64)
all_articles_tools.views = all_articles_tools.views.astype(np.int64)
all_articles_tools.citations_amount = all_articles_tools.citations_amount.astype(np.int64)

In [18]:
all_articles_tools["citations_per_year"]=0.0
#Set relative amount of citations- citations per year
for i in range(len(all_articles_tools)):
    if all_articles_tools["citations_amount"][i] is None:
        pass
    else:
        years=2017 - int(all_articles_tools["year"][i]) +1
        yearly_citations=float(all_articles_tools["citations_amount"][i])/float(years) 
        all_articles_tools["citations_per_year"][i]=int(round(yearly_citations))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [19]:
all_articles_tools["views_per_year"]=0
for i in range(len(all_articles_tools)):
    years=2017 - int(all_articles_tools["year"][i])+1
    yearly_views= float(all_articles_tools["views"][i]/years)
    all_articles_tools["views_per_year"][i]=yearly_views

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [20]:
#SET RELATIVE AMOUNT OF VIEWS AND SCALE (relative views are in range 4 to 34 so they can be use as sizes of balls
# on scatter plots)
all_articles_tools["relative_views"]=0
max_views=all_articles_tools["views_per_year"].max()


length=len(all_articles_tools)       
for i in range(length):
    yearly_views=float(all_articles_tools["views_per_year"][i])
    relative_views=(yearly_views/float(max_views))*30+4
    all_articles_tools["relative_views"][i]=int(round(relative_views))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


NameError: name 'only_tools' is not defined

In [22]:
#change a list of lists into a regular list
articles_to_drop_by_ids = [item for sublist in articles_to_drop_by_ids for item in sublist]

# list of ids of main articles

id_main_articles=range(len(all_articles_tools)+1)[1:]
for item in articles_to_drop_by_ids:
    id_main_articles.remove(item)
    
# I need to create a df only with TOOLS
only_tools=DataFrame()
only_tools=all_articles_tools[["homepage", "id", "related_articles", "name_tool", "abstract", "citations_per_year","relative_views" ]]

# drop the values which are not main articles
for item in articles_to_drop_by_ids:
    only_tools = only_tools[only_tools.id != item]

In [23]:
only_tools = only_tools.rename(index=str, columns={"id": "id_article", "related_articles": "ids_related_articles"})

In [24]:
#Give each data point an unique id
id_list=range(len(only_tools)+1)[1:]
only_tools["id"] = id_list

In [25]:
all_abstracts=only_tools.abstract.tolist()
names_of_tools=only_tools.name_tool.tolist()

In [26]:
# Step 1: NORMALIZE YOUR DATA
all_normalized_abstracts=normalize_abstract(all_abstracts)

In [27]:
# Step 2: EXTRACT FEATURES
tfidf_vectorizer, tfidf_matrix=build_feature_matrix(all_normalized_abstracts, feature_type="tfidf")

In [28]:
feature_names=tfidf_vectorizer.get_feature_names()

In [29]:
adj_matrix=cosine_similarity(tfidf_matrix, tfidf_matrix)

In [30]:
#This would add 4 closets neighbors index and similarity level for recommandaction system purposes
only_tools["closets_neighbor_1"]="NULL"
only_tools["closets_neighbor_2"]="NULL"
only_tools["closets_neighbor_3"]="NULL"
only_tools["closets_neighbor_4"]="NULL"
only_tools["similarity_neighbor_1"]="NULL"
only_tools["similarity_neighbor_2"]="NULL"
only_tools["similarity_neighbor_3"]="NULL"
only_tools["similarity_neighbor_4"]="NULL"
i=0
data_length=len(only_tools)
for i in range (data_length):
    sorted_similarity=sorted(((value, index) for index, value in enumerate(adj_matrix[i])), reverse=True)
    closest_list=[]
    closest_list=sorted_similarity[1:5]
#set up the values of the closest neighboors
    only_tools["closets_neighbor_1"][i] = (closest_list[0][1] + 1)# index to id
    only_tools["similarity_neighbor_1"][i] = closest_list[0][0]
#    
    only_tools["closets_neighbor_2"][i] =(closest_list[1][1] + 1)
    only_tools["similarity_neighbor_2"][i] = closest_list[1][0]
#    
    only_tools["closets_neighbor_3"][i] =(closest_list[2][1] + 1)
    only_tools["similarity_neighbor_3"][i] = closest_list[2][0]
#    
    only_tools["closets_neighbor_4"][i] =(closest_list[3][1] + 1)
    only_tools["similarity_neighbor_4"][i] = closest_list[3][0]

    i=i+1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a

In [31]:
adj_matrix_nor =adj_matrix / adj_matrix.max()

In [32]:
#Before you RUN it take a look at the parameters

import numpy as np
from sklearn.manifold import TSNE

#MODEL 3D
model3D=TSNE(n_components=3, perplexity=15.0, early_exaggeration=4.0, learning_rate=100.0, n_iter=8000, n_iter_without_progress=30, min_grad_norm=1e-07, metric='precomputed', init='random', verbose=0, random_state=None, method='barnes_hut', angle=0.5)
np.set_printoptions(suppress=True)
TSNE_data3D=model3D.fit_transform(1-adj_matrix_nor) 
transformed_TSNE_data3D=TSNE_data3D.transpose()

In [33]:
#MODEL 2D

model2D=TSNE(n_components=2, perplexity=10.0, early_exaggeration=4.0, learning_rate=100.0, n_iter=8000, n_iter_without_progress=30, min_grad_norm=1e-07, metric='precomputed', init='random', verbose=0, random_state=None, method='barnes_hut', angle=0.5)
np.set_printoptions(suppress=True)
TSNE_data2D=model2D.fit_transform(1-adj_matrix_nor) 
transformed_TSNE_data2D=TSNE_data2D.transpose()

In [34]:
# Manipulating data frame to add new information

#addig additional columns to data frame
only_tools["x"]=0
only_tools["y"]=0
only_tools["z"]=0
only_tools["closest_neighbors"]="NULL"
only_tools["x_2d"]=0
only_tools["y_2d"]=0

In [35]:
#assign the right values to the columns  3D case
x_coordinate = transformed_TSNE_data3D[0]
len(x_coordinate)
only_tools["x"]=x_coordinate

y_coordinate = transformed_TSNE_data3D[1]
only_tools["y"] = y_coordinate

z_coordinate = transformed_TSNE_data3D[2]
only_tools["z"]=z_coordinate

In [36]:
#assign the right values to the columns  2D case
x_coordinate_2d = transformed_TSNE_data2D[0]
only_tools["x_2d"]=x_coordinate_2d

y_coordinate_2d = transformed_TSNE_data2D[1]
only_tools["y_2d"] = y_coordinate_2d

In [37]:
#THE PRODUCT OF THESE CALCULATION ARE TWO DATA FRAMES
#ONE WITH VALID TOOLS AND THEIR SIMILARITY DATA
#ONE WITH ARTICLES

In [38]:
len(only_tools)

5915

In [39]:
only_tools

Unnamed: 0,homepage,id_article,ids_related_articles,name_tool,abstract,citations_per_year,relative_views,id,closets_neighbor_1,closets_neighbor_2,...,similarity_neighbor_1,similarity_neighbor_2,similarity_neighbor_3,similarity_neighbor_4,x,y,z,closest_neighbors,x_2d,y_2d
0,http://bioinfo3d.cs.tau.ac.il/MASS,1,,MASS,We present a novel method for multiple alignme...,3.0,4,1,569,1422,...,0.273267,0.260642,0.246373,0.239749,-0.192010,0.174666,-2.040201,,-0.033138,0.107600
2,http://salilab.org/modloop,3,,ModLoop,Summary:ModLoop is a web server for automated ...,19.0,4,2,1496,3968,...,0.305909,0.292766,0.275731,0.229524,-1.984522,0.644129,-0.685255,,-0.190165,0.105187
4,http://genome.imim.es/software/gfftools/GFF2AP...,5,,gff2aplot,Summary:gff2aplotis a program to visualize the...,0.0,4,3,872,254,...,0.223975,0.195749,0.175267,0.170718,0.482479,-0.234369,-0.437665,,-0.064961,-0.011358
6,http://www.megasoftware.net,7,,MEGA2,Summary: We have developed a new software pack...,0.0,4,4,517,2318,...,0.49956,0.311569,0.256006,0.113978,-1.478689,0.428377,1.660700,,0.035459,0.043875
7,http://www.biosci.ohio-state.edu/~ray/bioinfor...,8,,Tricross,Motivation: The process of determining the fun...,0.0,4,5,3138,2822,...,0.15815,0.141981,0.13338,0.124686,-0.439376,-0.301227,-2.043669,,-0.024854,0.031711
9,http://genome.cs.byu.edu/treesaap.htm,10,,TreeSAAP,Summary: The software program TreeSAAP measure...,9.0,4,6,844,404,...,0.147531,0.114574,0.111554,0.0871143,0.969968,-0.737989,0.462751,,-0.035861,-0.198041
11,http://www.cgl.ucsf.edu/Research/minrms,12,,MINRMS,Motivation: Existing algorithms for automated ...,2.0,4,7,4112,3349,...,0.241985,0.232656,0.227525,0.21457,-0.565001,-0.266125,-2.044399,,0.057230,0.158059
12,http://www.bioquery.org,13,,BioQuery,Summary: BioQuery is an application that helps...,0.0,4,8,2987,2916,...,0.167472,0.124948,0.12477,0.114572,-0.724206,-0.169190,-1.936309,,-0.083003,-0.045826
13,http://www.esat.kuleuven.ac.be/maran/,14,,MARAN,Summary: MARAN is a web-based application for ...,1.0,4,9,4938,488,...,0.139158,0.124356,0.112039,0.102343,0.393767,0.090869,2.023142,,0.071079,0.038350
14,http://Ligand.Info,15,,Ligand-Info,Motivation: The Ligand-Info system is based on...,2.0,4,10,2458,1074,...,0.189105,0.184302,0.164492,0.16015,1.700274,-0.982699,-0.157038,,0.101751,-0.040322


In [40]:
len(all_articles_tools)

7049

In [41]:
all_articles_tools

Unnamed: 0,title,link_to_tools,tag,views,citations_amount,altmetric_score,journal,authors,topics,abstract,...,homepage,info,active,main_name,related_articles,id,year,citations_per_year,views_per_year,relative_views
0,MASS: multiple structural alignment by seconda...,http://musket.sourceforge.net,ORIGINAL PAPER,81,41,0,Bioinformatics Oxford,"O. Dror,H. Benyamini,R. Nussinov,H. Wolfson",,We present a novel method for multiple alignme...,...,http://bioinfo3d.cs.tau.ac.il/MASS,,True,MASS,,1,2003,3.0,5,4
1,"PoPMuSiC, rationally designing point\nmutatio...",http://musket.sourceforge.net,APPLICATIONS NOTE,23,46,0,Bioinformatics Oxford,"J. M. Kwasigroch,D. Gilis,Y. Dehouck,M. Rooman",,Summary: PoPMuSiC is an efficient tool for rat...,...,http://babylone.ulb.ac.be/popmusic,,True,PoPMuSiC,,2,2002,3.0,1,4
2,ModLoop: automated modeling of loops in protei...,http://musket.sourceforge.net,APPLICATIONS NOTES,54,282,3,Bioinformatics Oxford,"András Fiser,Andrej Sali",,Summary:ModLoop is a web server for automated ...,...,http://salilab.org/modloop,,True,ModLoop,,3,2003,19.0,3,4
3,SNOW: Standard NOmenclature Wizard to help sea...,http://musket.sourceforge.net,APPLICATIONS NOTES,10,1,0,Bioinformatics Oxford,"Daniel Aguilar,Baldomero Oliva,Francesc X. Avi...",,Summary:When developing bioinformatical tools ...,...,http://ibb.uab.es/snow/,,True,SNOW,,4,2003,0.0,0,4
4,gff2aplot: Plotting sequence comparisons,http://musket.sourceforge.net,APPLICATIONS NOTES,17,4,0,Bioinformatics Oxford,"Josep F. Abril,Roderic Guigó,Thomas Wiehe",,Summary:gff2aplotis a program to visualize the...,...,http://genome.imim.es/software/gfftools/GFF2AP...,,True,gffaplot,,5,2003,0.0,1,4
5,Using credibility intervals instead of hypothe...,http://musket.sourceforge.net,ORIGINAL PAPERS,9,22,0,Bioinformatics Oxford,"Ricardo Z.N. Vêncio,Helena Brentani,Carlos A.B...",,Motivation:Statistical methods usually used to...,...,http://www.ime.usp.br/~rvencio/SAGEci/,,True,SAGE,,6,2003,1.0,0,4
6,MEGA2: molecular evolutionary genetics analysi...,http://musket.sourceforge.net,APPLICATIONS NOTE,193,0,9,Bioinformatics Oxford,"Sudhir Kumar,Koichiro Tamura,Ingrid B. Jakobse...",,Summary: We have developed a new software pack...,...,http://www.megasoftware.net,software,True,MEGA,,7,2001,0.0,11,4
7,Tricross: using dot-plots in sequence-id space...,http://musket.sourceforge.net,ORIGINAL PAPER,6,4,0,Bioinformatics Oxford,"William C. Ray,Robert S. Munson Jr,Charles J. ...",,Motivation: The process of determining the fun...,...,http://www.biosci.ohio-state.edu/~ray/bioinfor...,,True,Tricross,,8,2001,0.0,0,4
8,The systems biology markup language (SBML): a ...,http://musket.sourceforge.net,ORIGINAL PAPER,475,0,21,Bioinformatics Oxford,"M. Hucka,A. Finney,H. M. Sauro,H. Bolouri,J. C...",,Motivation: Molecular biotechnology now makes ...,...,http://www.sbml.org/,,True,SBML,,9,2003,0.0,31,4
9,TreeSAAP: Selection on Amino Acid Properties u...,http://musket.sourceforge.net,APPLICATIONS NOTE,167,142,1,Bioinformatics Oxford,"Steve Woolley,Justin Johnson,Matthew J. Smith,...",,Summary: The software program TreeSAAP measure...,...,http://genome.cs.byu.edu/treesaap.htm,,True,TreeSAAP,,10,2003,9.0,11,4


In [42]:
import plotly
plotly.offline.init_notebook_mode()
from plotly.offline import plot, iplot, init_notebook_mode
import plotly.plotly as py
import plotly.graph_objs as go

import numpy as np

x_list = only_tools.x.tolist()
y_list = only_tools.y.tolist()
z_list = only_tools.z.tolist()
tool_name = only_tools.name_tool.tolist()

trace1 = go.Scatter3d(
    x=x_list,
    y=y_list,
    z=z_list,
    text=['<br>'.join([x,y]) for x,y in zip(only_tools.name_tool, only_tools.homepage)],
    mode='markers',
    hoverinfo="text", 
    marker=dict(
        size=only_tools.relative_views,
        symbol='circle',
        cmin=4,
        cmax=34,
        colorscale='Portland',
        color = only_tools.citations_per_year, #set color equal to a variable
        showscale=True,
        line=dict(
            width=1
        ),
        opacity=0.9
    )
)


data = [trace1]
layout = go.Layout(
    title = 'Similarity Map',
    margin=dict(
        l=0,
        r=0,
        b=0,
        t=0
    ),
    xaxis=dict(
             showgrid=False,
        titlefont=dict(
            family='Courier New, monospace',
            size=18,
            color='#7f7f7f'
        )
    ),
    yaxis=dict(
        showgrid=False,
        titlefont=dict(
            family='Courier New, monospace',
            size=18,
            color='#7f7f7f'
        )
    ),
    
    
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)

In [43]:
# Create a trace
x_list_2d = only_tools.x_2d.tolist()
y_list_2d = only_tools.y_2d.tolist()
tool_name = only_tools.name_tool.tolist()

trace = go.Scattergl(
    x = x_list_2d ,
    y = y_list_2d,
    text=tool_name,
    mode = 'markers'
    
)
layout = dict(title = 'Similarity Map Basic',
              
     font=dict(family='Courier New, monospace', size=15, color='#686868'),
         xaxis=dict(
             showgrid=False,
        titlefont=dict(
            family='Courier New, monospace',
            size=0.1,
            color='#7f7f7f'
        )
    ),
    yaxis=dict(
        showgrid=False,
        titlefont=dict(
            family='Courier New, monospace',
            size=18,
            color='#7f7f7f'
        )
    )
)

data = [trace]

fig = dict(data=data, layout=layout)
iplot(fig)

In [46]:
df3=only_tools[["x", "y", "z", "name_tool","homepage","citations_per_year","relative_views" ]]

In [47]:
df3.to_csv("ALL.csv",index=False, encoding='utf-8')

In [44]:
only_tools.to_csv("All_3_main_tools.txt",sep='\t', encoding='utf-8')
all_articles_tools.to_csv("All_3_all_tools_articles.txt",sep='\t', encoding='utf-8')