In [1]:
#Needed libraries:
#Regex for text cleaning
import re

#NLP library
import nltk

#Helper for creating regex 
import string

# Lemmatisation is the algorithmic process of determining the lemma of a word based on its intended meaning.
# Lemmatisation depends on correctly identifying the intended part of speech and meaning
#of a word in a sentence, as well as within the larger context surrounding that sentence
from nltk.stem import WordNetLemmatizer
wnl=WordNetLemmatizer()

#pattern.en module contains a fast part-of-speech tagger for English (CLiPS)
from pattern.en import tag

#WordNet is a lexical database for the English language.[1] It groups English words into sets of synonyms called synsets,#
#provides short definitions and usage examples, and records a number of relations among these synonym sets or their members. 
from nltk.corpus import wordnet as wn

import numpy as np

import pandas as pd
from pandas import DataFrame, Series


from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer


from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# To delete stop words from the text
from nltk.corpus import stopwords
stopword_list=stopwords.words("english")

#Add additional stop words
stopword_list.extend(['www','mail','edu','athttps'])

#For tokenizing
from nltk.tokenize import sent_tokenize, word_tokenize

#remove special characters, this is recommended
remove_characters=re.compile('[^a-zA-Z ]')

In [3]:
def remove_special_characters(text):
    """Replace every character that is not an ASCII letter or a space with a space.

    Args:
        text: Raw text, either a UTF-8 encoded byte string or already-decoded text.

    Returns:
        The stripped text in which each character rejected by the module-level
        ``remove_characters`` pattern has been replaced by a single space.
    """
    # Only byte strings need decoding. Calling .decode() on already-decoded
    # text fails on Python 3 and, for non-ASCII input, on Python 2 as well.
    if isinstance(text, bytes):
        text = text.decode('utf-8')
    return re.sub(remove_characters, r' ', text.strip())

In [4]:
def lemmatize_text(text):
    """Return ``text`` lemmatized, with the resulting tokens joined by single spaces.

    ``pos_tag_text`` supplies lower-cased tokens paired with a WordNet tag.
    Tokens that have a tag are lemmatized with it; tokens without a tag are
    kept unchanged.
    """
    lemmas = []
    for token, wordnet_tag in pos_tag_text(text):
        if wordnet_tag:
            lemmas.append(wnl.lemmatize(token, wordnet_tag))
        else:
            lemmas.append(token)
    return ' '.join(lemmas)

In [5]:
# Annotate text tokens with POS tags
def pos_tag_text(text):
    """Tag ``text`` and return a list of ``(lower-cased word, WordNet tag or None)`` pairs."""
    # First letter of a Penn Treebank tag -> WordNet part-of-speech constant.
    wordnet_tag_by_initial = {
        'J': wn.ADJ,
        'V': wn.VERB,
        'N': wn.NOUN,
        'R': wn.ADV,
    }

    def penn_to_wn_tags(pos_tag):
        # Tags starting with any other letter (and empty tags) map to None.
        return wordnet_tag_by_initial.get(pos_tag[:1])

    # The pattern library tagger yields (word, Penn Treebank tag) pairs.
    tagged_lower_text = []
    for word, pos_tag in tag(text):
        # The lemmatizer needs lower-case words and WordNet-style tags.
        tagged_lower_text.append((word.lower(), penn_to_wn_tags(pos_tag)))
    return tagged_lower_text

In [6]:
#This function removes stopwords
def remove_stopwords(text):
    """Drop every token of ``text`` that appears in the module-level ``stopword_list``.

    Args:
        text: The text to filter; it is tokenized with ``tokenize_text``.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # Build the set once per call: the list is extended elsewhere in the
    # notebook and membership tests against a long list are linear per token.
    stopword_set = set(stopword_list)
    kept_tokens = [token for token in tokenize_text(text) if token not in stopword_set]
    return " ".join(kept_tokens)

In [7]:
#This function tokenizes the words in a sentence
def tokenize_text(text):
    """Split ``text`` into word tokens with NLTK and strip whitespace from each token.

    Args:
        text: A UTF-8 encoded byte string or already-decoded text.

    Returns:
        A list of token strings.
    """
    # Only byte strings need decoding; already-decoded text is used as is.
    if isinstance(text, bytes):
        text = text.decode('utf-8')
    return [token.strip() for token in nltk.word_tokenize(text)]

In [8]:
def normalize_abstract(abstracts):
    """Normalize each abstract: strip special characters, lemmatize, drop stopwords.

    Returns a list holding one normalized string per input abstract, in order.
    """
    # NOTE(review): remove_special_characters replaces everything except
    # letters and spaces, so sentence punctuation is already gone when
    # sent_tokenize runs and the split probably yields a single "sentence".
    # Confirm whether the split was meant to happen before the cleaning.
    def _normalize_one(abstract):
        cleaned = remove_special_characters(abstract)
        processed_sentences = [remove_stopwords(lemmatize_text(sentence))
                               for sentence in sent_tokenize(cleaned)]
        return " ".join(processed_sentences)

    return [_normalize_one(abstract) for abstract in abstracts]

In [9]:
#GETTING THE FEATURES AND THE VECTORIZER

In [10]:
def build_feature_matrix(abstracts, feature_type='frequency',
                         ngram_range=(1, 1), min_df=0.00, max_df=1.0):
    """Fit a bag-of-words vectorizer on ``abstracts`` and return it with its matrix.

    Args:
        abstracts: Iterable of (normalized) text documents.
        feature_type: ``'frequency'`` for raw term counts or ``'tfidf'`` for
            tf-idf weights; case and surrounding whitespace are ignored.
        ngram_range: Passed through to the vectorizer.
        min_df: Passed through to the vectorizer.
        max_df: Passed through to the vectorizer.

    Returns:
        A ``(vectorizer, feature_matrix)`` tuple; the matrix is cast to float.

    Raises:
        ValueError: If ``feature_type`` is neither ``'frequency'`` nor ``'tfidf'``.
    """
    feature_type = feature_type.lower().strip()

    if feature_type == 'frequency':
        vectorizer = CountVectorizer(binary=False, min_df=min_df,
                                     max_df=max_df, ngram_range=ngram_range)
    elif feature_type == 'tfidf':
        vectorizer = TfidfVectorizer(min_df=min_df, max_df=max_df,
                                     ngram_range=ngram_range, use_idf=True)
    else:
        # ValueError is still caught by callers that use `except Exception`.
        raise ValueError("Wrong feature type entered. Possible values:'frequency', 'tfidf'")

    feature_matrix = vectorizer.fit_transform(abstracts).astype(float)

    return vectorizer, feature_matrix

In [68]:
#Read in the data, delete additional columns

all_articles_tools = pd.read_table('ALL_journals_NEW.txt', keep_default_na=False)
# Get rid of the 'Unnamed: ...' columns. Dropping by prefix instead of by a
# fixed list of names keeps this cell working no matter how many of them
# the file currently contains (a hard-coded drop raises KeyError otherwise).
unnamed_columns = [column for column in all_articles_tools.columns
                   if str(column).startswith('Unnamed')]
all_articles_tools = all_articles_tools.drop(unnamed_columns, axis=1)
# create a column for storing related articles






In [69]:
len(all_articles_tools)

7382

In [70]:
# This column stores the name of each tool without any digits in it, which
# helps to find different versions of the same tool: only the first
# whitespace-delimited word of the name is kept and its digits are removed.
first_word=re.compile(r'[^\s]+')


def _main_tool_name(tool_name):
    """Return the first word of ``tool_name`` without digits, or "NULL" if there is none."""
    try:
        match = first_word.match(tool_name)
    except TypeError:
        # Not a string: keep the placeholder, as the original loop did.
        return "NULL"
    if match is None:
        # Empty or whitespace-leading name: nothing to extract.
        return "NULL"
    return ''.join(character for character in match.group(0) if not character.isdigit())


# Assign the whole column at once instead of df["col"][i] = ..., which is
# chained assignment and triggers SettingWithCopyWarning.
all_articles_tools["main_name"] = all_articles_tools["name_tool"].map(_main_tool_name)

    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


In [71]:
all_articles_tools["related_articles"]="NULL"

#Give each data point a unique id (1-based, in row order)
id_list=list(range(1, len(all_articles_tools)+1))
all_articles_tools["id"] = id_list

names_of_tools=all_articles_tools.main_name.tolist()
# create a list of names of tools that are duplicates
import collections
name_counts = collections.Counter(names_of_tools)
# "NULL" is the placeholder main_name of articles whose tool name could not
# be parsed, and "" is what remains of a digits-only name. Neither identifies
# a tool, so articles sharing them must not be grouped as versions of one tool.
placeholder_names = ("NULL", "")
duplicated_tools=[item for item, count in name_counts.items()
                  if count > 1 and item not in placeholder_names]





In [72]:
# List of ids of articles which are not the "main article" of a tool; they are
# dropped before the cosine similarity is calculated. One sub-list is appended
# per duplicated tool name.
articles_to_drop_by_ids=[]

for name in duplicated_tools:
    # ascending=False puts the most recent article first: the most recent
    # article is listed as the main one on the tool's landing page.
    related_articles=all_articles_tools.loc[all_articles_tools['main_name'] == name].sort_values(by='date', ascending=False)
    ids=related_articles.id.tolist()
    # ids are 1-based while the row labels are 0-based
    index=ids[0]-1
    # .at writes into the frame itself; df["col"][index] = ... is chained
    # assignment and may silently write to a temporary copy.
    all_articles_tools.at[index, "related_articles"]=ids[1:]
    articles_to_drop_by_ids.append(ids[1:])


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


In [73]:
#Another piece of information needed for calculating relative, per-year citations
year_pattern=re.compile('^20[0-9]{2}')


def _extract_year(date_value):
    """Return the leading 20xx year of ``date_value`` as a string, or "NULL"."""
    try:
        match = year_pattern.match(date_value)
    except TypeError:
        # Not a string: no year can be extracted.
        return "NULL"
    return match.group(0) if match else "NULL"


# One column assignment replaces the per-row chained assignment.
all_articles_tools["year"] = all_articles_tools["date"].map(_extract_year)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [74]:
# Views, citations and altmetric score must be numpy ints; other dtypes caused
# problems in the per-year calculations that follow.
for metric_column in ("altmetric_score", "views", "citations_amount"):
    all_articles_tools[metric_column] = all_articles_tools[metric_column].astype(np.int64)

In [75]:
#Set relative amount of citations - citations per year
reference_year = 2017  # last year counted; the publication year itself counts as one year
# astype(int) raises ValueError on the "NULL" placeholder, exactly like the
# int() call of the per-row loop did. Years after the reference year are
# clipped to a one-year span to avoid dividing by zero or a negative span.
years_in_print = (reference_year - all_articles_tools["year"].astype(int) + 1).clip(lower=1)
yearly_citations = all_articles_tools["citations_amount"].astype(float) / years_in_print
# Round half away from zero, as Python 2's round() did for the original loop.
all_articles_tools["citations_per_year"] = (
    np.sign(yearly_citations) * np.floor(np.abs(yearly_citations) + 0.5)
)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [76]:
reference_year = 2017  # last year counted; the publication year itself counts as one year
# Years after the reference year are clipped to a one-year span so the
# division below can never be by zero.
years_in_print = (reference_year - all_articles_tools["year"].astype(int) + 1).clip(lower=1)
# Convert to float BEFORE dividing. The original float(views/years) divided
# two integers first (integer division under Python 2) and then stored the
# result in an int column, so the fractional part was lost.
all_articles_tools["views_per_year"] = all_articles_tools["views"].astype(float) / years_in_print

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [77]:
# Set the relative amount of views, scaled to the range 4 to 34 so the value
# can be used directly as the size of a ball in the Plotly 3D scatter plot.
# The scale can be changed or dropped.
max_views=all_articles_tools["views_per_year"].max()
length=len(all_articles_tools)

yearly_views = all_articles_tools["views_per_year"].astype(float)
if max_views > 0:
    scaled_views = (yearly_views / float(max_views)) * 30 + 4
else:
    # No article has any views: give every ball the minimum size instead of
    # dividing by zero.
    scaled_views = yearly_views * 0 + 4
# floor(x + 0.5) rounds halves up, matching int(round(x)) under Python 2 for
# the positive sizes produced here.
all_articles_tools["relative_views"] = np.floor(scaled_views + 0.5).astype(int)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]


In [78]:
#change a list of lists into a regular list
articles_to_drop_by_ids = [item for sublist in articles_to_drop_by_ids for item in sublist]
ids_to_drop = set(articles_to_drop_by_ids)

# list of ids of main articles (every 1-based id that is not dropped)
id_main_articles=[article_id for article_id in range(1, len(all_articles_tools)+1)
                  if article_id not in ids_to_drop]

# Create a df only with TOOLS - main articles. A single isin() filter replaces
# one full-frame filter per dropped id, and .copy() makes only_tools an
# independent frame so later column assignments do not target a slice.
tool_columns = ["homepage", "id", "related_articles", "name_tool", "abstract",
                "citations_per_year", "relative_views", "topics"]
is_main_article = ~all_articles_tools["id"].isin(articles_to_drop_by_ids)
only_tools=all_articles_tools.loc[is_main_article, tool_columns].copy()

In [79]:
# Rename the article-level id columns so a fresh per-tool "id" can be added
# next; index=str additionally converts every row label to a string.
only_tools = only_tools.rename(index=str, columns={"id": "id_article", "related_articles": "ids_related_articles"})

## At this point I have two dataframes:
* one with MAIN TOOLS - each of them will have its own landing page on the website,
* one with all articles - I keep them for related articles; they also help with foreign keys and table organization in SQL

In [80]:
# Number the tools 1..N in row order; this is the per-tool id.
tool_count = len(only_tools)
id_list = range(tool_count + 1)[1:]
only_tools["id"] = id_list

In [189]:
all_abstracts=only_tools.abstract.tolist()
names_of_tools=only_tools.name_tool.tolist()

In [82]:
# At this point I need to work more on my abstracts and names of tools to get better results.
# These two things were shown to improve the results:
# - adding the normalized names of tools to the stop words,
# - deleting email addresses from the abstracts.
# Getting rid of this information also helps to make the keywords more relevant.

In [190]:
# Normalizing names of tools so they have the same form as names of tools in normalized abstracs
names_copy=only_tools.name_tool.tolist()
all_normalized_names=normalize_abstract(names_copy)

In [191]:
# Add normalized tool's names to the stopwords, they are not relavant for similarity calculation,faster calculation
stopword_list.extend(['www','mail','edu','athttps'])
stopword_list.extend(all_normalized_names)

In [192]:
len(stopword_list)

18169

In [193]:
## Regexes for getting rid of emails and website addresses
# NOTE: the two names are swapped relative to what the patterns match:
# `website_pattern` matches e-mail addresses (text@text) and `email_pattern`
# matches links starting with "http". Both are applied to every abstract, so
# the result is unaffected; the names are kept because other cells use them.
website_pattern=re.compile(r'[\w\.-]+@[\w\.-]+')
email_pattern=re.compile(r'http[^\s]+')

In [194]:
# Remove both regex patterns (e-mail addresses and http links) from every
# abstract, updating the list in place.
all_abstracts[:] = [
    re.sub(email_pattern, "", re.sub(website_pattern, "", abstract))
    for abstract in all_abstracts
]

### This part deals with similarity between abstracts' of MAIN TOOLS

In [88]:
# Step 1: NORMALIZE YOUR DATA
# (special characters removed, words lemmatized, stopwords dropped)
all_normalized_abstracts=normalize_abstract(all_abstracts)

# Step 2: EXTRACT FEATURES
# One tf-idf row per abstract, one column per vocabulary term.
tfidf_vectorizer, tfidf_matrix=build_feature_matrix(all_normalized_abstracts, feature_type="tfidf")

#Get the names of the features in the features matrix, so you are aware of what is happening
feature_names=tfidf_vectorizer.get_feature_names()

#Calculate the adjacency matrix
# Pairwise cosine similarity of every abstract with every abstract (square matrix).
adj_matrix=cosine_similarity(tfidf_matrix, tfidf_matrix)

### Here we save 9 closest neighbors

In [155]:
# Create placeholder columns for the 9 closest neighbors (ids) and their
# similarity levels, used by the recommendation system.
for column_prefix in ("closets_neighbor_", "similarity_neighbor_"):
    for rank in range(1, 10):
        only_tools[column_prefix + str(rank)] = "NULL"

In [156]:
# For every tool store the ids and similarity levels of its closest neighbors.
# The original cell wrote "closets_neighbor_5"/"similarity_neighbor_5" twice,
# so the 5th closest tool was overwritten by the 6th and ranks 6-9 actually
# held the 7th-10th closest tools. Here rank k always holds the k-th closest.
NEIGHBOR_COUNT = 9
data_length=len(only_tools)
neighbor_ids = [[] for _ in range(NEIGHBOR_COUNT)]
neighbor_similarities = [[] for _ in range(NEIGHBOR_COUNT)]

for i in range(data_length):
    # Rank every OTHER tool by similarity, most similar first. The tool itself
    # is excluded explicitly rather than assumed to sort first, which is not
    # guaranteed when another abstract has the same similarity.
    sorted_similarity=sorted(((value, index) for index, value in enumerate(adj_matrix[i])
                              if index != i), reverse=True)
    closest_list=sorted_similarity[:NEIGHBOR_COUNT]
    for rank in range(NEIGHBOR_COUNT):
        if rank < len(closest_list):
            similarity, neighbor_index = closest_list[rank]
            neighbor_ids[rank].append(neighbor_index + 1)  # index to id
            neighbor_similarities[rank].append(similarity)
        else:
            # Fewer tools than neighbor slots: keep the placeholder value.
            neighbor_ids[rank].append("NULL")
            neighbor_similarities[rank].append("NULL")

# Whole-column assignment is positional and avoids chained assignment.
for rank in range(NEIGHBOR_COUNT):
    only_tools["closets_neighbor_%d" % (rank + 1)] = neighbor_ids[rank]
    only_tools["similarity_neighbor_%d" % (rank + 1)] = neighbor_similarities[rank]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/

## It would also be beneficial to store normalized abstracts for all of the articles, for a more efficient search in case the other methods do not give enough results or any results at all

In [165]:
all_abstracts_stopword_list=stopwords.words("english")

In [167]:
all_abstracts_ALL_articles=all_articles_tools.abstract.tolist()

In [168]:
## Regexes for getting rid of emails and website addresses
# NOTE: the two names are swapped relative to what the patterns match:
# `website_pattern` matches e-mail addresses and `email_pattern` matches links
# starting with "http". Both are applied together, so the result is unaffected.
website_pattern=re.compile(r'[\w\.-]+@[\w\.-]+')
email_pattern=re.compile(r'http[^\s]+')

In [169]:
# Remove both regex patterns (e-mail addresses and http links) from the
# abstracts of ALL articles. The original loop iterated over the length of
# all_abstracts_ALL_articles but read and wrote all_abstracts (the tools-only
# list), so the all-articles list was never cleaned and the loop could run
# past the end of the shorter list.
all_abstracts_ALL_articles = [
    re.sub(email_pattern, "", re.sub(website_pattern, "", abstract))
    for abstract in all_abstracts_ALL_articles
]

In [170]:
# Step 1: NORMALIZE YOUR DATA
all_normalized_abstracts_ALL_articles=normalize_abstract(all_abstracts_ALL_articles)

In [171]:
all_articles_tools["abstract_normalized"]="NULL"

In [172]:
# Store the normalized abstracts next to their articles. Assigning the whole
# list at once is positional and avoids the chained assignment
# (df["col"][i] = ...) that triggers SettingWithCopyWarning.
all_articles_tools["abstract_normalized"] = all_normalized_abstracts_ALL_articles

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


### Now, it is time to get relevant keywords for the landing pages of tools

In [89]:
# That many features are avaiable
len(feature_names)

17746

In [90]:
#matrix shape is number of abstracts (rows) by number of unique words (columns)
tfidf_matrix.shape

(5995, 17746)

In [91]:
# Build a list of words you do not care about in your corpus, to add to the stopwords.
# If there are meaningless words in the feature_names list, they should be added to the stop words and adj_matrix
# should then be recalculated, OR they can simply be deleted from the accepted keywords list.


In [92]:
# Extract the most important features ~ keywords that define the tool, abstract
# In order to identify words which were given most weight by tf-idf we need to extract wieights from tfidf_matrix

In [93]:
#In order to analyze rows of the matrix we need to change it to dense
from scipy.sparse import csr_matrix
matrix_dense=tfidf_matrix.todense()

In [94]:
matrix_dense #This is a very sparse matrix

matrix([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        ..., 
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [95]:
#Add columns to store info about most importat features=words, and their assosiated weights 
only_tools["feature_list"]="NULL"
only_tools["feature_scores"]="NULL"

In [96]:
#I chose a weight of more than 0.10 for a word to be considered meaningful.
#Here we actually collect only the index of these words in the list of all the features.
#In the next step we will change each index to its word.

In [97]:
only_tools["abstract"][2558]

'The genes that produce antibodies and the immune receptors expressed on lymphocytes are not germline encoded; rather, they are somatically generated in each developing lymphocyte by a process called V(D)J recombination, which assembles specific, independent gene segments into mature composite genes. The full set of composite genes in an individual at a single point in time is referred to as the immune repertoire. V(D)J recombination is the distinguishing feature of adaptive immunity and enables effective immune responses against an essentially infinite array of antigens. Characterization of immune repertoires is critical in both basic research and clinical contexts. Recent technological advances in repertoire profiling via high-throughput sequencing have resulted in an explosion of research activity in the field. This has been accompanied by a proliferation of software tools for analysis of repertoire sequencing data. Despite the widespread use of immune repertoire profiling and analy

In [98]:
# For every tool collect the column indexes and tf-idf weights of the features
# whose weight exceeds the threshold.
FEATURE_WEIGHT_THRESHOLD = 0.10
feature_index_lists = []
feature_score_lists = []
for i in range(len(only_tools)):
    row_weights = np.asarray(matrix_dense[i]).ravel()
    selected = np.flatnonzero(row_weights > FEATURE_WEIGHT_THRESHOLD)
    # tolist() yields plain Python ints/floats. A row with no feature above
    # the threshold gets two empty lists; the original zip(*pairs)[0] raised
    # IndexError in that case.
    feature_index_lists.append(selected.tolist())
    feature_score_lists.append(row_weights[selected].tolist())

# Wrap in a Series so each cell holds one list, aligned with the frame's rows.
only_tools["feature_list"] = pd.Series(feature_index_lists, index=only_tools.index)
only_tools["feature_scores"] = pd.Series(feature_score_lists, index=only_tools.index)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [99]:
#This column will store actual words
only_tools["feature_names"]="NULL"

In [100]:
# feature indexes to words
# Iterating the column directly avoids positional [] lookups on the string
# index, and a single column assignment avoids chained assignment.
only_tools["feature_names"] = pd.Series(
    [[feature_names[item] for item in feature_indexes]
     for feature_indexes in only_tools["feature_list"]],
    index=only_tools.index,
)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [101]:
# Add all the list of feature words into one big list check frequency of accurence of the words
# choose only globally relevant words

In [102]:
# Gather the per-abstract lists of selected words into one list of lists,
# in row order.
big_list_of_words = [words_of_tool for words_of_tool in only_tools["feature_names"]]

In [103]:
# list of lists into a list- this is very long list with the same words appearing multiple times
features_list = [item for sublist in big_list_of_words for item in sublist]

In [104]:
#Add to feature list keywords from the journal itself
#They are stored as one string - words comma separated -change the string into list of words
only_tools["topics_list"]="NULL" 

In [105]:
# The journal keywords are stored as one comma-separated string per tool;
# split each string into a list of topics.
big_list_of_topics = [topics.split(",") for topics in only_tools["topics"]]
# A single column assignment replaces the per-row chained assignment.
only_tools["topics_list"] = pd.Series(big_list_of_topics, index=only_tools.index)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [106]:
# list of lists into a list
topics_list = [item for sublist in big_list_of_topics for item in sublist]

In [107]:
# Feature list includes keywords with weigth more than 0.1
#and keywords as provided by the journals-only NAR and BMC provide the journals)
features_topics_list=features_list + topics_list

In [108]:
# Some entries start with a whitespace character; remove that one leading
# character from every entry, updating the list in place.
leading_space=re.compile("^\s")
features_topics_list[:] = [leading_space.sub("", entry) for entry in features_topics_list]

In [109]:
#This library helps to identify "important words"-as per weight that are common enough to be good tahs -keywords
import collections
counter=collections.Counter(features_topics_list)

In [110]:
#Taking all words (features) with a weight of at least 0.10 is not enough; we want to consider only words that
#are both popular and meaningful. We can manually decide how many distinct keywords we want to include
#and remove words which have a high weight but are not meaningful for our purposes.

In [111]:
my_keywords_tuples=counter.most_common(319) # 300+ keywords proved to work well but can be changed

In [112]:
# Keep only the words of the (word, count) pairs, as a tuple. Unlike
# zip(*pairs)[0] this also works when zip returns an iterator (Python 3) and
# when there are no pairs at all.
my_keywords = tuple(word for word, _count in my_keywords_tuples)

In [113]:
#List of words that are in keywords list but are not meaningfull -continue to develop it
black_list_of_terms=["", "co", ",many", "non","end", "may", "tf","know", "use","via","onto","five", "four",
                     "yet", "without","go"]

In [114]:
my_keywords_clean = [x for x in my_keywords if x not in black_list_of_terms]

In [115]:
# Add all the considered keywords (selected features + journal topics) of
# each abstract into one list. The keyword vocabulary is counted on entries
# whose leading whitespace character was removed, so the same removal is
# applied here; otherwise a topic such as " genomics" could never match the
# cleaned keyword lists it is filtered against later.
only_tools["keywords_dirty"] = pd.Series(
    [[re.sub(r"^\s", "", word) for word in names + topics]
     for names, topics in zip(only_tools["feature_names"], only_tools["topics_list"])],
    index=only_tools.index,
)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [116]:
#That many different keywords which willl be featured on the landing pages
len(my_keywords_clean)

312

In [117]:
#This column will store these relevant keywords for each of the tools
only_tools["tool_keywords"]="NULL"

In [118]:
#Saving only relevant keywords
relevant_keywords = set(my_keywords_clean)  # set: constant-time membership tests
# Order and repetitions of each tool's words are kept; one column assignment
# replaces the per-row chained assignment.
only_tools["tool_keywords"] = pd.Series(
    [[word for word in all_features if word in relevant_keywords]
     for all_features in only_tools["keywords_dirty"]],
    index=only_tools.index,
)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
  after removing the cwd from sys.path.


### *At this point the relevant keywords for each tool are saved.

## This part deals with improving search results using CHAMPIONSHIP LIST, as described in https://nlp.stanford.edu/IR-book/ with slight modifications

In [119]:
#List of words that are in keywords list but are not meaningfull -continue to develop it
#This could be improved but the words look pretty relevant
black_list_of_terms_search=["", "co", ",many", "non","end", "may", "tf","know", "use","via","onto","five", "four",
                     "yet", "without","go", "june"]

In [120]:
#top 4,000 words for search
search_words_tuples=counter.most_common(4000)

In [121]:
# Keep only the words of the (word, count) pairs, as a tuple. Unlike
# zip(*pairs)[0] this also works when zip returns an iterator (Python 3) and
# when there are no pairs at all.
my_search_keywords = tuple(word for word, _count in search_words_tuples)

In [122]:
my_search_words_clean = [x for x in my_search_keywords if x not in black_list_of_terms_search]

In [123]:
#Save the words which can be used to find a given tool

In [124]:
# Save, for every tool, the words that can be used to find it.
accepted_search_words = set(my_search_words_clean)  # ~4,000 words: use a set for lookups
# Order and repetitions of each tool's words are kept; one column assignment
# replaces the per-row chained assignment.
only_tools["search_words"] = pd.Series(
    [[word for word in all_features if word in accepted_search_words]
     for all_features in only_tools["keywords_dirty"]],
    index=only_tools.index,
)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
  """


### Check the results: abstract and corresponding search words that will 'find' it

In [133]:
all_normalized_abstracts[5]

u'summary software program measure selective influence structural biochemical amino acid property cladogenesis perform goodness categorical statistical test availability package executables window pc macintosh osx java code documentation instruction manual available unix version available upon request contact correspondence address'

In [134]:
only_tools["search_words"][5]

[u'acid',
 u'address',
 u'amino',
 u'biochemical',
 u'categorical',
 u'code',
 u'correspondence',
 u'documentation',
 u'executables',
 u'goodness',
 u'influence',
 u'instruction',
 u'java',
 u'macintosh',
 u'manual',
 u'measure',
 u'osx',
 u'package',
 u'pc',
 u'perform',
 u'program',
 u'property',
 u'request',
 u'selective',
 u'statistical',
 u'structural',
 u'test',
 u'unix',
 u'upon',
 u'version',
 u'window']

In [None]:
#Create new data structure to hold the results

In [135]:
#This help with fixing tool's indexing
search_tools=pd.concat([only_tools],ignore_index=True)

In [136]:
d={'search_word': my_search_words_clean}

In [137]:
search_words_df=DataFrame(data=d)

In [138]:
# Search words df is a CHAMPIONSHIP LIST, it stores a words and list of tools' ids that are findiable by 
# that search keyword

In [139]:
search_words_df["tool_ids"]=np.empty((len(search_words_df), 0)).tolist()

In [140]:
#search_words_df

In [141]:
# Take the search words associated with each tool and put the tool's id next
# to those words in search_words_df.
# Map each search word to the list object stored in its "tool_ids" cell, so a
# lookup no longer scans the whole DataFrame once per word per tool. If a
# word appeared more than once, the first row wins (as index_list[0] did).
tool_ids_by_word = {}
for search_word, tool_ids in zip(search_words_df["search_word"], search_words_df["tool_ids"]):
    tool_ids_by_word.setdefault(search_word, tool_ids)

for all_search_words, tool_id in zip(only_tools["search_words"], only_tools["id"]):
    for word in all_search_words:
        # Appending mutates the list held by the DataFrame cell in place.
        tool_ids_by_word[word].append(tool_id)

In [142]:
#search_words_df

In [143]:
#The best way to work with this would be have words in one list and then list of lists for tools_id
#This could be improved by making the words in alphabetic order
search_df=search_words_df.sort_values( ["search_word"])

In [144]:
#search_df["search_word"].tolist()

In [174]:
search_words_list=search_df["search_word"].tolist()

In [175]:
thefile = open('search_words_t.txt', 'w')

In [176]:
# Write the search words as "[word1,word2,...,]".
thefile.write("[")
for item in search_words_list:
    thefile.write("%s," % item)
thefile.write("]")
# Close the file so buffered data is flushed to disk; without this the end of
# the list can be missing from the file.
thefile.close()

In [177]:
seach_tool_ids=search_df["tool_ids"].tolist()

In [178]:
thefile2 = open('search_tool_t.txt', 'w')

In [179]:
# Write the per-word lists of tool ids as "[[...],[...],...,]".
thefile2.write("[")
for item in seach_tool_ids:
    thefile2.write("%s," % item)
thefile2.write("]")
# Close the file so buffered data is flushed to disk; without this the end of
# the list can be missing from the file.
thefile2.close()

In [151]:
#Check the files: sometimes the lists are cut off early, i.e. the whole list is not written to the file

In [152]:
#search_df[3959:]

In [153]:
#fix weird problem 
my_list=search_df["tool_ids"][3959:].tolist()

### Feature Matrix also needs to be printed to a file 

In [187]:
# We can build feature vector for each tool based on 4,000 search words we care about

In [195]:
# Step 1: NORMALIZE YOUR DATA
all_normalized_abstracts=normalize_abstract(all_abstracts)


In [236]:
# Step 2: EXTRACT FEATURES
# NOTE: build_feature_matrix_search is defined in a later cell of this
# notebook; that cell has to be run before this one.
tfidf_vectorizer2, tfidf_matrix2=build_feature_matrix_search(all_normalized_abstracts, feature_type="tfidf")

In [206]:
def build_feature_matrix_search(abstracts, feature_type='frequency',
                         ngram_range=(1, 1), min_df=0.00, max_df=1.0,vocabulary=search_words_list):
    """Fit a vectorizer restricted to a fixed vocabulary and return it with its matrix.

    Args:
        abstracts: Iterable of (normalized) text documents.
        feature_type: ``'frequency'`` for raw term counts or ``'tfidf'`` for
            tf-idf weights; case and surrounding whitespace are ignored.
        ngram_range: Passed through to the vectorizer.
        min_df: Passed through to the vectorizer.
        max_df: Passed through to the vectorizer.
        vocabulary: Terms that define the matrix columns. Defaults to the
            ``search_words_list`` that exists when this function is defined.

    Returns:
        A ``(vectorizer, feature_matrix)`` tuple; the matrix is cast to float.

    Raises:
        ValueError: If ``feature_type`` is neither ``'frequency'`` nor ``'tfidf'``.
    """
    feature_type = feature_type.lower().strip()

    # The `vocabulary` argument is honoured in both branches. Previously the
    # tf-idf branch read the global search_words_list regardless of the
    # argument and the frequency branch ignored the vocabulary entirely.
    if feature_type == 'frequency':
        vectorizer = CountVectorizer(binary=False, min_df=min_df,
                                     max_df=max_df, ngram_range=ngram_range,
                                     vocabulary=vocabulary)
    elif feature_type == 'tfidf':
        vectorizer = TfidfVectorizer(min_df=min_df, max_df=max_df,
                                     ngram_range=ngram_range, use_idf=True,
                                     vocabulary=vocabulary)
    else:
        # ValueError is still caught by callers that use `except Exception`.
        raise ValueError("Wrong feature type entered. Possible values:'frequency', 'tfidf'")

    feature_matrix = vectorizer.fit_transform(abstracts).astype(float)

    return vectorizer, feature_matrix

In [196]:
len(search_words_list)

3986

In [237]:
tfidf_matrix2.shape

(5995, 3986)

In [216]:
# NOTE(review): after this cell tfidf_matrix2 is a dense matrix, while
# save_sparse_csr reads .data/.indices/.indptr, which looks like it needs the
# sparse matrix. Confirm whether this cell should run before the save at all.
tfidf_matrix2=tfidf_matrix2.todense()

In [238]:
save_sparse_csr("test.txt", tfidf_matrix2)

In [234]:
def save_sparse_csr(filename,array):
    """Store the CSR components of ``array`` in a NumPy ``.npz`` archive.

    The archive holds the ``data``, ``indices``, ``indptr`` and ``shape``
    attributes of ``array`` under those same key names.
    """
    components = {
        'data': array.data,
        'indices': array.indices,
        'indptr': array.indptr,
        'shape': array.shape,
    }
    np.savez(filename, **components)

In [240]:
def load_sparse_csr(filename):
    """Rebuild a CSR sparse matrix from an ``.npz`` archive.

    Args:
        filename: Path of an archive holding the arrays ``data``, ``indices``,
            ``indptr`` and ``shape`` (the keys written by ``save_sparse_csr``).

    Returns:
        The ``csr_matrix`` described by those arrays.
    """
    # The object returned by np.load keeps the archive file open; the context
    # manager closes it once the arrays have been read into the matrix.
    with np.load(filename) as loader:
        return csr_matrix((loader['data'], loader['indices'], loader['indptr']),
                          shape=tuple(loader['shape']))

In [241]:
mydata=load_sparse_csr("test.txt.npz")

In [242]:
mydata

<5995x3986 sparse matrix of type '<type 'numpy.float64'>'
	with 393729 stored elements in Compressed Sparse Row format>

### Here is everything connected to graphing tool in a 3d interactive plot

In [157]:
adj_matrix_nor =adj_matrix / adj_matrix.max()

In [158]:
# in order to use t-sne you need to change cosine similarity to cosine distance 
#cosine distance = 1 - cosine similarity
#REST OF THE CODE 

In [159]:
#Before you RUN it take a look at the parameters

import numpy as np
from sklearn.manifold import TSNE

#MODEL 3D
# metric='precomputed': the input passed to fit_transform is a distance matrix.
# NOTE(review): random_state=None presumably makes the embedding differ between
# runs; set a fixed seed if reproducible coordinates are needed.
model3D=TSNE(n_components=3, perplexity=15.0, early_exaggeration=4.0, learning_rate=100.0, n_iter=8000, n_iter_without_progress=30, min_grad_norm=1e-07, metric='precomputed', init='random', verbose=0, random_state=None, method='barnes_hut', angle=0.5)
np.set_printoptions(suppress=True)
# 1 - normalized cosine similarity = cosine distance
TSNE_data3D=model3D.fit_transform(1-adj_matrix_nor) 
# Transpose so that row 0 holds all x values, row 1 all y values, row 2 all z values.
transformed_TSNE_data3D=TSNE_data3D.transpose()

In [160]:
#MODEL 2D

model2D=TSNE(n_components=2, perplexity=10.0, early_exaggeration=4.0, learning_rate=100.0, n_iter=8000, n_iter_without_progress=30, min_grad_norm=1e-07, metric='precomputed', init='random', verbose=0, random_state=None, method='barnes_hut', angle=0.5)
np.set_printoptions(suppress=True)
TSNE_data2D=model2D.fit_transform(1-adj_matrix_nor) 
transformed_TSNE_data2D=TSNE_data2D.transpose()

In [161]:
# Manipulating data frame to add new information

#addig additional columns to data frame
only_tools["x"]=0
only_tools["y"]=0
only_tools["z"]=0
only_tools["closest_neighbors"]="NULL"
only_tools["x_2d"]=0
only_tools["y_2d"]=0

In [162]:
#assign the right values to the columns  3D case
x_coordinate = transformed_TSNE_data3D[0]
len(x_coordinate)
only_tools["x"]=x_coordinate

y_coordinate = transformed_TSNE_data3D[1]
only_tools["y"] = y_coordinate

z_coordinate = transformed_TSNE_data3D[2]
only_tools["z"]=z_coordinate

In [163]:
#assign the right values to the columns  2D case
x_coordinate_2d = transformed_TSNE_data2D[0]
only_tools["x_2d"]=x_coordinate_2d

y_coordinate_2d = transformed_TSNE_data2D[1]
only_tools["y_2d"] = y_coordinate_2d

### Print results to files which will be turned into relevant tables in the next step

In [None]:
#print vector for each tool 

In [229]:
only_tools.to_csv("backup3_search_main_tools.txt",sep='\t', encoding='utf-8')
all_articles_tools.to_csv("backup3_search_tools_articles.txt",sep='\t', encoding='utf-8')

### This is code to get a file that would be input to network2canvas algorithm

In [233]:
#CODE FOR CANVAS VISUALIZATION
# NOTE(review): the recorded output of this cell is a KeyError. No visible cell
# of this notebook creates the columns "main_tool", "similar_tool_fk" or
# "similarity" on only_tools; presumably they come from a different table.
similar_tool_canvas=only_tools[["main_tool", "similar_tool_fk", "similarity"]]

KeyError: "['main_tool' 'similar_tool_fk' 'similarity'] not in index"

In [None]:
# Replace the tool ids in both columns with tool names (ids are 1-based, hence
# the -1 when indexing).
# NOTE(review): `tools` is not defined in any visible cell of this notebook;
# confirm which frame is meant before running this.
for i in range(len(similar_tool_canvas)):
    tool_id=similar_tool_canvas["main_tool"][i]
    similar_tool_canvas["main_tool"][i]= tools["name_tool"][tool_id-1]
    similar_tool_id=similar_tool_canvas["similar_tool_fk"][i]
    similar_tool_canvas["similar_tool_fk"][i]= tools["name_tool"][similar_tool_id-1]

In [None]:
similar_tool_canvas.to_csv("canvas2.txt", header=False, index=False,sep='\t', encoding='utf-8')