In [99]:
import wikipedia
import skbio as skb
import numpy as np
import pandas as pd
import seaborn as sns
from ecopy import Mantel

from matplotlib import pyplot as plt
import pylab as pl
from qiime2 import Artifact
from skbio import TreeNode
from skbio.stats.distance import mantel
from scipy.stats import linregress
from scipy.spatial.distance import squareform, pdist
from os.path import abspath, join
from os import makedirs

from collections import defaultdict

In [100]:
## Test case when you find the correct query
print(wikipedia.summary("tiger"))
page=wikipedia.page('tiger')
print(page)
print(page.content)

The tiger (Panthera tigris) is the largest living cat species and a member of the genus Panthera. It is most recognisable for its dark vertical stripes on orange-brown fur with a lighter underside. It is an apex predator, primarily preying on ungulates such as deer and wild boar. It is territorial and generally a solitary but social predator, requiring large contiguous areas of habitat, which support its requirements for prey and rearing of its offspring. Tiger cubs stay with their mother for about two years, before they become independent and leave their mother's home range to establish their own.
The tiger was first scientifically described in 1758 and once ranged widely from the Eastern Anatolia Region in the west to the Amur River basin in the east, and in the south from the foothills of the Himalayas to Bali in the Sunda Islands. Since the early 20th century, tiger populations have lost at least 93% of their historic range and have been extirpated from Western and Central Asia, th

In [101]:
## Test case when you don't find the right query ==> first type of exception to be expected
try: 
    page=wikipedia.page('adsfjhslkjfh')
except wikipedia.exceptions.PageError: 
    print('a page error occurred')
print(page)

a page error occurred
<WikipediaPage 'Tiger'>


In [102]:
## Given that an initial page is not found the page will return the first listed query  
page=wikipedia.page('Testudo hermanni boettgeri')
print(page.content)
type(page.content)

Hermann's tortoise (Testudo hermanni) is a species in the genus Testudo. Two subspecies are known: the western Hermann's tortoise (T. h. hermanni ) and the eastern Hermann's tortoise (T. h. boettgeri ). Sometimes mentioned as a subspecies, T. h. peleponnesica is not yet confirmed to be genetically different from T. h. boettgeri.


== Etymology ==
The specific epithet, hermanni, honors French naturalist Johann Hermann.The subspecific name, boettgeri, honors German herpetologist Oskar Boettger.


== Geographic range ==
Testudo hermanni can be found throughout southern Europe. The western population (T. h. hermanni) is found in eastern Spain, southern France, the Balearic islands, Corsica, Sardinia, Sicily, south and central Italy (Tuscany). The eastern population (T. h. boettgeri ) inhabits Serbia, Kosovo, North Macedonia, Romania, Bulgaria, Albania, Turkey and Greece, while T. h. hercegovinensis populates the coasts of Bosnia and Herzegovina, Croatia, and Montenegro.


== Description an

str

In [103]:
## import host species from metadata
metadata_path='/panfs/panfs1.ucsd.edu/panscratch/jhc103/VertMetaphlan-frmerged/metadata/vert_metadata_new.txt'
df=pd.read_csv(metadata_path, sep='\t')

In [104]:
species_list=np.array(df['host_species'])
species_set=set(species_list)
print(len(species_set))
species_set

288


{'Abramis brama',
 'Acanthiza uropygialis',
 'Accipiter nisus',
 'Acrocephalus scirpaceus',
 'Acryllium vulturinum',
 'Aepyceros melampus',
 'Alburnus alburnus',
 'Aldabrachelys gigantea',
 'Alligator mississippiensis',
 'Alouatta caraya',
 'Alouatta palliata',
 'Alouatta pigra',
 'Alouatta seniculus',
 'Anas crecca',
 'Anas platyrhynchos',
 'Anguis fragilis',
 'Anser anser',
 'Aonyx cinerea',
 'Apodemus flavicollis',
 'Apodemus sylvaticus',
 'Aptenodytes patagonicus',
 'Apteryx australis',
 'Apteryx mantelli',
 'Ardea cinerea',
 'Ardeotis kori',
 'Artibeus intermedius',
 'Asio otus',
 'Aspius aspius',
 'Ateles belzebuth',
 'Ateles hybridus',
 'Balaenoptera borealis',
 'Balaenoptera physalus',
 'Balantiopteryx plicata',
 'Blicca bjoerkna',
 'Bos frontalis',
 'Bos taurus',
 'Bradypus variegatus',
 'Branta leucopsis',
 'Bufo bufo',
 'Bufotes viridis',
 'Buteo jamaicensis',
 'Calamanthus cautus',
 'Callosciurus prevosti',
 'Callosciurus prevostii',
 'Camelus dromedarius',
 'Camelus ferus'

In [251]:
## Texts to search for: 
## Carnivore: carnivore, carnivorous, meat 
## Herbivore: herbivore, herbivorous, plant
## Omnivore: omnivore, omnivorous 
## Insectivore: insectivore
## Frugivore: frugiivore
## bloodfeeder: bloodfeeder
species_diet_dict={}
count_species=0
count_failed_search_species=0
count_failed_category_species=0
## define the category and it's associated texts
carnivore = ["carnivore", "carnivorous", 'Carnivorous', 'Carnivore']
herbivore = ["herbivore", "herbivorous", "Herbivore", "Herbivorous"]
omnivore = ["omnivore", "omnivorous", "Omnivore", "Omnivorous"]
insectivore = ["insectivore", "Insectivore"]
frugivore = ["frugivore", "Frugivore"]
bloodfeeder = ["blood feeder", "Blood feeder", "Hematophagy", "hematophagy"]
scavenger = ["scavenger", "Scavenger"]
granivore = ["granivore", "Granivore", "granivorous", "Granivorous"]
nectarivore = ["nectarivore", "Nectarivore", "nectarivorous", "Nectarivorous"]
folivore = ["folivore", "Folivore", "folivorous", "Folivorous", "folivory", "Folivory"]
gummivore = ["gummivore", "Gummivore", "gummivorous", "Gummivorous", "gummivory", "Gummivory"]
filterfeeder = ["filter feeder", "Filter feeder", "filter feed", "Filter feed"]
## define key words for searching if category is not found
## Added a space in the front to elimate sentences that contain words such as threatened 
key_words=[' eat', ' feed', ' consume', " diet"]

# master dict of all list 
master_dict={"carnivore":carnivore,"herbivore":herbivore,"omnivore":omnivore,"frugivore":frugivore,"insective":insectivore, "bloodfeeder": bloodfeeder,
            "scavenger":scavenger, "granivore":granivore, "nectarivore":nectarivore, "folivore":folivore, "gummivore": gummivore, "filterfeeder": filterfeeder}
# master list of all query string
all_diet_query=carnivore + herbivore + omnivore + insectivore + frugivore + bloodfeeder + scavenger + granivore + nectarivore + folivore + gummivore + filterfeeder

# for value in all_diet_query:
#     print(value)
for species in species_set:
    count_species+=1
    ## branch 0: can we find the query? 
    try: 
        # extract content of page for the query without \n
        pageContent=wikipedia.page(species).content.replace('\n'," ").replace("=","#")
        
        ## branch 1: if none of the string is found in the page 
        ## if there is nothing found extract all sentences with key words and put it as value 
        ## if there is something found put the category with most hits 
        if not any( value in pageContent for value in all_diet_query):
            count_failed_category_species+=1
            print(str(count_species)+'. Cannot find category for this species in wikipedia: '+ species )
            key_sentence_list=[sentence + ' ' for sentence in pageContent.split('.') if any(value in sentence for value in key_words)]
            
            ## branch 3: no esily associated sentence related to diet then return a google search on species diet 
            ## Could implement for the future --> if len(key_sentence_list): 
                
            
            species_diet_dict.update({species:"<->".join(key_sentence_list)})
        else:
            ## If there are one string found look for count each individual occurence and store in dict
            ## dictionary contain the categories as keys and number of occurence as values 
            master_count_dict={}
            for keys in master_dict:
                counter=0
                for values in master_dict[keys]:
                    counter+=pageContent.count(values)
                master_count_dict.update({keys:counter})
            
            ## convert the dict into defualtdict inorder to find the max category
            ## maxCategory = max(master_count_dict, key=master_count_dict.get)
            master_category_dict = defaultdict(list)
            for key, val in master_count_dict.items():
                master_category_dict[val].append(key)
            
            ## branch 2: last situation return the categories with highest count 
            species_diet_dict.update({species:' , '.join(master_category_dict[max(master_category_dict)])})
            
    except wikipedia.exceptions.PageError: 
        count_failed_search_species+=1
        print(str(count_species)+'. Cannot find this species in wikipedia: '+ species )
        species_diet_dict.update({species:"query cannot be found"})
print("Total number of species not found in wikipedia: " + str(count_failed_search_species))
print("Total number of species without clear categories: " + str(count_failed_category_species))

2. Cannot find category for this species in wikipedia: Bufo bufo
4. Cannot find category for this species in wikipedia: Uroplatus lineatus
5. Cannot find category for this species in wikipedia: Lutra lutra
7. Cannot find category for this species in wikipedia: Carollia sowelli
8. Cannot find category for this species in wikipedia: Mesoplodon bidens
9. Cannot find category for this species in wikipedia: Rhyticeros undulatus
10. Cannot find category for this species in wikipedia: Halichoerus grypus
12. Cannot find category for this species in wikipedia: Lagenorhynchus albirostris
13. Cannot find category for this species in wikipedia: Branta leucopsis
14. Cannot find category for this species in wikipedia: Coturnix coturnix
16. Cannot find category for this species in wikipedia: Canis lupus arctos
18. Cannot find category for this species in wikipedia: Caprimulgus europaeus
19. Cannot find category for this species in wikipedia: Alburnus alburnus
20. Cannot find category for this species

193. Cannot find category for this species in wikipedia: Salamandra atra
194. Cannot find category for this species in wikipedia: Gazella subgutturosa
196. Cannot find category for this species in wikipedia: Lepidobatrachus asper
199. Cannot find category for this species in wikipedia: Chondrostoma nasus
200. Cannot find category for this species in wikipedia: Camelus ferus
201. Cannot find category for this species in wikipedia: Eolophus roseicapillus
203. Cannot find category for this species in wikipedia: Lacerta agilis
207. Cannot find category for this species in wikipedia: Zingel zingel
208. Cannot find category for this species in wikipedia: Salvelinus fontinalis
210. Cannot find category for this species in wikipedia: Calamanthus cautus
216. Cannot find category for this species in wikipedia: Casuarius casuarius
217. Cannot find category for this species in wikipedia: Falco biarmicus
219. Cannot find category for this species in wikipedia: Chloris chloris
220. Cannot find categ

In [262]:
df_host_categories = pd.DataFrame.from_dict(species_diet_dict,orient="index",columns=["categories"])
df_host_categories.reset_index(inplace=True)
df_host_categories=df_host_categories.rename(columns={"index":"host_species"})

In [263]:
df_host_categories

Unnamed: 0,host_species,categories
0,Canis lupus familiaris,carnivore
1,Bufo bufo,It becomes active at dusk and spends the nigh...
2,Sander lucioperca,carnivore
3,Uroplatus lineatus,
4,Lutra lutra,"The Eurasian otter has a diet mainly of fish,..."
...,...,...
283,Panthera leo,query cannot be found
284,Proechimys semispinosus,"It feeds on fruits and seeds, fungi, plant ma..."
285,Testudo hermanni boettgeri,They determine which plants to eat by the sen...
286,Rusa unicolor,"Sambar feed on a wide variety of vegetation, i..."


In [264]:
## Examine one of the cell values 
new_str=df_host_categories.iloc[1]['categories']
print(new_str)


 It becomes active at dusk and spends the night hunting for the invertebrates on which it feeds <-> It is voracious and eats woodlice, slugs, beetles, caterpillars, flies, earthworms and even small mice <-> It does not recognise its prey as such but will try to consume any small, dark coloured, moving object it encounters at night <-> Toads seem to use visual cues for feeding and can see their prey at low light intensities where humans are unable to discern anything <-> This comes away in tattered pieces and is then consumed <-> Birds that feed on toads include herons, crows and birds of prey <-> The tadpoles also exude noxious substances which deter fishes from eating them but not the great crested newt <-> Aquatic invertebrates that feed on toad tadpoles include dragonfly larvae, diving beetles and water boatmen <-> It lays its eggs on the toad's skin and when these hatch, the larvae crawl into the toad's nostrils and eat its flesh internally with lethal consequences <->The annual li

In [265]:
## Save df to file for integration with the excel sheet later
df_host_categories.to_csv('host_diet_category.txt',sep='\t')

In [233]:
## Testing cell
pageContent=wikipedia.page('Bufo bufo').content.replace('\n', " ")
print(pageContent)
key_words=['eat', 'feed']
list_sentence=[sentence + " " for sentence in pageContent.split('.') if any(value in sentence for value in key_word)]
print(list_sentence)

str_sentence="<->".join(list_sentence)
str_sentence

The common toad, European toad, or in Anglophone parts of Europe, simply the toad (Bufo bufo, from Latin bufo "toad"), is an amphibian found throughout most of Europe (with the exception of Ireland, Iceland, and some Mediterranean islands), in the western part of North Asia, and in a small portion of Northwest Africa. It is one of a group of closely related animals that are descended from a common ancestral line of toads and which form a species complex. The toad is an inconspicuous animal as it usually lies hidden during the day. It becomes active at dusk and spends the night hunting for the invertebrates on which it feeds. It moves with a slow, ungainly walk or short jumps, and has greyish-brown skin covered with wart-like lumps. Although toads are usually solitary animals, in the breeding season, large numbers of toads converge on certain breeding ponds, where the males compete to mate with the females. Eggs are laid in gelatinous strings in the water and later hatch out into tadpol

''

In [239]:
## Help cell
help(pd.DataFrame.to_csv)

Help on function to_csv in module pandas.core.generic:

to_csv(self, path_or_buf:Union[str, pathlib.Path, IO[~AnyStr], NoneType]=None, sep:str=',', na_rep:str='', float_format:Union[str, NoneType]=None, columns:Union[Sequence[collections.abc.Hashable], NoneType]=None, header:Union[bool, List[str]]=True, index:bool=True, index_label:Union[bool, str, Sequence[collections.abc.Hashable], NoneType]=None, mode:str='w', encoding:Union[str, NoneType]=None, compression:Union[str, Mapping[str, str], NoneType]='infer', quoting:Union[int, NoneType]=None, quotechar:str='"', line_terminator:Union[str, NoneType]=None, chunksize:Union[int, NoneType]=None, date_format:Union[str, NoneType]=None, doublequote:bool=True, escapechar:Union[str, NoneType]=None, decimal:Union[str, NoneType]='.', errors:str='strict') -> Union[str, NoneType]
    Write object to a comma-separated values (csv) file.
    
    .. versionchanged:: 0.24.0
        The order of arguments for Series was changed.
    
    Parameters
    -

In [253]:
testlist=["yo","hey","bye"]
", ".join(testlist)
testlist=[]
len(testlist)

0

In [261]:
list_species=["Uroplatus lineatus"]
for species in list_species:
    print(type(species))
    species_f=str(species)
    species_f.replace(' ',"+")
    print(species)
    

<class 'str'>
Uroplatus lineatus
