In [1]:
from scipy.special import comb
import pandas as pd
import numpy as np
import scipy.misc
scipy.misc.comb = comb
import h3

import sys
sys.path.append('project-files')
from helper_functions.gridding import h3_grid

In [2]:
# Read and concatnate data into a single dataframe
df = pd.read_csv('project-files/correlation/correlation_new.csv')
  

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119334 entries, 0 to 119333
Data columns (total 6 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   ScientificName  119281 non-null  object 
 1   TaxonID         119281 non-null  object 
 2   Date            119334 non-null  object 
 3   Latitude        119225 non-null  float64
 4   Longitude       119225 non-null  float64
 5   Municipality    119248 non-null  object 
dtypes: float64(2), object(4)
memory usage: 5.5+ MB


In [3]:
#Doing the gridding

#Drop NAs from Lat and Lon. Otherwise h3 cannot work with these.
df.dropna(subset=['Latitude', 'Longitude'], inplace=True)
df = df[['ScientificName', 'Latitude', 'Longitude']]

#String operations to remove subspecie branching
df['ScientificName'] = df['ScientificName'].str.split(' subsp.').str.get(0)
df['ScientificName'] = df['ScientificName'].str.split(' var.').str.get(0)
  
#Creating separete the dataframes for every specie
df_amanita_muscaria = df.loc[df['ScientificName'] == 'Amanita muscaria']
df_pinus_sylvestris = df.loc[df['ScientificName'] == 'Pinus sylvestris']
df_lepus_timidus = df.loc[df['ScientificName'] == 'Lepus timidus']
df_vulpes_vulpes = df.loc[df['ScientificName'] == 'Vulpes vulpes']
df_taphrina_betulina = df.loc[df['ScientificName'] == 'Taphrina betulina']
df_betula_pubescens = df.loc[df['ScientificName'] == 'Betula pubescens']

#Putting them in a list so we can loop over
specie_list = [df_amanita_muscaria, df_pinus_sylvestris, df_lepus_timidus, df_vulpes_vulpes, df_taphrina_betulina, df_betula_pubescens]

#Fitting all dataframes to grid
for i in range(len(specie_list)):
    specie_df = specie_list[i]
    grid_object = h3_grid()
    grid_object.fit(specie_df)
    grid = grid_object.grid_info()
    grid = grid.drop(columns=['observations_id', 'neighbors'])
    grid = grid.rename(columns={'count':specie_df['ScientificName'].values[0]})
    specie_list[i] = grid


specie_list

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['h3_cell'] = df.apply(self.row_to_h3cell, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['h3_cell'] = df.apply(self.row_to_h3cell, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['h3_cell'] = df.apply(self.row_to_h3cell, axis=1)
A value is trying to be set on a copy of a sli

[             h3_cell  Amanita muscaria
 0    8501268bfffffff                 1
 1    850126c7fffffff                 1
 2    850126d3fffffff                 1
 3    850126dbfffffff                 1
 4    85088383fffffff                 1
 ..               ...               ...
 604  85112e7bfffffff                 1
 605  85112e83fffffff                 6
 606  85112e8bfffffff                 5
 607  85112eaffffffff                 1
 608  85112ecbfffffff                 1
 
 [609 rows x 2 columns],
               h3_cell  Pinus sylvestris
 0     85012613fffffff                 2
 1     8501261bfffffff                15
 2     85012643fffffff                 1
 3     85012647fffffff                 2
 4     8501264ffffffff                 1
 ...               ...               ...
 1867  85112ecbfffffff                49
 1868  85112ecffffffff                21
 1869  85112ed3fffffff                22
 1870  85112ed7fffffff                16
 1871  85112edbfffffff                22
 

In [66]:
#Merge different dataframes into one for simplicty

combined_df = specie_list[0]
specie_list = specie_list[1:]

for i in range(len(specie_list)):
    specie_df = specie_list[i]
    combined_df = combined_df.merge(specie_df, how='outer')

combined_df = combined_df.fillna(0)
combined_df

Unnamed: 0,h3_cell,Lepus timidus,Vulpes vulpes,Taphrina betulina,Betula pubescens
0,85012647fffffff,1.0,0.0,0.0,28.0
1,8501264ffffffff,1.0,1.0,0.0,37.0
2,85012693fffffff,1.0,0.0,0.0,40.0
3,8501269bfffffff,1.0,0.0,0.0,0.0
4,850126c7fffffff,4.0,0.0,0.0,40.0
...,...,...,...,...,...
1892,85112ebbfffffff,0.0,0.0,0.0,7.0
1893,85112ec3fffffff,0.0,0.0,0.0,6.0
1894,85112ecbfffffff,0.0,0.0,0.0,8.0
1895,85112ecffffffff,0.0,0.0,0.0,4.0


From now on, we will calculate correlation values for different cases

Symbiotic relationship: Amanita muscaria vs Pinus sylvestris <br>
Predatory relationship: Lepus timidus vs Vulpes vulpes<br>
Parasitic relationship: Taphrina betulina vs Betula pubescens<br>

In [67]:
#
def one_way_relation(var1, var2, dataframe):
    dataframe_1 = dataframe.loc[dataframe[var1] >= 1]
    var1ToVar2 = dataframe_1.loc[dataframe_1[var2] >= 1].shape[0]
    print('Var1 Count')
    print(dataframe_1.shape[0])
    dataframe_2 = dataframe.loc[dataframe[var2] >= 1]
    print('Var2 Count')
    print(dataframe_2.shape[0])
    var2ToVar1 = dataframe_2.loc[dataframe_2[var1] >= 1].shape[0]
    return var1ToVar2, var2ToVar1

In [68]:
def count_comparsion(varA, varB, dataframe):
    varA_df_1 = dataframe.loc[dataframe[varA] >= 1]
    varA_df_meanA = varA_df_1[varA].mean()
    df_varA1_varB0 = varA_df_1.loc[varA_df_1[varB] == 0]
    varA1_varB0_mean = df_varA1_varB0[varA].mean()
    df_varA1_varB1 = varA_df_1.loc[varA_df_1[varB] >= 1]
    varA1_varB1_mean = df_varA1_varB1[varA].mean()

    varB_df_1 = dataframe.loc[dataframe[varB] >= 1]
    varB_df_meanB = varB_df_1[varB].mean()
    df_varA0_varB1 = varB_df_1.loc[varB_df_1[varA] == 0]
    varA0_varB1_mean = df_varA0_varB1[varB].mean()
    df_varB1_varA1 = varB_df_1.loc[varB_df_1[varA] >= 1]
    varB1_varA1_mean = df_varB1_varA1[varB].mean()
    return varA_df_meanA, varA1_varB0_mean, varA1_varB1_mean, varB_df_meanB, varA0_varB1_mean, varB1_varA1_mean

In [69]:
#Symbiotic case
symbiotic_df = combined_df[['h3_cell', 'Amanita muscaria', 'Pinus sylvestris']]
#symbiotic_corr = symbiotic_df.corr(method='pearson')

#print(symbiotic_corr)


print(one_way_relation('Amanita muscaria', 'Pinus sylvestris', symbiotic_df))

count_comparsion('Amanita muscaria', 'Pinus sylvestris', symbiotic_df)



KeyError: "['Amanita muscaria', 'Pinus sylvestris'] not in index"

In [70]:
#Predatory case
predatory_df = combined_df[['h3_cell', 'Lepus timidus', 'Vulpes vulpes']]
#predatory_corr = predatory_df.corr(method='pearson')

#print(predatory_corr)

print(one_way_relation('Lepus timidus', 'Vulpes vulpes', predatory_df))

count_comparsion('Lepus timidus', 'Vulpes vulpes', predatory_df)


Var1 Count
810
Var2 Count
618
(436, 436)


(6.646913580246913,
 3.620320855614973,
 9.243119266055047,
 7.106796116504855,
 2.39010989010989,
 9.075688073394495)

In [71]:
#Parasitic case
parasitic_df = combined_df[['h3_cell', 'Taphrina betulina', 'Betula pubescens']]
#parasitic_df.corr(method='pearson')

print(one_way_relation('Taphrina betulina', 'Betula pubescens', parasitic_df))

count_comparsion('Taphrina betulina', 'Betula pubescens', parasitic_df)


Var1 Count
71
Var2 Count
1880
(71, 71)


(3.76056338028169,
 nan,
 3.76056338028169,
 13.710106382978724,
 12.914317302377004,
 33.985915492957744)