# Standardize survey data for phase 3
**Note:** This notebook writes `../output/survey.sqlite`, which must be moved to `$BIODIVERSITY_DATA/survey/` for dependent modules to work.

In [1]:
import sqlite3
import pandas as pd
import numpy as np
from pandas import ExcelFile

# Columns of the main dataframe; the same list is used to select the columns
# that are copied into the `survey` table when the data is pushed to the database.
SURVEY_DATA_COLUMNS=['species','latitude','longitude','phase','presence','magnitude']

## Input and output

In [2]:
# SQLite database file that receives the standardized survey rows.
DB='../output/survey.sqlite'
# Excel workbook holding the phase 3 field survey; it is read sheet by sheet below.
PHASE_THREE='../../data/field_survey/field_report_2020-01-03_phase3.xlsx'

## Initiating dataframe
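The main dataframe has the columns of the `survey` table: `species,latitude,longitude,phase,presence,magnitude`. Each sheet must supply at least `species`, `latitude`, `longitude` and `presence`; more columns can be added, but only the ones listed above are written to the database.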

In [3]:
# Empty main dataframe; every sheet processed below is appended to it via finalize().
surveyData=pd.DataFrame(columns=SURVEY_DATA_COLUMNS)

The function `finalize()` below verifies that the coordinate columns of a sheet are floats and then appends the sheet to the main dataframe.

In [4]:
def finalize(df,surveyData):
    """Validate one sheet's dataframe and append it to the main dataframe.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of a single sheet; must contain 'latitude', 'longitude' and
        'species' columns.
    surveyData : pandas.DataFrame
        Main dataframe collected so far. It is not modified in place.

    Returns
    -------
    pandas.DataFrame
        New dataframe holding the rows of ``surveyData`` followed by ``df``.

    Raises
    ------
    TypeError
        If 'latitude' or 'longitude' does not have a float dtype.
    """
    # Checked in this order; the first offending column aborts the append.
    coordinate_errors = (
        ('latitude', "Column 'latitude' should be float."),
        ('longitude', "Column 'longitude' should be float."),
    )
    for column, message in coordinate_errors:
        if df[column].dtype != 'float':
            raise TypeError(message)

    # sort=True orders the columns when the two frames do not share the same ones.
    combined = pd.concat([surveyData, df], sort=True)
    print("After appending to main dataframe: ", combined.shape)
    print("Species:", combined['species'].unique())
    return combined

## Reading excel sheet
The sheets are processed one at a time. Each cell below reads one sheet, drops the unused columns and renames the remaining ones.
Then, the shorthand species labels are replaced one by one with the full species names. Note that there are a number of duplicate entries.

### Majimtar

In [5]:
# Read the 'majimtar' sheet and keep only the species label and the coordinates.
df=pd.read_excel(PHASE_THREE,sheet_name='majimtar')
df=df.drop(['Species','Unnamed: 2','ns1:ele','ns1:name', 'Magnitude'],axis=1)
df=df.rename(columns={'Species.1': 'species', 'lat': 'latitude', 'lon': 'longitude'})
df=df.dropna()
# Replace the shorthand labels used in the sheet with full species names.
df.loc[(df.species=='mikania'),'species']='Mikania micrantha'
df.loc[(df.species=='chromoleana'),'species']='Chromolaena odorata'
df.loc[(df.species=='lantana'),'species']='Lantana camara'
# Duplicate the rows labelled 'c/m' so that each copy can be given a single
# species below. pd.concat is used instead of DataFrame.append, which was
# deprecated in pandas 1.4 and removed in pandas 2.0. ignore_index=True
# renumbers the rows 0..n-1, which the hard-coded row labels below rely on.
df=pd.concat([df,df[df.species=='c/m']],ignore_index=True)
df['presence']=1
# NOTE(review): labels 41 and 8 presumably address the appended copy and the
# original 'c/m' row; re-check them whenever the sheet contents change.
df.loc[[41],'species']='Chromolaena odorata'
df.loc[[8],'species']='Mikania micrantha'
surveyData=finalize(df,surveyData)

After appending to main dataframe:  (42, 6)
Species: ['Mikania micrantha' 'Chromolaena odorata' 'Lantana camara']


### Muglin-Tanahu

In [6]:
# Read the 'muglin_tanahun' sheet and keep only the species label and the coordinates.
df=pd.read_excel(PHASE_THREE,sheet_name='muglin_tanahun')
df=df.drop(['elevation','magnitude'],axis=1)
df=df.rename(columns={'Species': 'species', 'lat': 'latitude', 'long': 'longitude'})
df=df.dropna()

# Replace the shorthand labels used in the sheet with full species names.
df.loc[(df.species=='mikania'),'species']='Mikania micrantha'
df.loc[(df.species=='chromoleana'),'species']='Chromolaena odorata'
df.loc[(df.species=='lantana'),'species']='Lantana camara'

# Duplicate the rows that carry a combined label so that each copy can be given
# a single species below. pd.concat is used instead of DataFrame.append, which
# was deprecated in pandas 1.4 and removed in pandas 2.0. ignore_index=True
# renumbers the rows 0..n-1, which the hard-coded row labels below rely on.
df=pd.concat([df,df[df.species=='mikania/ch']],ignore_index=True)
df=pd.concat([df,df[df.species=='lantana/mikania']],ignore_index=True)

# NOTE(review): the row labels below refer to the renumbered frame and depend on
# the current sheet contents; re-check them whenever the sheet changes.
df.loc[[7,8,9],'species']='Chromolaena odorata'
df.loc[[37],'species']='Lantana camara'
df.loc[[41,42,43,44],'species']='Mikania micrantha'

df['presence']=1

surveyData=finalize(df,surveyData)

After appending to main dataframe:  (87, 6)
Species: ['Mikania micrantha' 'Chromolaena odorata' 'Lantana camara']


###  Gorkha

In [7]:
# Load the 'Gorkha' sheet, discard the unused columns, standardize the column
# names and drop every row that still has a missing value.
df=(
    pd.read_excel(PHASE_THREE,sheet_name='Gorkha')
    .drop(columns=['Unnamed: 1','Unnamed: 5','Elevation','Magnitude'])
    .rename(columns={'Species': 'species', 'Lat': 'latitude', 'Long': 'longitude'})
    .dropna()
)

# Replace the shorthand label used in the sheet with the full species name.
df['species']=df['species'].replace({'Chromoleana': 'Chromolaena odorata'})

# Every remaining row is recorded as a presence point.
df['presence']=1

surveyData=finalize(df,surveyData)

After appending to main dataframe:  (96, 6)
Species: ['Mikania micrantha' 'Chromolaena odorata' 'Lantana camara']


### Tanahu

In [8]:
# Read the 'Tanahun' sheet and keep only the species label and the coordinates.
# The misspelled column names ('Magnirude', 'aSpecies') are the ones used in the sheet.
df=pd.read_excel(PHASE_THREE,sheet_name='Tanahun')
df=df.drop(['Elevation','Magnirude','Location'],axis=1)
df=df.rename(columns={'aSpecies': 'species', 'Lat': 'latitude', 'Long': 'longitude'})
df=df.dropna()

# Replace the shorthand labels used in the sheet with full species names.
df.loc[(df.species=='mikania'),'species']='Mikania micrantha'
df.loc[(df.species=='chromoleana'),'species']='Chromolaena odorata'
df.loc[(df.species=='Lantana'),'species']='Lantana camara'

# Duplicate the rows labelled 'l/c' so that each copy can be given a single
# species below. pd.concat is used instead of DataFrame.append, which was
# deprecated in pandas 1.4 and removed in pandas 2.0. ignore_index=True
# renumbers the rows 0..n-1, which the hard-coded row labels below rely on.
df=pd.concat([df,df[df.species=='l/c']],ignore_index=True)
# NOTE(review): labels 52 and 70 presumably address the original 'l/c' row and
# its appended copy; re-check them whenever the sheet contents change.
df.loc[[52],'species']='Chromolaena odorata'
df.loc[[70],'species']='Lantana camara'

df['presence']=1

surveyData=finalize(df,surveyData)

After appending to main dataframe:  (167, 6)
Species: ['Mikania micrantha' 'Chromolaena odorata' 'Lantana camara']


### Kaski syangja

In [9]:
# Load the 'Kaski syangja' sheet, discard the unused columns (note the trailing
# space in 'location '), standardize the column names and drop incomplete rows.
df=(
    pd.read_excel(PHASE_THREE,sheet_name='Kaski syangja')
    .drop(columns=['elevation','magnitude','location '])
    .rename(columns={'Species': 'species', 'Lat': 'latitude'})
    .dropna()
)

# Shorthand labels used in the sheet, mapped to full species names.
species_names={
    'mikania': 'Mikania micrantha',
    'chromoleana': 'Chromolaena odorata',
    'lantana': 'Lantana camara',
    'eichornia': 'Eichhornia crassipes',
    'Eichornia': 'Eichhornia crassipes',
    'Ipomea': 'Ipomoea carnea',
    'ipomea': 'Ipomoea carnea',
    'Ageratina': 'Ageratina adenophora',
    'ageratina': 'Ageratina adenophora',
}
df['species']=df['species'].replace(species_names)

# Every remaining row is recorded as a presence point.
df['presence']=1

surveyData=finalize(df,surveyData)

After appending to main dataframe:  (262, 6)
Species: ['Mikania micrantha' 'Chromolaena odorata' 'Lantana camara'
 'Eichhornia crassipes' 'Ipomoea carnea' 'Ageratina adenophora']


### Nawalparasi

In [10]:
# Load the 'Nawalparasi' sheet, discard the unused columns, standardize the
# column names and drop every row that still has a missing value.
df=(
    pd.read_excel(PHASE_THREE,sheet_name='Nawalparasi')
    .drop(columns=['elevation','Unnamed: 1','Unnamed: 5'])
    .rename(columns={'Species': 'species', 'lat': 'latitude', 'long': 'longitude'})
    .dropna()
)

# Shorthand labels used in the sheet, mapped to full species names.
species_names={
    'mikania': 'Mikania micrantha',
    'chromoleana': 'Chromolaena odorata',
    'lantana': 'Lantana camara',
    'eichornia': 'Eichhornia crassipes',
    'Eichornia': 'Eichhornia crassipes',
    'Ipomea': 'Ipomoea carnea',
    'ipomea': 'Ipomoea carnea',
    'Ageratina': 'Ageratina adenophora',
    'ageratina': 'Ageratina adenophora',
    'parthenium': 'Parthenium hysterophorus',
}
df['species']=df['species'].replace(species_names)

# Every remaining row is recorded as a presence point.
df['presence']=1

surveyData=finalize(df,surveyData)

After appending to main dataframe:  (403, 6)
Species: ['Mikania micrantha' 'Chromolaena odorata' 'Lantana camara'
 'Eichhornia crassipes' 'Ipomoea carnea' 'Ageratina adenophora'
 'Parthenium hysterophorus']


### Chitwan makwanpur

In [11]:
# Load the 'Chitwan makwanpur' sheet, discard the unused columns and
# standardize the column names.
df=(
    pd.read_excel(PHASE_THREE,sheet_name='Chitwan makwanpur')
    .drop(columns=['elevation','Magnitude','Unnamed: 5'])
    .rename(columns={'Species': 'species', 'lat': 'latitude', 'long': 'longitude'})
)
# Remove rows whose species cell is the literal text 'Species'
# (presumably header rows repeated inside the sheet - confirm against the workbook).
df=df[df['species']!='Species']

# The coordinates are cast to float explicitly here, before incomplete rows are dropped.
df=df.astype({'latitude': float, 'longitude': float}).dropna()

# Shorthand labels used in the sheet, mapped to full species names.
species_names={
    'mikania': 'Mikania micrantha',
    'chromoleana': 'Chromolaena odorata',
    'lantana': 'Lantana camara',
    'eichornia': 'Eichhornia crassipes',
    'Eichornia': 'Eichhornia crassipes',
    'Ipomea': 'Ipomoea carnea',
    'ipomea': 'Ipomoea carnea',
    'Ageratina': 'Ageratina adenophora',
    'ageratina': 'Ageratina adenophora',
    'parthenium': 'Parthenium hysterophorus',
}
df['species']=df['species'].replace(species_names)

# Every remaining row is recorded as a presence point.
df['presence']=1

surveyData=finalize(df,surveyData)

After appending to main dataframe:  (579, 6)
Species: ['Mikania micrantha' 'Chromolaena odorata' 'Lantana camara'
 'Eichhornia crassipes' 'Ipomoea carnea' 'Ageratina adenophora'
 'Parthenium hysterophorus']


## Push to database
The rows are written to the `survey` table, which is created if it does not already exist and has the following schema:
```
CREATE TABLE "survey" (
	"species"	TEXT,
	"latitude"	REAL,
	"longitude"	REAL,
    "phase"	INTEGER,
	"presence"	INTEGER,
	"magnitude"	TEXT,
	PRIMARY KEY("species","latitude","longitude","phase")
);
```

In [12]:
# Tag every row collected above as belonging to survey phase 3.
surveyData['phase']=3
conn = sqlite3.connect(DB)
try:
    cur=conn.cursor()
    # Create the target table on first use. Its composite primary key is what
    # INSERT OR REPLACE below uses to decide which existing rows to overwrite.
    cur.execute('CREATE TABLE IF NOT EXISTS "survey" ( \
    "species"    TEXT, \
    "latitude"    REAL, \
    "longitude"    REAL, \
    "phase"    INTEGER, \
    "presence"    INTEGER, \
    "magnitude"    TEXT, \
    PRIMARY KEY("species","latitude","longitude","phase") \
);')
    # Stage the dataframe in a scratch table. index=False keeps the dataframe
    # index out of it; that index is never selected by the INSERT below.
    surveyData.to_sql('temporary_table',conn,if_exists='replace',index=False)
    # Copy the staged rows into the survey table, replacing rows with the same primary key.
    cur.execute('INSERT OR REPLACE INTO survey SELECT ' + ','.join(SURVEY_DATA_COLUMNS) + ' FROM temporary_table;')
    cur.execute('DROP TABLE temporary_table;')
    conn.commit()
finally:
    # Close the connection even if one of the statements above fails.
    conn.close()

## Verification
### Distinct species list

In [13]:
# Re-open the database; conn and cur are reused by the verification cells below.
conn = sqlite3.connect(DB)
cur=conn.cursor()
# List every distinct species name stored in the survey table.
cur.execute('SELECT DISTINCT species from survey;').fetchall()

[('Ageratina adenophora',),
 ('Chromolaena odorata',),
 ('Eichhornia crassipes',),
 ('Ipomoea carnea',),
 ('Lantana camara',),
 ('Mikania micrantha',),
 ('Parthenium hysterophorus',)]

### Distinct values in 'presence' column

In [14]:
# List every distinct value stored in the 'presence' column.
cur.execute('SELECT DISTINCT presence from survey;').fetchall()

[(1,), (0,)]

### Distinct values in 'magnitude' column

In [15]:
# List every distinct value stored in the 'magnitude' column.
cur.execute('SELECT DISTINCT magnitude from survey;').fetchall()

[('Low',),
 ('Moderate',),
 ('Dense',),
 ('moderate',),
 ('Absent',),
 ('just invaded',),
 ('low',),
 ('dense',),
 ('Not recorded',),
 ('High',),
 ('Very high',),
 ('Not recorded.',),
 ('absent',),
 (None,)]

### Rows in dataframe and database

In [16]:
# Compare the number of rows in the survey table with the number of rows in
# the dataframe built by this notebook.
cur.execute('SELECT count(*) FROM survey;')
numRows=cur.fetchall()
# numRows holds a single one-column row, so [0][0] is the count itself.
print("Number of rows in database: %d" % numRows[0][0])
print("Number of rows in dataframe: %d" % len(surveyData))

Number of rows in database: 2894
Number of rows in dataframe: 579


### Checking presence/absence points for each species

In [17]:
# Per species and phase: total rows, number of presence rows (presence summed)
# and number of absence rows (1-presence summed).
cur.execute(
    'SELECT species,phase,count(*),'
    'sum(presence) AS presence,'
    'sum((1-presence)) AS absence '
    'FROM survey GROUP BY species,phase;'
)
res=cur.fetchall()
for ele in iter(res):
    print(ele)

('Ageratina adenophora', 1, 503, 102, 401)
('Ageratina adenophora', 2, 41, 0, 41)
('Ageratina adenophora', 3, 9, 9, 0)
('Chromolaena odorata', 1, 120, 120, 0)
('Chromolaena odorata', 2, 75, 12, 63)
('Chromolaena odorata', 3, 228, 228, 0)
('Eichhornia crassipes', 1, 51, 35, 16)
('Eichhornia crassipes', 3, 7, 7, 0)
('Ipomoea carnea', 1, 286, 133, 153)
('Ipomoea carnea', 2, 44, 0, 44)
('Ipomoea carnea', 3, 20, 20, 0)
('Lantana camara', 1, 508, 177, 331)
('Lantana camara', 2, 147, 106, 41)
('Lantana camara', 3, 172, 172, 0)
('Mikania micrantha', 1, 115, 54, 61)
('Mikania micrantha', 2, 41, 0, 41)
('Mikania micrantha', 3, 138, 138, 0)
('Parthenium hysterophorus', 1, 302, 246, 56)
('Parthenium hysterophorus', 2, 83, 7, 76)
('Parthenium hysterophorus', 3, 4, 4, 0)


In [18]:
# Close the connection that was opened for the verification queries.
conn.close()