# Standardize survey data

In [1]:
import sqlite3
import pandas as pd
from pandas import ExcelFile

SURVEY_DATA_COLUMNS=['species','latitude','longitude','phase','presence','magnitude']

## Input and output

In [2]:
DB='../output/survey.sqlite'
PHASE_ONE='../../data/field_survey/field_report_2019-06-11_phase1.xlsx'
PHASE_TWO=''

## Initiating dataframe
Mandatory column names for database (more can be added): `index,species,latitude,longitude,presence`

In [3]:
surveyData=pd.DataFrame(columns=SURVEY_DATA_COLUMNS)

## Phase 1
Processing one sheet at a time. The below function reads in a sheet and modifies columns.

In [4]:
def process_phase_one_sheet(sheetName):
    df=pd.read_excel(PHASE_ONE,sheet_name=sheetName)
    df.columns=map(str.lower,df.columns)
    return df

### Lantana camara

In [5]:
df=process_phase_one_sheet('Lentena')
df=df[['latitude','longitude','magnitude']]
df['species']='Lantana camara'
df['presence']=(df['magnitude']!='Absent') | (df['magnitude']!='')
df.to_csv('lantana.csv')
surveyData=surveyData.append(df,sort=True)
surveyData.shape

(538, 6)

### Chromolaena odorata

In [6]:
df=process_phase_one_sheet('Chromoleana')
df=df[['latitude','longitude','magnitude']]
df['species']='Chromolaena odorata'
df['presence']=(df['magnitude']!='Absent') | (df['magnitude']!='')
surveyData=surveyData.append(df,sort=True)
surveyData.shape

(659, 6)

###  Ipomoea carnea

In [7]:
df=process_phase_one_sheet('Ipomea')
df=df[['latitude','longitude','magnitude']]
df['species']='Ipomoea carnea'
df['presence']=(df['magnitude']!='Absent') | (df['magnitude']!='')
surveyData=surveyData.append(df,sort=True)
surveyData.shape

(947, 6)

### Mikania micrantha

In [8]:
df=process_phase_one_sheet('Mikeniea')
df=df[['latitude','longitude','magnitude']]
df['species']='Mikania micrantha'
df['presence']=(df['magnitude']!='Absent') | (df['magnitude']!='')
surveyData=surveyData.append(df,sort=True)
surveyData.shape

(1063, 6)

### Ageratina adenophora

In [9]:
df=process_phase_one_sheet('Ageratina')
df=df[['latitude','longitude','magnitude']]
df['species']='Ageratina adenophora'
df['presence']=(df['magnitude']!='Absent') | (df['magnitude']!='')
surveyData=surveyData.append(df,sort=True)
surveyData.shape

(1606, 6)

### Parthenium hysterophorus

In [10]:
df=process_phase_one_sheet('Parthenium')
df=df[['latitude','longitude','magnitude','presence_absence']]
df['species']='Parthenium hysterophorus'
df['presence']=df['presence_absence']=='Presence'
df=df.drop(['presence_absence'],axis=1)
surveyData=surveyData.append(df,sort=True)
surveyData.shape

(1986, 6)

### Eichhornia crassipes

In [11]:
df=process_phase_one_sheet('Eichorniea')
df=df[['x_coordi','y_coordi','magnitude']]
df.columns=['latitude','longitude','magnitude']
df['species']='Eichhornia crassipes'
df['presence']=(df['magnitude']!='Absent') | (df['magnitude']!='')
surveyData=surveyData.append(df,sort=True)
surveyData.shape

(2048, 6)

## Assign phase

In [12]:
surveyData['phase']=1

## Push to database
Assumes that the database has the following table:
```
CREATE TABLE "survey" (
	"species"	TEXT,
	"latitude"	REAL,
	"longitude"	REAL,
    "phase"	INTEGER,
	"presence"	INTEGER,
	"magnitude"	TEXT,
	PRIMARY KEY("species","latitude","longitude","phase")
);
```

In [16]:
conn = sqlite3.connect(DB)
surveyData.to_sql('temporary_table',conn,if_exists='replace')
cur=conn.cursor()
cur.execute('INSERT OR IGNORE INTO survey SELECT ' + ','.join(SURVEY_DATA_COLUMNS) + ' FROM temporary_table;')
cur.execute('SELECT count(*) FROM survey;')
numRows=cur.fetchall()
cur.execute('DROP TABLE temporary_table;')
conn.commit()
conn.close()

print("Number of rows in database: %d" %numRows)
print("Number of rows in dataframe: %d" %surveyData.shape[0])

[(1905,)]