# Standardize survey data

In [1]:
import sqlite3
import pandas as pd
import numpy as np
from pandas import ExcelFile

# Standard column set: used to initialise the main dataframe and, in this order,
# as the SELECT list when rows are copied into the `survey` table.
SURVEY_DATA_COLUMNS=['species','latitude','longitude','phase','presence','magnitude']

## Input and output

In [2]:
# SQLite database that receives the standardized survey rows.
DB='../output/survey.sqlite'
# Phase-1 field report: an Excel workbook, read one sheet at a time.
PHASE_ONE='../../data/field_survey/field_report_2019-06-11_phase1.xlsx'
# Phase-2 field report: a single CSV file.
PHASE_TWO='../../data/field_survey/field_report_2019-08-07_phase2.csv'

## Initiating dataframe
Mandatory column names for database (more can be added): `index,species,latitude,longitude,presence`

In [3]:
# Empty frame with the standard columns; every processed dataset is appended to it via finalize().
surveyData=pd.DataFrame(columns=SURVEY_DATA_COLUMNS)

The function `finalize()` below appends a processed dataframe to the main dataframe and prints the resulting shape.

In [4]:
def finalize(df,surveyData):
    """Append ``df`` below ``surveyData`` and return the combined frame.

    Neither input is modified. Columns that are not aligned between the two
    frames are sorted in the result, and the original row labels are kept.
    The shape of the combined frame is printed as a progress report.
    """
    frames=[surveyData,df]
    combined=pd.concat(frames,sort=True)
    print("After appending to main dataframe: ",combined.shape)
    return combined

## Phase 1
Processing one sheet at a time. The below function reads in a sheet and modifies columns.
Then, we process each species individually. Note that there are a number of duplicate entries.

In [5]:
def process_phase_one_sheet(sheetName,speciesName,path=None):
    """Read one species sheet of the phase-1 workbook and standardize it.

    Parameters
    ----------
    sheetName : str
        Worksheet name handed to ``pd.read_excel``.
    speciesName : str
        Name written to the ``species`` column. It also selects the sheet
        layout: 'Parthenium hysterophorus' has an explicit
        ``presence_absence`` column, 'Eichhornia crassipes' keeps its
        coordinates in ``x_coordi``/``y_coordi``; every other sheet is
        expected to have ``latitude``, ``longitude`` and ``magnitude``.
    path : str, optional
        Workbook to read. Defaults to the module-level ``PHASE_ONE``.

    Returns
    -------
    pandas.DataFrame
        Columns ``latitude``, ``longitude``, ``magnitude``, ``presence``,
        ``species`` and ``phase`` (always 1), with duplicate rows removed.

    Notes
    -----
    ``presence`` is False when ``magnitude`` is 'Absent' (any case) or empty.
    For Parthenium the flag taken from ``presence_absence`` is kept instead,
    and a missing magnitude is stored as 'Not recorded.'. For all other
    species, rows with a missing magnitude or coordinate are dropped.
    """
    if path is None:
        path=PHASE_ONE
    df=pd.read_excel(path,sheet_name=sheetName)
    df.columns=[str(col).lower() for col in df.columns]
    print("Original data shape: ",df.shape)

    hasExplicitPresence=speciesName=='Parthenium hysterophorus'
    # .copy() after each selection so the column assignments below never
    # target a slice of the raw sheet.
    if hasExplicitPresence:
        df=df[['latitude','longitude','magnitude','presence_absence']].copy()
        df['presence']=df['presence_absence']=='Presence'
        df=df.drop(['presence_absence'],axis=1)
    elif speciesName=='Eichhornia crassipes':
        # NOTE(review): x_coordi is renamed to latitude and y_coordi to
        # longitude; x usually denotes longitude - confirm against the sheet.
        df=df[['x_coordi','y_coordi','magnitude']].copy()
        df.columns=['latitude','longitude','magnitude']
    else:
        df=df[['latitude','longitude','magnitude']].copy()

    df=df.drop_duplicates()
    # Some sheets carry the text 'Absent' in the coordinate columns.
    df=df[df.latitude!='Absent']
    df=df[df.longitude!='Absent'].copy()
    print("After dropping duplicates and extra columns: ",df.shape)
    if hasExplicitPresence:
        df.loc[df['magnitude'].isna(),'magnitude']='Not recorded.'
    else:
        df=df[~df['magnitude'].isna() & ~df['latitude'].isna() & ~df['longitude'].isna()].copy()
    print("After dropping rows with magnitude=NaN: ",df.shape)

    if not hasExplicitPresence:
        # Both conditions must hold (the earlier `|` was always True), and the
        # label is compared case-insensitively because 'absent' also occurs.
        magnitude=df['magnitude'].astype(str).str.strip().str.lower()
        df['presence']=~magnitude.isin(['absent',''])
    df['species']=speciesName
    df['phase']=1

    return df

### Lantana camara

In [6]:
# First argument is the worksheet name looked up in the phase-1 workbook;
# second is the species name stored with every row.
df=process_phase_one_sheet('Lentena','Lantana camara')
surveyData=finalize(df,surveyData)

Original data shape:  (538, 12)
After dropping duplicates and extra columns:  (514, 3)
After dropping rows with magnitude=NaN:  (514, 3)
After appending to main dataframe:  (514, 6)


  result = method(y)


### Chromolaena odorata

In [7]:
# Worksheet 'Chromoleana' -> rows stored under 'Chromolaena odorata'.
df=process_phase_one_sheet('Chromoleana','Chromolaena odorata')
surveyData=finalize(df,surveyData)

Original data shape:  (121, 12)
After dropping duplicates and extra columns:  (120, 3)
After dropping rows with magnitude=NaN:  (120, 3)
After appending to main dataframe:  (634, 6)


###  Ipomoea carnea

In [8]:
# Worksheet 'Ipomea' -> rows stored under 'Ipomoea carnea'.
df=process_phase_one_sheet('Ipomea','Ipomoea carnea')
surveyData=finalize(df,surveyData)

Original data shape:  (288, 11)
After dropping duplicates and extra columns:  (287, 3)
After dropping rows with magnitude=NaN:  (286, 3)
After appending to main dataframe:  (920, 6)


### Mikania micrantha

In [9]:
# Worksheet 'Mikeniea' -> rows stored under 'Mikania micrantha'.
df=process_phase_one_sheet('Mikeniea','Mikania micrantha')
surveyData=finalize(df,surveyData)

Original data shape:  (116, 12)
After dropping duplicates and extra columns:  (116, 3)
After dropping rows with magnitude=NaN:  (115, 3)
After appending to main dataframe:  (1035, 6)


### Ageratina adenophora

In [10]:
# Worksheet 'Ageratina' -> rows stored under 'Ageratina adenophora'.
df=process_phase_one_sheet('Ageratina','Ageratina adenophora')
surveyData=finalize(df,surveyData)

Original data shape:  (543, 11)
After dropping duplicates and extra columns:  (521, 3)
After dropping rows with magnitude=NaN:  (520, 3)
After appending to main dataframe:  (1555, 6)


### Parthenium hysterophorus

In [11]:
# Worksheet 'Parthenium' -> rows stored under 'Parthenium hysterophorus'.
# For this species the function also reads the sheet's presence_absence column,
# which is why the intermediate shapes report four columns.
df=process_phase_one_sheet('Parthenium','Parthenium hysterophorus')
surveyData=finalize(df,surveyData)

Original data shape:  (380, 12)
After dropping duplicates and extra columns:  (335, 4)
After dropping rows with magnitude=NaN:  (335, 4)
After appending to main dataframe:  (1890, 6)


### Eichhornia crassipes

In [12]:
# Worksheet 'Eichorniea' -> rows stored under 'Eichhornia crassipes'.
# For this species the coordinates are read from the x_coordi/y_coordi columns.
df=process_phase_one_sheet('Eichorniea','Eichhornia crassipes')
surveyData=finalize(df,surveyData)

Original data shape:  (62, 10)
After dropping duplicates and extra columns:  (62, 3)
After dropping rows with magnitude=NaN:  (51, 3)
After appending to main dataframe:  (1941, 6)


## Phase 2
The phase 2 data comes in a single CSV file, which is read and standardized in one step.

In [13]:
# Keep only the coordinates, the species label and the validation status.
# .copy() so the column assignments below do not target a slice.
df=pd.read_csv(PHASE_TWO)
df=df[['longitude','latitude','Species','Field  Validation status']].copy()
df.columns=['longitude','latitude','species','presence']
df['presence']=df['presence']=='Present'
df['phase']=2

# Short field labels mapped to the canonical species names used in phase 1.
speciesNames={
    'Lantana':'Lantana camara',
    'lantana':'Lantana camara',
    'Chromoleana':'Chromolaena odorata',
    'Ipomea':'Ipomoea carnea',
    'Mikania':'Mikania micrantha',
    'Ageratina':'Ageratina adenophora',
    'parthenium':'Parthenium hysterophorus',
    'Parthenium':'Parthenium hysterophorus',
}
df['species']=df['species'].replace(speciesNames)

# A 'Parthenium/Lantana' record counts for both species: the original row
# becomes Parthenium and an explicit copy (no SettingWithCopyWarning) becomes Lantana.
isMixed=df['species']=='Parthenium/Lantana'
lantanaRows=df[isMixed].copy()
lantanaRows['species']='Lantana camara'
df.loc[isMixed,'species']='Parthenium hysterophorus'

# Rows labelled 'Absent' name no species; they are replaced by one copy per
# species listed here (Eichhornia crassipes is not in the list).
absentSpecies=['Lantana camara','Chromolaena odorata','Ipomoea carnea','Mikania micrantha','Ageratina adenophora','Parthenium hysterophorus']
isAbsent=df['species'].isin(['Absent','absent'])
absentRows=df[isAbsent]

# Single concat: relabelled rows, Lantana copies, then the per-species absences.
frames=[df[~isAbsent],lantanaRows]+[absentRows.assign(species=sp) for sp in absentSpecies]
df=pd.concat(frames,sort=True)

print("Original data shape",df.shape)
df=df.dropna()
print("After dropping rows with NaNs", df.shape)
surveyData=finalize(df,surveyData)

Original data shape (434, 5)
After dropping rows with NaNs (431, 5)
After appending to main dataframe:  (2372, 6)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


## Push to database
The rows are inserted into the following table, which the cell below creates if it does not already exist:
```
CREATE TABLE "survey" (
	"species"	TEXT,
	"latitude"	REAL,
	"longitude"	REAL,
    "phase"	INTEGER,
	"presence"	INTEGER,
	"magnitude"	TEXT,
	PRIMARY KEY("species","latitude","longitude","phase")
);
```

In [14]:
# Target table; the primary key makes (species, latitude, longitude, phase) unique.
createSurveyTable='CREATE TABLE IF NOT EXISTS "survey" ( \
    "species"    TEXT, \
    "latitude"    REAL, \
    "longitude"    REAL, \
    "phase"    INTEGER, \
    "presence"    INTEGER, \
    "magnitude"    TEXT, \
    PRIMARY KEY("species","latitude","longitude","phase") \
);'
# OR IGNORE skips rows whose primary key is already present, so only the first
# row per key is stored and re-running the cell adds nothing new.
insertSurveyRows='INSERT OR IGNORE INTO survey SELECT ' + ','.join(SURVEY_DATA_COLUMNS) + ' FROM temporary_table;'

conn = sqlite3.connect(DB)
try:
    # `with conn` commits when the block succeeds and rolls back if it raises.
    with conn:
        conn.execute(createSurveyTable)
        # Stage the dataframe in a scratch table; its row labels are not needed there.
        surveyData.to_sql('temporary_table',conn,if_exists='replace',index=False)
        conn.execute(insertSurveyRows)
        conn.execute('DROP TABLE temporary_table;')
finally:
    # Always release the database file, even when a statement above failed.
    conn.close()

## Verification
### Distinct species list

In [15]:
# Reopen the database; the verification cells that follow reuse this connection and cursor.
conn = sqlite3.connect(DB)
cur=conn.cursor()
# Every species name stored in the table, one tuple per name.
cur.execute('SELECT DISTINCT species from survey;')
cur.fetchall()

[('Ageratina adenophora',),
 ('Chromolaena odorata',),
 ('Eichhornia crassipes',),
 ('Ipomoea carnea',),
 ('Lantana camara',),
 ('Mikania micrantha',),
 ('Parthenium hysterophorus',)]

### Distinct values in 'presence' column

In [16]:
# The presence flag is stored in an INTEGER column, so only 0 and 1 should appear.
cur.execute('SELECT DISTINCT presence from survey;')
cur.fetchall()

[(1,), (0,)]

### Distinct values in 'magnitude' column

In [17]:
# Magnitude labels are stored as entered, so spelling and case variants are listed separately.
cur.execute('SELECT DISTINCT magnitude from survey;')
cur.fetchall()

[('Low',),
 ('Moderate',),
 ('Dense',),
 ('moderate',),
 ('Absent',),
 ('just invaded',),
 ('low',),
 ('dense',),
 ('Not recorded',),
 ('High',),
 ('Very high',),
 ('Not recorded.',),
 ('absent',),
 (None,)]

### Rows in dataframe and database

In [18]:
# Compare the number of rows stored in the table with the number of rows in the dataframe.
cur.execute('SELECT count(*) FROM survey;')
# fetchall() returns a list holding one 1-tuple; that tuple feeds the %d placeholder.
numRows=cur.fetchall()
print("Number of rows in database: %d" %numRows[0])
print("Number of rows in dataframe: %d" %surveyData.shape[0])

Number of rows in database: 2317
Number of rows in dataframe: 2372


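**Note:** The database holds fewer rows than the dataframe because the table's primary key is (`species`, `latitude`, `longitude`, `phase`). Rows that share these four values but differ in another column such as 'magnitude' count as duplicates, and `INSERT OR IGNORE` keeps only the first of them.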

In [19]:
# Close the connection opened for the verification queries.
conn.close()