In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

from PIL import Image
import os.path

In [2]:
# Raw 2015 crop-loss table; the separator is the read_csv default (comma).
RAP_2015_CSV = '~/Dropbox (Aalto)/MAVI2/2015/rap_2015.csv'
df = pd.read_csv(RAP_2015_CSV)

column translations:
- vuosi = YEAR
- lohkonro = field parcel
- tunnus = identifier
- kasvikoodi = PLANT CODE
- kasvi = PLANT
- lajikekood = VARIETY CODE
- lajike = VARIETY
- pintaala = Property area (Finnish "pinta-ala": the surface area of the parcel)
- tays_tuho = full crop loss
- ositt_tuho = partial crop loss

In [3]:
# Finnish column name -> English label used in the rest of the notebook.
column_labels = {
    'vuosi': 'YEAR',
    'lohkonro': 'field parcel',
    'tunnus': 'identifier',
    'kasvikoodi': 'PLANT CODE',
    'kasvi': 'PLANT',
    'lajikekood': 'VARIETY CODE',
    'lajike': 'VARIETY',
    'pintaala': 'Property area',
    'tays_tuho': 'full crop loss',
    'ositt_tuho': 'partial crop loss',
}
# index=str additionally maps every row label through str().
df = df.rename(index=str, columns=column_labels)
print('length:', len(df))

length: 465595


In [4]:
# Preview the first five rows of the renamed table.
df.iloc[:5]

Unnamed: 0,rowid,YEAR,field parcel,identifier,PLANT CODE,PLANT,VARIETY CODE,VARIETY,Property area,full crop loss,partial crop loss,xmin,xmax,ymin,ymax
0,1,2015,8340031607-A,108884,1400,Kaura,41,AKSELI,130,,,332992.292,333122.202,6750626.879,6750777.447
1,2,2015,8340033324-A,108884,1400,Kaura,41,AKSELI,211,,,332136.192,332326.2,6749464.459,6749614.102
2,3,2015,8340473258-A,108884,1400,Kaura,41,AKSELI,716,,,332180.685,332627.877,6749577.142,6749797.02
3,4,2015,0460023106-A,84735,1310,Rehuohra,82,TOCADA,46,,,596822.261,596931.075,6880077.35,6880137.385
4,5,2015,0460023510-A,84735,1310,Rehuohra,82,TOCADA,339,,,597438.622,597776.129,6879804.776,6880060.952


In [5]:
# Keep only the first row for every field parcel.
print('remove duplicates')

fieldparcel = df['field parcel']
# True for every repeated occurrence of a parcel id (the first one is False),
# so the sum is exactly the number of surplus rows.
is_repeat = fieldparcel.duplicated()

print(is_repeat.sum(), 'duplicate entries')

df = df[~is_repeat]

# Sanity check: no parcel id may occur twice after the filter.
print(df['field parcel'].duplicated().sum(), 'duplicate entries')

remove duplicates
27 duplicate entries
0 duplicate entries


In [6]:
# Series.count() is the number of non-missing values in a column.
n_full = df['full crop loss'].count()
n_partial = df['partial crop loss'].count()
print('only ', n_full, '(full) resp. ', n_partial,
      '(partial) of a total ', len(df), 'have a crop loss information')

only  7384 (full) resp.  11994 (partial) of a total  465568 have a crop loss information


select only those rows where **both** full and partial crop loss are present (note: this filter is currently commented out in the cell below, so all rows are kept)

In [7]:
# The row filter is currently commented out, so this cell only reports the
# number of rows. Uncomment the next line to keep only rows where both
# crop-loss columns are present:
# df = df.dropna(subset=['full crop loss', 'partial crop loss'])
print('length: ', df.shape[0])

length:  465568


create new column: relative crop loss = crop loss / area

In [8]:
# Relative crop loss = crop loss / parcel area, added as two new columns.
# assign() builds a new frame instead of writing into `df`, which was produced
# by boolean indexing and would otherwise raise SettingWithCopyWarning.
# Rows without crop-loss information stay NaN; a zero area gives inf/NaN.
area = df['Property area']
df = df.assign(**{
    'full crop loss scaled': df['full crop loss'] / area,
    'partial crop loss scaled': df['partial crop loss'] / area,
})
df.head(4)

Unnamed: 0,rowid,YEAR,field parcel,identifier,PLANT CODE,PLANT,VARIETY CODE,VARIETY,Property area,full crop loss,partial crop loss,xmin,xmax,ymin,ymax,full crop loss scaled,partial crop loss scaled
0,1,2015,8340031607-A,108884,1400,Kaura,41,AKSELI,130,,,332992.292,333122.202,6750626.879,6750777.447,,
1,2,2015,8340033324-A,108884,1400,Kaura,41,AKSELI,211,,,332136.192,332326.2,6749464.459,6749614.102,,
2,3,2015,8340473258-A,108884,1400,Kaura,41,AKSELI,716,,,332180.685,332627.877,6749577.142,6749797.02,,
3,4,2015,0460023106-A,84735,1310,Rehuohra,82,TOCADA,46,,,596822.261,596931.075,6880077.35,6880137.385,,


In [9]:
# Number of distinct values in each categorical column.
for label, column in (('variety codes: ', 'VARIETY CODE'),
                      ('varieties: ', 'VARIETY'),
                      ('plant codes:', 'PLANT CODE'),
                      ('plants:', 'PLANT')):
    distinct_values = np.unique(list(df[column]))
    print(label, len(distinct_values))

variety codes:  155
varieties:  553
plant codes: 58
plants: 58


In [10]:
# print('Info partial crop loss')
# print('min:', min(np.unique(list(df['partial crop loss']))))
# print('max:', max(np.unique(list(df['partial crop loss']))))
# print('unique values:', len(list(np.unique(list(df['partial crop loss'])))))
# print(list(np.unique(list(df['partial crop loss']))))

In [11]:
# plt.hist(list(np.unique(list(df['partial crop loss']))), bins=40)
# plt.title('partial crop loss')

In [12]:
# print('Info full crop loss')
# print('min:', min(np.unique(list(df['full crop loss']))))
# print('max:', max(np.unique(list(df['full crop loss']))))
# print('unique values:', len(list(np.unique(list(df['full crop loss'])))))
# print(list(np.unique(list(df['full crop loss']))))

In [13]:
# plt.hist(list(np.unique(list(df['full crop loss']))), bins=40)
# plt.title('full crop loss')

In [14]:
# Sorted array of the distinct plant names.
plant_names = list(df['PLANT'])
np.unique(plant_names)

array(['Apila', 'Auringonkukka', 'Hirssi', 'Humala', 'Härkäpapu', 'Kaura',
       'Kevätrapsi', 'Kevätruis', 'Kevätrypsi', 'Kevätspelttivehnä',
       'Kevätvehnä', 'Kuituhamppu', 'Kuitunokkonen', 'Kuitupellava',
       'Kvinoa (kinua)', 'Mailanen', 'Maissi', 'Makealupiini',
       'Mallasohra', 'Mesikkä', 'Muut valkuaiskasvit', 'Muut viljat',
       'Rehuherne', 'Rehuohra', 'Ruistankio (Camelina, Kitupellava)',
       'Ruokaherne', 'Ruokaperuna', 'Ruokateollisuusperuna',
       'Seos (herne/härkäpapu/makea lupiini/öljykasvit)',
       'Seos herne/härkäpapu/makea lupiini yli 50 %+viljaa',
       'Seoskasvusto (valkuaiskasvit)',
       'Seoskasvusto (valkuaiskasvit+vilja)',
       'Seoskasvusto (valkuaiskasvit+öljykasvit)',
       'Seoskasvusto (vilja+öljykasvit)', 'Seoskasvusto (viljat)',
       'Seoskasvusto (öljykasvit)',
       'Siemenperuna (sertifioidun siemenen tuotantoon)', 'Soijapapu',
       'Sokerimaissi', 'Syysohra', 'Syysrapsi', 'Syysruis',
       'Syysruisvehnä', 'Syysryps

translation of plants:

- Kaura - Oats
- Kevätrapsi - Spring oilseed rape
- Kevätruis - Spring rye
- Kevätrypsi - Spring turnip rape
- Kevätvehnä - spring wheat
- Mallasohra - Malting barley
- Rehuherne - feed peas
- Rehuohra - Feed barley
- Ruokaherne - Food pea
- Ruokaperuna - Food potato (table potato)
- Ruokateollisuusperuna - Food industry potato
- Seoskasvusto (valkuaiskasvit+vilja) - Mixed crop (protein crops + cereals)
- Seoskasvusto (viljat) - Mixed crop (cereals)
- Siemenperuna (sertifioidun siemenen tuotantoon) - Seed potato (for certified seed production)
- Syysruis - Winter rye
- Syysrypsi - Winter turnip rape
- Syysvehnä - Winter wheat
- Tärkkelysperuna - Starch potatoes
- Tärkkelysperunan oma siemenlisäys - Starch potato, own seed multiplication
- Vihantavilja(kaura) - Forage cereals (oats)
- Vihantavilja(ohra) - Forage cereals (barley)
- Öljypellava - Oilseed flax (linseed)

In [15]:
# Select the plant species with the largest number of samples.
plants = df['PLANT']

# One pass over the column instead of one full-frame boolean mask per species.
# sort_index() keeps the alphabetical listing order; missing plant names are
# not counted.
plant_counts = plants.value_counts().sort_index()
for plant, num_tmp in plant_counts.items():
    print(plant, '\t ', num_tmp)

if plant_counts.empty:
    # Nothing to select from: keep both names defined for the following cells.
    plant_max, num = None, 0
else:
    # idxmax() returns the first label holding the maximum, so a tie is
    # resolved in favour of the alphabetically first plant.
    plant_max = plant_counts.idxmax()
    num = int(plant_counts.max())

print('------------')
print('maximum number for', plant_max, 'with', num, 'entries')

Apila 	  556
Auringonkukka 	  304
Hirssi 	  1
Humala 	  9
Härkäpapu 	  4060
Kaura 	  130707
Kevätrapsi 	  4159
Kevätruis 	  624
Kevätrypsi 	  11102
Kevätspelttivehnä 	  96
Kevätvehnä 	  60400
Kuituhamppu 	  99
Kuitunokkonen 	  2
Kuitupellava 	  2
Kvinoa (kinua) 	  22
Mailanen 	  102
Maissi 	  215
Makealupiini 	  19
Mallasohra 	  22334
Mesikkä 	  47
Muut valkuaiskasvit 	  32
Muut viljat 	  221
Rehuherne 	  2568
Rehuohra 	  158304
Ruistankio (Camelina, Kitupellava) 	  4
Ruokaherne 	  1616
Ruokaperuna 	  12604
Ruokateollisuusperuna 	  1398
Seos (herne/härkäpapu/makea lupiini/öljykasvit) 	  78
Seos herne/härkäpapu/makea lupiini yli 50 %+viljaa 	  2669
Seoskasvusto (valkuaiskasvit) 	  149
Seoskasvusto (valkuaiskasvit+vilja) 	  3070
Seoskasvusto (valkuaiskasvit+öljykasvit) 	  53
Seoskasvusto (vilja+öljykasvit) 	  14
Seoskasvusto (viljat) 	  8990
Seoskasvusto (öljykasvit) 	  8
Siemenperuna (sertifioidun siemenen tuotantoon) 	  410
Soijapapu 	  2
Sokerimaissi 	  51
Syysohra 	  23
Syysrapsi 	  

In [16]:
# Keep only the rows of the most frequent plant species (`plant_max`).
# The mask is built from the current frame rather than from a Series captured
# in an earlier cell, so the cell can be re-run without index mismatches.
df = df[df['PLANT'] == plant_max]
print('length remaining:', len(df))

length remaining: 158304


In [17]:
# Preview the first four rows of the selected plant species.
df.iloc[:4]

Unnamed: 0,rowid,YEAR,field parcel,identifier,PLANT CODE,PLANT,VARIETY CODE,VARIETY,Property area,full crop loss,partial crop loss,xmin,xmax,ymin,ymax,full crop loss scaled,partial crop loss scaled
3,4,2015,0460023106-A,84735,1310,Rehuohra,82,TOCADA,46,,,596822.261,596931.075,6880077.35,6880137.385,,
4,5,2015,0460023510-A,84735,1310,Rehuohra,82,TOCADA,339,,,597438.622,597776.129,6879804.776,6880060.952,,
6,7,2015,1650371942-A,58419,1310,Rehuohra,AK,ELMERI,64,,,362055.439,362217.501,6751049.808,6751127.568,,
7,8,2015,4330062923-A,58419,1310,Rehuohra,AK,ELMERI,161,,,344866.56,345066.421,6740995.6,6741324.333,,


In [22]:
# Largest relative partial crop loss. Series.max() skips missing values,
# whereas the builtin max() returns NaN whenever the first value is NaN.
df['partial crop loss scaled'].max()

1.0

In [85]:
# Collect up to `t` parcels for which an image of the chosen layer exists on
# disk, together with their scaled crop-loss values.
t = 10  # maximum number of samples to collect
files = []
full_cl = []
partial_cl = []

path_to_data = 'data/'
layer = 'NDVI'
f_extension = '.png'

# Walk only the three columns that are needed instead of building a complete
# row Series per parcel, as DataFrame.iterrows() does.
# NOTE(review): rows with missing crop-loss values are not filtered out here,
# so the label arrays may contain NaN -- confirm this is intended.
samples = zip(df['field parcel'],
              df['full crop loss scaled'],
              df['partial crop loss scaled'])
for parcel, full_loss, partial_loss in samples:

    if len(files) >= t:
        break

    # Expected image name: <parcel id>_<layer><extension> inside path_to_data.
    file = os.path.join(path_to_data, str(parcel) + '_' + layer + f_extension)
    if os.path.isfile(file):
        files.append(file)
        full_cl.append(full_loss)
        partial_cl.append(partial_loss)

full_cl = np.array(full_cl)
partial_cl = np.array(partial_cl)

In [86]:
# Load every collected image into memory and combine them into one array.
# The context manager closes each file as soon as its pixels have been copied.
images = []
for fname in files:
    with Image.open(fname) as img:
        images.append(np.array(img))
x = np.array(images)

In [87]:
# Show the image paths that were found on disk and loaded above.
files

['data/9260312033-C_NDVI.png',
 'data/0050685328-A_NDVI.png',
 'data/6310176261-A_NDVI.png',
 'data/1450036731-A_NDVI.png',
 'data/3170415492-A_NDVI.png',
 'data/5010091224-A_NDVI.png',
 'data/3170382958-A_NDVI.png',
 'data/9160022622-A_NDVI.png',
 'data/9260212104-A_NDVI.png',
 'data/6260185390-A_NDVI.png']

In [88]:
import sys  # retained in case cells beyond this excerpt still rely on it

# Memory taken by the image data. ndarray.nbytes is the size of the array's
# elements, independent of whether the array owns its buffer.
print(round(x.nbytes/1024/1024,2),'MB')

7.5 MB
