In [1]:
import pandas as pd

In [2]:
!pwd

/c/Users/Windows10/code/GuiVdwinden/greeneye/notebooks


In [3]:
data = pd.read_csv('../raw_data/train_classes.csv')

In [4]:
data.head()

Unnamed: 0,image_name,tags
0,train_0,haze primary
1,train_1,agriculture clear primary water
2,train_2,clear primary
3,train_3,clear primary
4,train_4,agriculture clear habitation primary road


# '''we know there are 17 categories'''

*common labels*
>**agriculture** = Commercial agriculture

>**primary** = primary rainforest / virgin forest

>**habitation** = contain human homes or buildings

>**bare ground** = tree free areas that aren't the result of human activity. 

>**cultivation** =  subset of agriculture where individuals and families maintain farm plots for subsistence

>**road** = self-explanatory



*uncommon labels*

> **Slash burn** = a subset of the shifting cultivation label and is used for areas that demonstrate recent burn events

>**selective logging** = the practice of selectively removing high value tree species from the rainforest
>> winding dirt roads adjacent to bare brown patches in otherwise primary rain forest

>**blooming** = a natural phenomenon found in the Amazon where particular species of flowering trees bloom, fruit, and flower at the same time to maximize the chances of cross pollination.

>**conventional mining** = large-scale legal mining operations.

>**artisinal mining** = small scale mining operations.

>**blow down** = phenomena in which cold air coming from the Andes Summit topples over a certain area of trees

In [5]:
#Separated the tags just in case

atmosphere = ['clear','cloudy', 'haze','partly cloudy']

common_labels = ['rainforest', 'agriculture', 'rivers', 'towns/cities', 'roads']

uncommon_labels= ['slash burn', 'blow down', 'conventional mine', 'artisinal mine', 'blooming', 'selective logging']


In [6]:
alltags = [ 'clear', 'cloudy', 'haze', 'partly cloudy',        #the cloud coverage goes first!     
           'agriculture', 'artisinal mine', 'bare ground', 'blooming', 
           'blow down', 'cultivation', 'habitation', 'primary', 'road', 
        'selective logging', 'conventional mine', 'slashu burn','water']

# let's encode the data 

## turn the tag into a list

In [7]:
data['taglist'] = data['tags'].map(lambda x: x.split())

In [8]:
data.head()

Unnamed: 0,image_name,tags,taglist
0,train_0,haze primary,"[haze, primary]"
1,train_1,agriculture clear primary water,"[agriculture, clear, primary, water]"
2,train_2,clear primary,"[clear, primary]"
3,train_3,clear primary,"[clear, primary]"
4,train_4,agriculture clear habitation primary road,"[agriculture, clear, habitation, primary, road]"


## encode every tag in a feature (careful, it will be converted to strings!)

In [9]:
for i in alltags:
    conversor = lambda x: '1' if i in x else '0'
    data[i] = data['taglist'].map(conversor)

In [10]:
data.head()

Unnamed: 0,image_name,tags,taglist,clear,cloudy,haze,partly cloudy,agriculture,artisinal mine,bare ground,blooming,blow down,cultivation,habitation,primary,road,selective logging,conventional mine,slashu burn,water
0,train_0,haze primary,"[haze, primary]",0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1,train_1,agriculture clear primary water,"[agriculture, clear, primary, water]",1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1
2,train_2,clear primary,"[clear, primary]",1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
3,train_3,clear primary,"[clear, primary]",1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
4,train_4,agriculture clear habitation primary road,"[agriculture, clear, habitation, primary, road]",1,0,0,0,1,0,0,0,0,0,1,1,1,0,0,0,0


## make a unique code for each combination of features

In [11]:
data['result_id'] = ''
for i in alltags:
    data['result_id'] =  data['result_id'] + data[i] 
    

In [12]:
data.head()

Unnamed: 0,image_name,tags,taglist,clear,cloudy,haze,partly cloudy,agriculture,artisinal mine,bare ground,...,blow down,cultivation,habitation,primary,road,selective logging,conventional mine,slashu burn,water,result_id
0,train_0,haze primary,"[haze, primary]",0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,100000000100000
1,train_1,agriculture clear primary water,"[agriculture, clear, primary, water]",1,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,1,10001000000100001
2,train_2,clear primary,"[clear, primary]",1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,10000000000100000
3,train_3,clear primary,"[clear, primary]",1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,10000000000100000
4,train_4,agriculture clear habitation primary road,"[agriculture, clear, habitation, primary, road]",1,0,0,0,1,0,0,...,0,0,1,1,1,0,0,0,0,10001000001110000


In [13]:
data['result_id'] = data['result_id'].map(lambda x: "'" + x + "'")

get number of unique features

In [14]:
data['result_id'].nunique()

151

## make the encoded features from string to int!

In [15]:
for i in alltags:
    data[i] = data[i].map(int)

**now our dataset is encoded**

##  let's make the tags and taglist organized as before (weather coverage first, the rest, aphabetized)

In [16]:
def organizer(x):   #organize alphabetically, first the weather conditions, then the common lables
    l = []
    for i in x:
        if i in atmosphere:
            l.append(i)
    for i in x:
        if i not in atmosphere:
            l.append(i)
    x = l
    return x           

In [17]:
data['taglist'] = data['taglist'].map(organizer)

In [18]:
data.head()

Unnamed: 0,image_name,tags,taglist,clear,cloudy,haze,partly cloudy,agriculture,artisinal mine,bare ground,...,blow down,cultivation,habitation,primary,road,selective logging,conventional mine,slashu burn,water,result_id
0,train_0,haze primary,"[haze, primary]",0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,'00100000000100000'
1,train_1,agriculture clear primary water,"[clear, agriculture, primary, water]",1,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,1,'10001000000100001'
2,train_2,clear primary,"[clear, primary]",1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,'10000000000100000'
3,train_3,clear primary,"[clear, primary]",1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,'10000000000100000'
4,train_4,agriculture clear habitation primary road,"[clear, agriculture, habitation, primary, road]",1,0,0,0,1,0,0,...,0,0,1,1,1,0,0,0,0,'10001000001110000'


# save our dataset in a csvfile

## drop annoying columns 

In [19]:
!pwd

/c/Users/Windows10/code/GuiVdwinden/greeneye/notebooks


In [20]:
data.to_csv('../EncodedData.csv', index = False)

In [21]:
#the data will be in the main folder, subject to changes in the future !