In [1]:
# Goal: Process landsat_train.csv and extract
# a subset of 100,000 training instances
#
# Output: New files landsat_train_subset.csv and landsat_train_remaining.csv

import pickle
import numpy

# Configuration.
# N_LINES_TOTAL is the population the line indices are drawn from; it must
# equal the number of lines in landsat_train.csv (indices are 0-based and
# compared against the running line counter below).
N_LINES_TOTAL = 25667779
N_SUBSET = 100000
SUBSET_SEED = 0

# Generate sorted subset of indices. Since this
# list is relatively small, we can simply generate
# and keep it in main memory.
# A dedicated, seeded generator makes the split reproducible across
# kernel restarts without touching numpy's global random state.
subset = sorted(
    numpy.random.RandomState(SUBSET_SEED).choice(N_LINES_TOTAL, N_SUBSET, replace=False)
)

# Process training file line-by-line
counter = 0
counter_subset = 0

# Open files. The `with` statement closes (and flushes) all three files
# even if an exception is raised while processing.
with open("../../data/landsat_train.csv", 'r') as landsat_train, \
        open("landsat_train_subset.csv", 'w') as landsat_train_subset, \
        open("landsat_train_remaining.csv", 'w') as landsat_train_remaining:

    for line in landsat_train:

        if counter % 1000000 == 0:
            print("Processing line %i ..." % counter)

        # `subset` is sorted, so a single forward-moving cursor is enough
        # to decide whether the current line was sampled.
        if counter_subset < len(subset) and counter == subset[counter_subset]:
            # Append to subset file
            landsat_train_subset.write(line)
            counter_subset += 1
        else:
            # Append to remaining file
            landsat_train_remaining.write(line)

        counter += 1

# If the file has fewer lines than N_LINES_TOTAL, some sampled indices are
# never reached and the subset file is smaller than requested; report it
# instead of failing silently.
if counter_subset != len(subset):
    print("Warning: only %i of %i sampled lines were written (input has %i lines)"
          % (counter_subset, len(subset), counter))


Processing line 0 ...
Processing line 1000000 ...
Processing line 2000000 ...
Processing line 3000000 ...
Processing line 4000000 ...
Processing line 5000000 ...
Processing line 6000000 ...
Processing line 7000000 ...
Processing line 8000000 ...
Processing line 9000000 ...
Processing line 10000000 ...
Processing line 11000000 ...
Processing line 12000000 ...
Processing line 13000000 ...
Processing line 14000000 ...
Processing line 15000000 ...
Processing line 16000000 ...
Processing line 17000000 ...
Processing line 18000000 ...
Processing line 19000000 ...
Processing line 20000000 ...
Processing line 21000000 ...
Processing line 22000000 ...
Processing line 23000000 ...
Processing line 24000000 ...
Processing line 25000000 ...


In [2]:
from sklearn.ensemble import RandomForestClassifier

# load data
# Column 0 of the CSV is used as the label vector, all remaining
# columns as the feature matrix.
print("Loading training data ...")
data_train = numpy.genfromtxt("landsat_train_subset.csv", delimiter=",")
Xtrain, ytrain = data_train[:,1:], data_train[:,0]
print("Loaded training data: n=%i, d=%i" % (Xtrain.shape[0], Xtrain.shape[1]))

# training phase
# random_state is fixed so that re-running the cell on the same subset
# yields the same forest (bootstrap samples are otherwise drawn from
# numpy's global random state).
print("Fitting model ...")
model = RandomForestClassifier(n_estimators=10,
                               criterion='gini',
                               max_depth=None,
                               min_samples_split=2,
                               max_features=None,
                               random_state=0)
model.fit(Xtrain, ytrain)
print("Model fitted!")

# save model
# Use a context manager so the file is flushed and closed deterministically
# instead of relying on garbage collection of an anonymous file handle.
with open("model.save", 'wb') as model_file:
    pickle.dump(model, model_file)


Loading training data ...
Loaded training data: n=100000, d=9
Fitting model ...
Model fitted!


In [7]:
# Sanity check: dimensions of the feature matrix as (rows, feature columns)
Xtrain.shape

(100000, 9)