## Imports

In [1]:
import numpy as np
from sklearn.cluster import KMeans
from sklearn.mixture import BayesianGaussianMixture

In [3]:
# Generate array from csv and strip the headings.
def csv_to_array(filename):
    original = np.genfromtxt(filename, delimiter=',')
    data = original[1:]
    
    return data

data = csv_to_array("testdata/complete.csv")

IOError: testdata/complete.csv not found.

## Naive classification

In [8]:
def classify_rain(data):   
    # Initiate list to keep track of classes
    classification = np.array([])

    # Loop over DBZH value, add a 0 to the list if it is rain (DBZH > 7.0),
    # else add a 1.
    for i in data[:,0]:
        if(i > 7.0):
            classification = np.append(classification, 0)
        else:
            classification = np.append(classification, 1)

    # Reshape into an appendable column
    classification = np.reshape(classification, (-1,1))


    # Add labels to data, output format: DBZH, X, Y, Z, labels
    output = np.append(data[:,(0,3,4,5)], classification, axis=1)

    np.savetxt("output/naive.csv", output, delimiter=',')
    print("Created csv file.")

classify_rain("testdata/complete.csv")

TypeError: string indices must be integers, not tuple

## K-means

In [10]:
def kmeans(data):
    clusters = 3
    # Input csv in form: DBZH, TH, VRAD, X, Y, Z
    subset = data[:,(0,2)]

    ks = KMeans(n_clusters=clusters).fit(subset)
    labels = ks.labels_

    labeleddata = np.append(data[:,(0,2,3,4,5)], np.reshape(labels,(-1,1)), axis=1)

    np.savetxt("output/kmeans.csv", labeleddata, delimiter=',')
    print("Created csv file.")

kmeans(data)

TypeError: string indices must be integers, not tuple

## Bayesian Guassian Mixture

In [5]:
def bgm(data, ratio):
    # Select on which features to cluster on:
    # 0 = DBZH
    # 1 = TH
    # 2 = VRAD
    # 3 = X coordinate
    # 4 = Y coordinate
    # 5 = Z coordinate
    data = data[:,(0,2,3,4,5)]

    # Randomly shuffle the data and divide it up in a training and test set.
    length = np.shape(data)[0]
    div = int(length * ratio)
    np.random.shuffle(data)

    trainset = data[:div]
    testset = data[div:]

    # Specify mixture settings and fit on trainingset
    # Algorithm works best on current data with either, more components and a lower prior or,
    # less components and a higher prior.
    # Least n_components needs to be 2.
    mix = BayesianGaussianMixture(n_components=6,max_iter=10000,weight_concentration_prior_type='dirichlet_distribution', weight_concentration_prior=0.00001).fit(trainset)
    print("Trainingsset fitted")

    # Predict testset
    labels = mix.predict(testset)
    print("Testset predicted")

    # Append labels to data and write to file.
    labeledData = np.append(testset, np.reshape(labels, (-1,1)), axis = 1)
    np.savetxt("output/BGM.csv", labeledData, delimiter=',')

bgm(data, 0.4)

NameError: name 'data' is not defined

## Cluster Reduction

Give the output file of one of the algorithms above as input. This will reduce the amount of clusters to 2. One for rain and one for birds. Returns the same csv but with a column appended to it. For each row containing, either a 0 for 

In [6]:
def reduction(filename):
    # Generate numpy array from csv and get the last column (the labels)
    data = np.genfromtxt(filename, delimiter = ',')
    print(np.shape(data))
    lastColumn = np.shape(data)[1]-1

    # Define dictionaries
    labelDict = {}
    classDict = {}

    # Compute how many clusters there are
    clusters = np.unique(data[:,lastColumn])

    # Make a list per cluster with the DBZ values
    for i, j in zip(data[:,0],data[:,lastColumn]):
        if(str(j) in labelDict):
            labelDict[str(j)] = np.append(labelDict[str(j)], i)
        else:
            labelDict[str(j)] = np.array([i])

    # For each of the lists in labelDict, compute the average. If that average
    # is higher than 0 we classify the cluster as rain(1), else as bird(0)
    for i in labelDict:
        classDict[str(i)] = np.average(labelDict[str(i)])
        print(classDict[str(i)])
        if(classDict[str(i)] > 0):
            classDict[str(i)] = 1.0
        else:
            classDict[str(i)] = 0.0

    # Define array of the right shape, using np.empty for efficiency.
    classes = np.empty([np.shape(data)[0],1], float)
    
    # 
    for i, j in zip(data[:,lastColumn], range(len(classes))):
        classes[j] = classDict[str(i)]

    classifiedData = np.append(data, classes, axis = 1)
    np.savetxt("output/classified.csv", classifiedData, delimiter=',')

reduction("output/new.csv")

IOError: output/new.csv not found.