In [1]:
import csv 
import numpy as np
import math
import matplotlib
from datetime import datetime
from collections import Counter

In [2]:
## Y1 Prediction : Logistic Regression

In [3]:
# Helper function
def is_number(s):
    """Return True when ``s`` can be converted with ``float()``, else False.

    Non-numeric text makes ``float`` raise ValueError, while values such as
    ``None`` or a list raise TypeError; both simply mean "not a number" here.
    """
    try:
        float(s)
    except (TypeError, ValueError):
        return False
    return True

In [4]:
# Read every row of the raw CSV export (the header row is kept as row 0)
with open('Project1_data_untouched.csv', 'rb') as readfile:
    reader = csv.reader(readfile, delimiter=',')
    data = list(reader)

# Convert to numpy array
data = np.asarray(data)

In [5]:
# Parses the imported data into per-participant feature lists.
#
# Every list below has n entries and is indexed by int(row[0]); this only
# works if the first column holds ids in 0..n-1 -- TODO confirm against the CSV.
events = {}      # event date -> list of n booleans, True where that id attended
n = len(data)-1  # every row except the header

x_participants = range(n)
x_marathon_ratio = n*[0.0]
x_sex = n*[0]
x_age = n*[0.0]
x_oasis = n*[0]

for row in data:
    # The header row has a non-numeric first column: it still runs through the
    # loop below but never writes into events or the x_* lists.
    pid = int(row[0]) if is_number(row[0]) else None

    n_participations = 0 # Total number of events participated in
    n_marathons = 0 # Total number of marathons ran
    n_oasis = 0 # Total number of Oasis marathons attended

    # Each event occupies 5 consecutive columns starting at column 1:
    # date (i), name (i+1), type (i+2), category (i+4); column i+3 is not read here.
    # NOTE(review): a present-but-empty date cell is still counted as an event
    # -- confirm rows are not padded with empty strings.
    for i in range(1, 501, 5):
        try:
            date = row[i]
            if pid is not None:
                # `in` replaces dict.has_key, which no longer exists in Python 3
                if date not in events:
                    events[date] = n*[False]
                events[date][pid] = True

            n_participations = n_participations + 1
            j = i+2 # j indexes into the event type column
            event_type = row[j]
            if event_type == 'Marathon':
                n_marathons = n_marathons + 1

            m = i+1 # m indexes into the event name column
            event = row[m]
            # Event Name : Check how many Oasis events before
            if "Oasis" in event:
                n_oasis = n_oasis + 1

            k = i+4 # k indexes into the category column
            sex = row[k][0]  # an empty category raises IndexError -> event's sex/age skipped
            if pid is not None:
                # Sex
                if sex == 'M':
                    x_sex[pid] = True
                else :
                    x_sex[pid] = False

                # Age: Take average of age bracket values to get age variable
                if is_number(row[k][1:3]) and is_number(row[k][4:6]):
                    age = (float(row[k][1:3]) + float(row[k][4:6]))/2.0
                elif is_number(row[k][1:3]):
                    # Handle cases such as "M13+"
                    age = float(row[k][1:3])
                else:
                    # There's no age so use the marathon runner age median : 38
                    # From http://www.runningusa.org/marathon-report-2016?returnTo=main
                    age = 38
                x_age[pid] = age

        except IndexError:
            # Ran past the end of the row (or hit an empty category): skip this slot
            continue

    # Compute "marathon ratio"
    if pid is not None:
        # A row without any parsed event keeps the 0.0 default instead of
        # dividing by zero.
        if n_participations:
            x_marathon_ratio[pid] = n_marathons/float(n_participations)
        x_oasis[pid] = n_oasis

In [6]:
# Computes the sigmoid function
def sigmoid(x):
    """Logistic function 1 / (1 + exp(-x)); works on scalars and numpy arrays."""
    denominator = 1 + np.exp(-x)
    return 1.0 / denominator

In [7]:
# Computes the cost and gradient of a given theta vector
def compute_cost(theta, x, y):
    """Unregularised logistic-regression cost and gradient.

    theta -- parameter vector (intercept included), shape (m,) or (m, 1)
    x     -- design matrix whose columns line up with theta
    y     -- labels; cast to float and flattened

    Returns (J, grad): the summed cross-entropy and the flat gradient x^T (z - y).
    """
    y = y.astype(float).flatten()

    # Flatten the predictions. With a column-shaped theta, e.g. np.zeros((m, 1)),
    # x.dot(theta) is an (N, 1) column and multiplying it with the flat y would
    # broadcast into an N x N matrix, inflating both the cost and memory use.
    z = sigmoid(x.dot(theta)).flatten()

    # safe_ln clips its argument so that log(0) never occurs
    cost = -(y*safe_ln(z) + (1-y)*safe_ln(1-z))

    J = np.sum(cost)
    grad = (x.T).dot(z-y)

    return (J, grad)

In [8]:
# Computes cost and gradient of a given theta vector, with regularization added
def compute_cost_reg(theta, x, y, l):
    """L2-regularised logistic-regression cost and gradient.

    theta -- parameter vector (intercept included), shape (m,) or (m, 1)
    x     -- design matrix whose columns line up with theta
    y     -- labels; cast to float and flattened
    l     -- regularisation strength

    Reads the module-level ``n`` for the penalty scaling.
    Returns (J, grad) with J a scalar and grad a flat vector.
    """
    y = y.astype(float).flatten()
    theta_flat = theta.flatten()

    # Flatten the predictions: a column-shaped theta makes x.dot(theta) an
    # (N, 1) column, which would broadcast against the flat y into N x N.
    z = sigmoid(x.dot(theta)).flatten()

    cost = -(y*safe_ln(z) + (1-y)*safe_ln(1-z))

#     J = np.sum(cost) + (float(l)/(2*n)) * sum(abs(theta)) # L1 Regularization
    J = np.sum(cost) + (float(l)/(2*n)) * sum(np.power(theta_flat, 2)) # L2 Regularization
    # NOTE(review): the penalty in J is scaled by l/(2n) while the gradient adds
    # l*theta (its derivative would be (l/n)*theta), and the intercept is
    # penalised too -- confirm which scaling is intended before changing it.
    grad = (x.T).dot(z-y) + float(l) * theta_flat

    return (J, grad)

In [9]:
# Implements gradient descent 
def gradient_desc(x, y, theta, alpha, convergence, reg_lambda=0.3, max_iter=None):
    """Batch gradient descent on the regularised logistic cost.

    x, y        -- design matrix and labels
    theta       -- starting parameters
    alpha       -- learning rate; each step is scaled by 1/n (module-level n)
    convergence -- iteration stops once the cost drops by no more than this
                   between two consecutive iterations (this includes a cost
                   increase)
    reg_lambda  -- regularisation strength handed to compute_cost_reg
                   (default 0.3, the value that used to be hard-coded)
    max_iter    -- optional cap on the number of iterations; None (default)
                   means run until the convergence test fires

    Returns the parameters after the last update, flattened to 1-D.
    """
    y = y.astype(float)
    d_cost = 1
    cost = float('inf')
    iterations = 0

    while d_cost > convergence and (max_iter is None or iterations < max_iter):
        prev_cost = cost
#         (cost, grad) = compute_cost(theta, x, y)
        (cost, grad) = compute_cost_reg(theta, x, y, reg_lambda)
        d_cost = prev_cost-cost
        theta = theta.flatten() - (1/float(n))*alpha*grad
        iterations += 1
    return theta

In [10]:
def predict(theta, x):
    """Classify each row of x: 1 where sigmoid(x . theta) >= 0.5, else 0.

    The result keeps the float array produced by the sigmoid, with every entry
    overwritten by its 0/1 label.
    """
    probabilities = sigmoid(x.dot(theta))

    for idx in range(len(x)):
        probabilities[idx] = 1 if probabilities[idx] >= 0.5 else 0
    return probabilities

In [11]:
def safe_ln(x, minval=0.0000000001):
    """Natural log of x after raising every entry below ``minval`` up to it,
    so that zero or negative inputs never reach ``np.log``."""
    floored = x.clip(min=minval)
    return np.log(floored)

In [12]:
# Splits the data in both X and Y into K subsets of equal sizes
def k_fold_data(X, Y, K):
    """Split X (2-D) and Y row-wise into K consecutive folds.

    Fold k covers rows floor(k*N/K) up to floor((k+1)*N/K), so the folds
    together contain every row exactly once and differ in size by at most one.

    Returns (x, y): two lists of K slices each.
    """
    total = len(X)
    # Floor division keeps the slice bounds integral whichever division
    # semantics are in effect (plain `/` yields floats under true division).
    bounds = [(k * total) // K for k in range(K + 1)]
    x = [X[bounds[k]:bounds[k + 1], :] for k in range(K)]
    y = [Y[bounds[k]:bounds[k + 1]] for k in range(K)]
    return x, y

In [13]:
## Data preparation cell
# Builds the design matrix X (one row per participant) and the response Y,
# then splits both into 5 folds.

# Start by compiling everything into a big table
# NOTE: np.asarray(zip(...)) relies on zip returning a list (Python 2).
X = np.asarray(zip(np.ones(n), x_marathon_ratio, x_sex, x_age, x_oasis)) # Indexed according to participant ID
# X_headers = [["intercept"], ["marathon_ratio"], ["sex"], ["age"], ["n_oasis"]]
# X_export = X
# i = 5

# Append one attendance column per event date. The column order follows the
# iteration order of the `events` dict.
for key in events.keys():
#     X_export = np.column_stack((X_export,events[key]))
    # Response var corresponds to a single event
    # (this Y is replaced further down by the contents of 2015results.csv)
    if key == '2015-09-20':
        Y = np.asarray(events[key])
    # The rest of the data is features so put'em in X
    else:        
        X = np.column_stack((X, events[key]))
#     X_headers = np.append(X_headers, key)
#     i = i+1

##Data Export
# (disabled; note that a later cell reads 'full_data.csv')
# with open('full_data.csv', 'w') as mycsvfile:
#     thedatawriter = csv.writer(mycsvfile)
#     thedatawriter.writerow(X_headers)
#     for row in X_export:
#         thedatawriter.writerow(row)

# Now read in Y response variable
# Each CSV row becomes one row of Y, so Y is 2-D after np.asarray.
Y = []
with open('2015results.csv', 'rb') as readfile:
    reader = csv.reader(readfile, delimiter=',')
    for row in reader :
        Y.append(row)
Y = np.asarray(Y)
# NOTE(review): Y holds strings at this point; how astype(bool) maps text such
# as '0'/'1' to booleans depends on the numpy version -- confirm the result.
Y = Y.astype(bool)

# Now split the data into sets:
# After this line X and Y are lists of 5 folds, not arrays.
X, Y = k_fold_data(X, Y, 5)

In [14]:
## Training cell
# TODO: Make alpha and convergence arguments
def train(x_train, y_train, convergence_criteria, alpha):
    """Fit logistic-regression parameters with gradient descent.

    Starts from an all-zero column vector with one entry per column of
    x_train (the intercept column included) and returns whatever
    gradient_desc converges to.
    """
    n_features = len(x_train[0])
    theta_start = np.zeros((n_features, 1))
    return gradient_desc(x_train, y_train, theta_start, alpha, convergence_criteria)

In [15]:
## Test cell
def test(X_test, Y_test, theta_trained):
    """Predict labels for X_test, print and return the accuracy against Y_test.

    Prints the accuracy followed by the prediction vector and returns
    (prediction, accuracy).
    """
    prediction = predict(theta_trained, X_test)

    hits = sum(1 for j in range(len(prediction)) if prediction[j] == Y_test[j])

    accuracy = float(hits)/len(prediction)
    print(accuracy)
    print(prediction)
    return (prediction, accuracy)

In [16]:
# Set hyperparameters
convergence_criteria = 0.0005
alpha = 0.005

# Fit on all five folds at once and score on those same rows
X_all = np.row_stack((X[0], X[1], X[2], X[3], X[4]))
Y_all = np.concatenate((Y[0], Y[1], Y[2], Y[3], Y[4]))
theta_0 = train(X_all, Y_all, convergence_criteria, alpha)
(p, test_accuracy_0) = test(X_all, Y_all, theta_0)

# One predicted label per line
with open('p.csv', 'w') as mycsvfile:
    thedatawriter = csv.writer(mycsvfile)
    for row in p:
        print(row)
        thedatawriter.writerow([row])



KeyboardInterrupt: 

In [None]:
# Set hyperparameters
convergence_criteria = 0.005
alpha = 0.005

print(convergence_criteria)
print(alpha)

# K-fold cross-validation over the folds held in X / Y: each fold is held out
# once, the model is trained on the remaining folds, and accuracy is reported
# on the held-out fold first and on the training folds second.
n_folds = len(X)
thetas = []            # fitted parameters, one entry per held-out fold
test_accuracies = []   # accuracy on the held-out fold
train_accuracies = []  # accuracy on the folds used for training

for held_out in range(n_folds):
    train_folds = [fold for fold in range(n_folds) if fold != held_out]
    # np.vstack is the non-deprecated name of np.row_stack
    X_train = np.vstack([X[fold] for fold in train_folds])
    Y_train = np.concatenate([Y[fold] for fold in train_folds])

    theta = train(X_train, Y_train, convergence_criteria, alpha)
    _, test_accuracy = test(X[held_out], Y[held_out], theta)
    _, train_accuracy = test(X_train, Y_train, theta)
    print('\n')

    thetas.append(theta)
    test_accuracies.append(test_accuracy)
    train_accuracies.append(train_accuracy)

avg_accuracy = np.mean(test_accuracies)
print(avg_accuracy)

In [None]:
## Y2 Predictions: Linear regression
# Parser imports and does initial parsing
# Data_mod takes parsed data and adds missing values
# Reg does the actual regression and validation testing on formatted data


In [None]:
# Parser
def getSex(row):
    """Return 0 (female) or 1 (male) from the first conclusive category cell.

    Category cells sit at columns 5, 10, 15, ...; a cell is conclusive when,
    lower-cased, it starts with 'f' or 'm'. Rows without a conclusive cell
    default to 1.
    """
    for col in range(5, len(row), 5):      # scan categories col
        cat = row[col].lower()
        if not cat:
            continue
        if cat == 'female' or cat[0] == 'f':
            return 0
        if cat == 'male' or cat[0] == 'm':
            return 1
    return 1        # assume male if no data
    
    
def getAgeEst(row):
    """Estimate a participant's age on 09/25/2016 from their age categories.

    Event dates sit at columns 1, 6, 11, ... and the matching category four
    columns later. Categories that start with 'M' or 'F' followed by a digit
    narrow an age window (initially 13..70), shifted by the number of whole
    years between the event and 09/25/2016:
      * 'M35-45' style (6 chars) narrows both ends,
      * 'M18-'   style narrows the upper end,
      * 'M18+'   style narrows the lower end.

    Returns the floored midpoint of the window, or -1 when no category of
    that form was found.
    Raises ValueError for a date in neither 'm/d/Y' nor 'Y-m-d' format.
    """
    foundAgeCat = False
    minAge = 13
    maxAge = 70
    date_format1 = '%m/%d/%Y'
    date_format2 = '%Y-%m-%d'
    currDate = datetime.strptime('09/25/2016', date_format1) # date of upcoming marathon

    for col in range(1, len(row), 5):
        dat = row[col] # event date
        try:
            dat = datetime.strptime(dat, date_format1)
        except ValueError:
            dat = datetime.strptime(dat, date_format2)
        # how long ago the event was held, in whole years
        eventAge = (currDate - dat).days // 365

        cat = row[col+4] # event category
        # The length check keeps one-character categories such as 'M' from
        # raising IndexError on cat[1].
        if len(cat) >= 2 and cat[0] in 'MF' and cat[1] in '1234567890':
            foundAgeCat = True
            if len(cat) == 6: # of the form M35-45
                minAge = max(minAge, int(cat[1:3])+eventAge)
                maxAge = min(maxAge, int(cat[4:6])+eventAge)
            elif len(cat) == 4: # of the form M18- or M18+
                if cat[3] == '-':
                    maxAge = min(maxAge, int(cat[1:3])+eventAge)
                elif cat[3] == '+':
                    minAge = max(minAge, int(cat[1:3])+eventAge)

    if not foundAgeCat:
        return -1       # no age data found
    return (minAge + maxAge) // 2


def _is_half_marathon(event_type):
    """True when the event-type cell names a half marathon."""
    lowered = event_type.lower()
    return (event_type == '1/2 Marathon - Demi-marathon'
            or '21.1 km' in lowered
            or lowered in ('demi marathon', 'demi-marathon', 'half marathon',
                           'half marathon - demi marathon',
                           'scotiabank ottawa half marathon'))


def _is_excluded_name(name):
    """True when the lower-cased event name contains one of the markers that
    disqualify an event from the distance matching below."""
    markers = ('velo', 'fondo', "l'echapee belle", "cyc", "hiver", 'VÃ‰LO'.lower())
    return any(marker in name for marker in markers)


def _distance_label(lowered):
    """Map a lower-cased event type to its distance label in km, or None when
    it matches none of the recognised distances. Checks run in a fixed order."""
    if '21 km' in lowered:
        return '21'
    if '22.2 km' in lowered:
        return '22'
    if '23 km' in lowered:
        return '23'
    if '23.5 km' in lowered:
        return '23.5'
    if lowered.startswith(('25 km', '25km')):
        return '25'
    if '30 km' in lowered:
        return '30'
    if '30 m' in lowered:
        return '48.2803'    # 30 miles expressed in km
    if '42.2 km' in lowered or lowered in ('marathon', 'ottawa marathon',
                                           'scotiabank ottawa marathon'):
        return '42.2'
    if '45 km' in lowered:
        return '45'
    if '48 km' in lowered:
        return '48'
    if '60 km' in lowered or '60km' in lowered:
        return '60'
    if '50 km' in lowered or '50km' in lowered:
        return '50'
    if '65 km' in lowered or '65km' in lowered:
        return '65'
    return None


def filterEvents(row):
    """Keep only events of a recognised distance and normalise their type cell.

    ``row`` is [id, date, name, type, time, category, date, name, ...]. For
    every 5-column event the type cell (columns 3, 8, 13, ...) is replaced by
    the distance in km as a string; events whose type is not recognised, or
    whose name carries an excluded marker, are deleted together with their
    five columns. Half marathons are recognised regardless of the event name.

    The row is modified in place and also returned.
    """
    myIter = 3
    while myIter < len(row):
        name = row[myIter-1].lower()
        event_type = row[myIter]

        if _is_half_marathon(event_type):
            # Decided here and not re-examined: falling through to the
            # distance matching would find no match for '21.1' and drop the
            # event.
            label = '21.1'
        elif _is_excluded_name(name):
            label = None
        else:
            label = _distance_label(event_type.lower())

        if label is None:
            # Drop date, name, type, time and category; the next event slides
            # into the current position, so myIter stays put.
            del row[myIter - 2: myIter + 3]
        else:
            row[myIter] = label
            myIter += 5
    return row
            
        
            
# Write filtered_data.csv: one header row, then one row per participant with
# sex and estimated age inserted after the id and only the recognised events
# kept. Both files are opened once instead of reopening the output for every
# input row.
with open('filtered_data.csv', 'w') as outfile, open('Project1_data.csv', 'r') as infile:
    writer = csv.writer(outfile)
    writer.writerow(['PARTICIPANT ID', 'SEX', 'EST. Current Age'] + ['EVENT DATE', 'EVENT NAME','EVENT LENGTH','TIME','CATEGORY']*8)

    reader = csv.reader(infile)
    for row in reader:
        if row[0] == 'PARTICIPANT ID':
            continue
        # Sex and age are read from the untouched row: filterEvents deletes
        # columns from `row` in place.
        sex = getSex(row)
        age = getAgeEst(row)
        filteredRow = filterEvents(row)
        outRow = [filteredRow[0], sex, age] + filteredRow[1:]
        writer.writerow(outRow)


In [None]:
# Robert: data_mod1.py
'''modifies speed and adds age to those who don't have it yet'''

# Input: the per-participant CSV written by the parser cell (filtered_data.csv)
inFile = 'filtered_data.csv'
# Output: same rows with adjusted times, written by this cell
outFile = 'modData.csv'


def getAvAge(fileName):
    """Average of the third CSV column over all rows where it is numeric and
    not -1 (the marker for "age unknown"). Non-numeric cells, such as the
    header, are skipped."""
    total = 0
    count = 0
    with open(fileName, 'r') as csvfile:
        for row in csv.reader(csvfile):
            try:
                age = float(row[2])
            except ValueError:
                continue
            if age != -1:
                total += age
                count += 1
    return total/count

#avSpeeds[3] = 1/2 + (avSpeeds[2]+avSpeeds[4])
avAge = getAvAge(inFile)


def _hms_to_hours(text):
    """Convert 'h:m:s' to hours as a float (float division keeps the minutes
    and seconds even where `/` on two ints floors)."""
    (h, m, s) = text.split(':')
    return int(h) + int(m)/60.0 + int(s)/3600.0


def _hours_to_hms(hours):
    """Format a duration in hours as 'h:m:s' (no zero padding)."""
    h = int(hours)
    m = int((hours - h)*60)
    s = int((hours - h - m/60.0)*3600)
    return str(h) + ':' + str(m) + ':' + str(s)


# Rewrite every event time as an equivalent marathon time and fill in missing
# ages. Row layout: [id, sex, age, date, name, length, time, category, ...],
# so the first TIME cell is column 6 and events repeat every 5 columns.
with open(outFile, 'w') as outfile, open(inFile, 'r') as infile:
    reader = csv.reader(infile)
    writer = csv.writer(outfile)
    for row in reader:
        myIter = 6
        if row[0] == 'PARTICIPANT ID':
            try:
                while row[myIter]:
                    row[myIter] = 'ADJUSTED MARATHON TIME'
                    myIter += 5
            except IndexError:
                pass    # walked off the end of the header
        else:
            # Ages come back from the CSV as text, so the "unknown" marker is
            # the string '-1'; replace it with the average age.
            if row[2] == '-1':
                row[2] = avAge     #guess the age

            try:
                while row[myIter]:
                    if row[myIter] == '-1':
                        # No usable time: drop the event (date..category).
                        # The next event slides into place, so myIter stays.
                        del row[myIter - 3: myIter + 2]
                    else:
                        length = float(row[myIter-1])
                        hours = _hms_to_hours(row[myIter])
                        # Scale the time to marathon distance (42.2 km) with
                        # exponent 1.06
                        adjTime = hours * (42.2/length)**1.06
                        if adjTime > 6:
                            # Adjusted times above 6 hours are discarded
                            del row[myIter - 3: myIter + 2]
                        else:
                            row[myIter] = _hours_to_hms(adjTime)
                            myIter += 5
            except IndexError:
                pass    # walked off the end of the row
        writer.writerow(row)
                            


In [None]:
class SamplePoint(object):
    """One regression sample: an event time explained by sex, age at the event,
    the time of another event, and an event count.

    Attributes
      ident        -- participant id, stored as given
      sex          -- int(sex)
      EvAge        -- age - 2016 + event year, i.e. the age in the event year
                      when ``age`` is the age in 2016
      prevTime     -- ``prevTime`` ('h:m:s') converted to hours
      currTime     -- ``currTime`` ('h:m:s') converted to hours; the target
      prevEventNum -- stored as given
    """

    def __init__(self, ident, sex, age, EvDate, prevTime, currTime, prevEventNum):
        self.ident = ident
        self.sex = int(sex)
        age = float(age)
        # EvDate must be 'Y-m-d'; anything else raises ValueError here
        (EvYear, EvMonth, EvDay) = EvDate.split('-')
        self.EvAge = age -2016 + int(EvYear)
        self.prevTime = self._hours(prevTime)
        self.currTime = self._hours(currTime)
        self.prevEventNum = prevEventNum

    @staticmethod
    def _hours(hms):
        """Convert 'h:m:s' to hours. Float division keeps the minutes and
        seconds, which `/` on two ints drops wherever it floors."""
        (h, m, s) = hms.split(':')
        return int(h) + int(m)/60.0 + int(s)/3600.0

    def dataPoint(self):
        """Feature vector, starting with the constant 1 for the intercept.

        The inc* switches select which terms are included; with the values
        below the result is
        [1, sex, EvAge, EvAge^2, prevTime, prevTime^2, prevEventNum, prevEventNum^2].
        """
        incSex = 1
        incAge = 1
        incAge2 = 1
        incAge3 = 0
        incPrevTime = 1
        incPrevTime2 = 1
        incPrevTime3 = 0
        incPrevEventNum = 1
        incPrevEventNum2 = 1
        incPrevEventNum3 = 0
        res = [1]
        if incSex: res.append(self.sex)
        if incAge: res.append(self.EvAge)
        if incAge2: res.append(self.EvAge**2)
        if incAge3: res.append(self.EvAge**3)
        if incPrevTime: res.append(self.prevTime)
        if incPrevTime2: res.append(self.prevTime**2)
        if incPrevTime3: res.append(self.prevTime**3)
        if incPrevEventNum: res.append(self.prevEventNum)
        if incPrevEventNum2: res.append(self.prevEventNum**2)
        if incPrevEventNum3: res.append(self.prevEventNum**3)
        return res

    def value(self):
        """Regression target: the event time in hours."""
        return self.currTime

    # The former ident() accessor is gone: the instance attribute `ident` set
    # in __init__ shadowed it, so it could never be called on an instance.
    # Read `.ident` directly.

class SamplePoint2(object):
    """Regression sample without a second event: an event time explained by
    sex and age at the event only.

    Attributes
      ident    -- participant id, stored as given
      sex      -- int(sex)
      EvAge    -- age - 2016 + event year
      currTime -- ``currTime`` ('h:m:s') converted to hours; the target
    """

    def __init__(self, ident, sex, age, EvDate, currTime):
        self.ident = ident
        self.sex = int(sex)
        age = float(age)
        # EvDate must be 'Y-m-d'; anything else raises ValueError here
        (EvYear, EvMonth, EvDay) = EvDate.split('-')
        self.EvAge = age -2016 + int(EvYear)
        (h2, m2, s2) = currTime.split(':')
        # Float division keeps the minutes and seconds, which `/` on two ints
        # drops wherever it floors.
        self.currTime = int(h2) + int(m2)/60.0 + int(s2)/3600.0

    def dataPoint(self):
        """Feature vector, starting with the constant 1 for the intercept.

        With the inc* switches below the result is [1, sex, EvAge, EvAge^2].
        """
        incSex = 1
        incAge = 1
        incAge2 = 1
        incAge3 = 0
        res = [1]
        if incSex: res.append(self.sex)
        if incAge: res.append(self.EvAge)
        if incAge2: res.append(self.EvAge**2)
        if incAge3: res.append(self.EvAge**3)
        return res

    def value(self):
        """Regression target: the event time in hours."""
        return self.currTime

    # The former ident() accessor is gone: the instance attribute `ident` set
    # in __init__ shadowed it, so it could never be called on an instance.
    # Read `.ident` directly.


def makeMat(inFile):
    """Build [id, features, target] samples from every event that is followed
    by another event in the same row.

    For the event whose date is at column c, the target time is column c+3 and
    the "previous" time is the next event's time cell at column c+8; the last
    event of a row has no such cell and ends the row via IndexError.
    """
    MAT = []
    with open(inFile, 'r') as csvfile:
        for row in csv.reader(csvfile):
            if row[0] == 'PARTICIPANT ID':
                continue
            myIter = 3
            try:
                while row[myIter]:
                    prevEventNum = (len(row) - myIter)/5 - 1
                    sample = SamplePoint(row[0], row[1], row[2], row[myIter],
                                         row[myIter+8], row[myIter + 3], prevEventNum)
                    MAT.append([row[0], sample.dataPoint(), sample.value()])
                    myIter += 5
            except IndexError:
                pass
    return MAT


def makeMat2(inFile):
    """Build [id, features, target] samples from rows that hold exactly one
    event (8 columns: id, sex, age and one 5-column event)."""
    MAT = []
    with open(inFile, 'r') as csvfile:
        for row in csv.reader(csvfile):
            if row[0] == 'PARTICIPANT ID' or len(row) != 8:
                continue
            # column 3 is the event date, column 6 its time
            sample = SamplePoint2(row[0], row[1], row[2], row[3], row[6])
            MAT.append([row[0], sample.dataPoint(), sample.value()])
    return MAT


def regress(X,Y):
    """Least-squares weights from the normal equations, (X^T X)^-1 X^T Y.

    Returns an empty list when X or Y contains no non-zero entry.
    """
    if not (X.any() and Y.any()):
        return []
    Xt = X.T
    projector = np.linalg.inv(Xt.dot(X)).dot(Xt)
    return projector.dot(Y)
    
    

def validTest(MAT, k):
    """k-fold validation of the linear regression on MAT ([id, x, y] entries).

    Fold i validates on entries [i*size, (i+1)*size) with size = int(len/k)
    and trains on the rest; k == 1 forces size 0, i.e. train on everything
    with no validation set.

    Returns (tErrs, vErrs, w): per-fold mean squared training errors, per-fold
    mean squared validation errors (only for folds whose validation targets
    contain a non-zero value), and the weights of the last fold.
    """
    fold_size = int(len(MAT)/k)
    if k == 1:
        fold_size = 0
    tErrs, vErrs = [], []
    for fold in range(0, k):
        lo, hi = fold*fold_size, (fold+1)*fold_size

        Vset = MAT[lo:hi]
        Xval = np.array([el[1] for el in Vset])
        Yval = np.array([el[2] for el in Vset])

        Tset = MAT[:lo] + MAT[hi:]
        Xtrain = np.array([el[1] for el in Tset])
        Ytrain = np.array([el[2] for el in Tset])

        w = regress(Xtrain, Ytrain)        #for training
        err1 = sum((Ytrain[j] - np.dot(w, Xtrain[j]))**2 for j in range(len(Ytrain)))
        err2 = sum((Yval[j] - np.dot(w, Xval[j]))**2 for j in range(len(Yval)))
        if Yval.any():
            vErrs.append(err2/len(Yval))
        tErrs.append(err1/len(Ytrain))

    return tErrs, vErrs, w

# Fit both linear models on the adjusted data.
inFile = 'modData.csv'
# k == 1 makes validTest train on every sample without a validation split
k = 1

MAT1 = makeMat(inFile)   # MAT[i] is a list of [id, [args], res]
                        # args as returned by SamplePoint.dataPoint():
                        # [1, sex, age, age^2, prev time, prev time^2, event count, event count^2]
MAT2 = makeMat2(inFile)  # single-event rows only; args are [1, sex, age, age^2]
tErrs, vErrs, w = validTest(MAT1,k)
tErrs2, vErrs2, w2 = validTest(MAT2, k)
#print(sum(tErrs)/len(tErrs))
#print(sum(vErrs)/len(vErrs))

#print(sum(tErrs2)/len(tErrs2))
#print(sum(vErrs2)/len(vErrs2))

def predict(inFile, w, w2):
    """Predict a time for every participant row of ``inFile``.

    Rows with only id, sex and age (3 columns) use the weights ``w2`` on
    [1, sex, age, age^2]; all other rows use ``w`` on
    [1, sex, age, age^2, t, t^2, events, events^2], where t is the row's first
    time cell (column 6) in hours and events = (len(row) - 3) // 5.

    Returns a list of strings '0' + h + ':' + m + ':' + s, with minutes and
    seconds not zero-padded.
    NOTE(review): confirm whether the consumer of res.csv expects zero-padded
    minutes and seconds.
    """
    intres = []
    with open(inFile, 'r') as csvfile:
        for row in csv.reader(csvfile):
            if row[0] == 'PARTICIPANT ID':
                continue
            sex = float(row[1])
            age = float(row[2])
            if len(row) == 3:
                DAT = np.array([1, sex, age, age**2])
                intres.append(np.dot(w2, DAT))
            else:
                (h, m, s) = row[6].split(':')
                # Float division keeps the minutes and seconds, which `/` on
                # two ints drops wherever it floors.
                time = int(h) + int(m)/60.0 + int(s)/3600.0
                eventNum = (len(row)-3)//5
                DAT = [1, sex, age, age**2, time, time**2, eventNum, eventNum**2]
                intres.append(np.dot(w, DAT))

    strres = []
    for el in intres:
        h = int(el)
        m = int((el -h)*60)
        s = int((el -h -m/60.0)*3600)
        strres.append(str('0') + str(h) +':'+str(m)+':'+ str(s))
    return strres

# Predict a time for every participant and store one prediction per line
p = predict(inFile, w, w2)
with open('res.csv', 'w') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows([el] for el in p)
        


In [None]:
## Y2 Prediction
# Uong
def load_y():
    """Read Project1_data.csv and return one 0/1 label per participant row.

    The label is 1 when the row contains an event dated '2015-09-20' whose
    type is 'Marathon' and whose name starts with 'Marathon Oasis'. The first
    line of the file is skipped as the header.
    """
    with open('Project1_data.csv', 'rb') as mycsvfile:
        thedata = csv.reader(mycsvfile)
        Processeddata = []

        for index, row in enumerate(thedata):
            if index == 0:
                continue    # header line

            ran_target = 0
            # Event types sit at columns 3, 8, 13, ...; the name is one column
            # to the left and the date two columns to the left.
            for type_col in range(3, len(row), 5):
                if (row[type_col - 1][:14] == 'Marathon Oasis'
                        and row[type_col] == 'Marathon'
                        and row[type_col - 2] == '2015-09-20'):
                    ran_target = 1
            Processeddata.append(ran_target)
    return Processeddata

def load_x(Listparameters):
    """Read full_data.csv and return, per data row, the float values of the
    columns whose header is listed in ``Listparameters``.

    The '2015-09-20' column is parsed as well but is not part of the result.
    """
    with open('full_data.csv', 'rb') as mycsvfile:
        thedata = csv.reader(mycsvfile)
        x = []
        y = []          # values of the '2015-09-20' column (not returned)
        header = []     # column names taken from the first line

        for line_no, row in enumerate(thedata):
            if line_no == 0:
                header.extend(row)
                continue
            data_row = []
            for i, cell in enumerate(row):
                if header[i] == '2015-09-20':
                    y.append(float(cell))
                elif header[i] in Listparameters:
                    data_row.append(float(cell))
            x.append(data_row)
        return x

def mean(List):
    """Return (average, count) over the non-string entries of ``List``.

    String entries are ignored entirely.
    Raises ZeroDivisionError when ``List`` holds no non-string entry.
    """
    try:
        text_types = basestring     # Python 2: covers str and unicode
    except NameError:
        text_types = str            # Python 3 has no basestring
    numbers = [number for number in List if not isinstance(number, text_types)]
    return sum(numbers)/float(len(numbers)), len(numbers)

def returnAverageandStandarddeviation(List):
    """Return (average, population standard deviation) of the non-string
    entries of ``List``; the variance is divided by the count, not count - 1."""
    avg, count = mean(List)
    squared_deviations = [pow(x-avg,2) for x in List if not isinstance(x, basestring)]
    variance = sum(squared_deviations)/float(count)
    return avg, math.sqrt(variance)

def splitdata(data,splitratio):
    """Shuffle ``data`` in place and split it into (train, test).

    The first int(splitratio * len(data)) shuffled items form the training
    part, the remainder the test part. The caller's sequence is reordered.
    """
    # The module is imported as `np`; the bare name `numpy` is not defined.
    np.random.shuffle(data)
    mid = int(splitratio* len(data))
    train_x = data[:mid]
    test_x = data[mid:]
    return train_x,test_x

def calculateProbability(x, mean, stdev, min_stdev=1e-9):
    """Gaussian density of ``x`` for the given mean and standard deviation.

    String values contribute a neutral factor of 1.
    ``stdev`` is raised to ``min_stdev`` when smaller, so an attribute that is
    constant within a class (stdev == 0) no longer divides by zero: the
    density is then very large at the mean and 0.0 elsewhere.
    """
    try:
        text_types = basestring     # Python 2: covers str and unicode
    except NameError:
        text_types = str            # Python 3 has no basestring
    if isinstance(x, text_types):
        return 1

    stdev = max(stdev, min_stdev)
    exponent = math.exp(-(math.pow(x-mean,2)/(2*math.pow(stdev,2))))
    return (1 / (math.sqrt(2*math.pi) * stdev)) * exponent

def summarizeddata(X_train,Y_train,datasetcommand):
    """Collect the per-class statistics used by the Naive Bayes predictor.

    X_train        -- list of feature rows
    Y_train        -- class label per row
    datasetcommand -- per-column kind: 'Discrete', 'Continuous', or
                      'ID' / 'Result' (ignored)

    Returns a dict with
      'ratio' -> {class: share of training rows in that class}
      class   -> list with one dict per entry of ``datasetcommand``:
                 * continuous column: {'list': values, 'avg': .., 'stdev': ..}
                 * any other column:  {value: count, ..., 'totalcount': n}
                   ('totalcount' is 0 for ignored columns)
    """
    # Group the rows by class label
    HashbyClass = {}
    for x, y in zip(X_train, Y_train):
        HashbyClass.setdefault(y, []).append(x)

    HashofClasses = {'ratio': {}}
    for Class in HashbyClass:
        HashofClasses['ratio'][Class] = float(len(HashbyClass[Class]))/len(X_train)

    for Class in HashbyClass:
        # One accumulator per attribute. Sizing this by the number of training
        # rows (as before) broke whenever there were fewer rows than columns
        # and otherwise created one useless dict per extra row.
        Listofattributes = [{} for _ in datasetcommand]

        for data in HashbyClass[Class]:
            for i in range(len(data)):
                kind = datasetcommand[i]
                if kind == 'Discrete':
                    Listofattributes[i][data[i]] = Listofattributes[i].get(data[i], 0) + 1
                elif kind == 'Continuous':
                    Listofattributes[i].setdefault('list', []).append(data[i])
                # 'ID', 'Result' and unknown kinds are skipped

        for attribute in Listofattributes:
            if 'list' in attribute:
                avg, stv = returnAverageandStandarddeviation(attribute['list'])
                attribute['avg'] = avg
                attribute['stdev'] = stv
            else:
                attribute['totalcount'] = sum(attribute.values())
        HashofClasses[Class] = Listofattributes

    return HashofClasses

def predict(x_test,y_test,summarizeddata,datasetcommand):
    """Naive Bayes prediction and accuracy for the rows of ``x_test``.

    x_test, y_test -- feature rows and their true classes
    summarizeddata -- statistics dict with a 'ratio' entry (class priors) and
                      one list of per-attribute dicts per class
    datasetcommand -- per-column kind ('Discrete', 'Continuous', 'ID', 'Result')

    For every row the class with the highest prior * product of per-attribute
    likelihoods wins. A discrete value never seen for a class contributes
    probability 0 instead of raising KeyError, and a class is always chosen
    even when every score is 0.

    Returns (predictions, accuracy); accuracy is 0.0 for empty input.
    """
    correct = 0
    ListofResults = []

    for x, y in zip(x_test, y_test):
        best_class = None
        best_score = -1.0      # below any probability, so the first class always registers

        for Class in summarizeddata['ratio']:
            score = summarizeddata['ratio'][Class]

            for i in range(len(x)):
                kind = datasetcommand[i]
                if kind == 'Discrete':
                    stats = summarizeddata[Class][i]
                    score *= float(stats.get(x[i], 0))/stats['totalcount']
                elif kind == 'Continuous':
                    stats = summarizeddata[Class][i]
                    score *= calculateProbability(x[i], stats['avg'], stats['stdev'])
                # 'ID' and 'Result' columns do not take part

            if score > best_score:
                best_score = score
                best_class = Class

        ListofResults.append(best_class)
        if y == best_class:
            correct += 1

    total = len(ListofResults)
    accuracy = float(correct)/total if total else 0.0
    return ListofResults, accuracy

def K_fold_Cross_Validate(x,y,k=5):
    """Cross-validate the Naive Bayes classifier and print the mean accuracy.

    x, y -- lists of feature rows and labels (they are sliced and joined
            with +)
    k    -- number of folds; each test fold holds len(x) // k consecutive rows

    Raises ValueError when there are fewer rows than folds.
    """
    datasetcommand = ['Continuous','Discrete','Continuous','Continuous','Continuous','Continuous','Continuous']

    Testsetsize = len(x)//k
    if Testsetsize == 0:
        # A zero fold size would never advance the window below
        raise ValueError("need at least k rows to build k folds")

    splitBegin = 0
    splitEnd = Testsetsize
    Acclist = []
    # `<=` so that the final fold is evaluated as well when len(x) is an exact
    # multiple of the fold size (with `<` that fold was silently skipped).
    while splitEnd <= len(x):
        x_test = x[splitBegin:splitEnd]
        y_test = y[splitBegin:splitEnd]

        x_train = x[:splitBegin] + x[splitEnd:]
        y_train = y[:splitBegin] + y[splitEnd:]

        Summary = summarizeddata(x_train, y_train, datasetcommand=datasetcommand)
        Result, Accuracy = predict(x_test, y_test, Summary, datasetcommand=datasetcommand)
        Acclist.append(Accuracy)
        splitEnd += Testsetsize
        splitBegin += Testsetsize
    print("when k = "+ str(k)+" cross validation = "+ str(sum(Acclist) / float(len(Acclist))))



def K_fold_Test_Each_Data(x,y,k=5):
    """Train one Naive Bayes model per fold, let the models vote on every row,
    write the winning class per row to Output.csv and print the accuracy.

    x, y -- lists of feature rows and labels (they are sliced and joined
            with +)
    k    -- number of folds; each model leaves out len(x) // k consecutive rows

    Every row is scored by all models, including those whose training data
    contained it. Raises ValueError when there are fewer rows than folds.
    """
    datasetcommand = ['Continuous','Continuous','Continuous','Continuous','Continuous','Continuous','Continuous']

    Testsetsize = len(x)//k
    if Testsetsize == 0:
        # A zero fold size would never advance the window below
        raise ValueError("need at least k rows to build k folds")

    # Train one model per left-out window
    TrainerSummary = []
    splitBegin = 0
    splitEnd = Testsetsize
    # `<=` so that the final window is used as well when len(x) is an exact
    # multiple of the fold size.
    while splitEnd <= len(x):
        x_train = x[:splitBegin] + x[splitEnd:]
        y_train = y[:splitBegin] + y[splitEnd:]

        TrainerSummary.append(summarizeddata(x_train, y_train, datasetcommand=datasetcommand))
        splitEnd += Testsetsize
        splitBegin += Testsetsize

    true = 0
    false = 0
    # The with-block closes the output file even if prediction fails midway
    with open("Output.csv", "wt") as f:
        for i in range(0, len(x), 1):
            expected_y = y[i]

            votes = []
            for trainer in TrainerSummary:
                Result, Accuracy = predict([x[i]], [y[i]], trainer, datasetcommand=datasetcommand)
                votes.append(Result[0])

            # Majority vote across the per-fold models
            Result = Counter(votes).most_common()[0][0]

            if Result == expected_y:
                true += 1
            else:
                false += 1
            f.write(str(Result))
            f.write("\n")
    print("accuracy of " +str(k)+ " fold cross validation = "+str(float(true)/(true+false)))


def different_k_crossvalidation(List,X,Y):
    """Run K_fold_Cross_Validate on (X, Y) once for every fold count in ``List``,
    printing a heading line before each run."""
    for fold_count in List:
        print("when k = "+str(fold_count)+" cross validation accuracy = :")
        K_fold_Cross_Validate(X,Y,fold_count)

# Load the selected feature columns and the labels, then evaluate.
# NOTE: X and Y are rebound here; the fold lists built for the logistic
# regression are no longer available after this cell.
# NOTE(review): load_x reads 'full_data.csv', whose export code in the data
# preparation cell is commented out -- confirm the file exists before running.
X = load_x(['marathon_ratio','sex','age','n_oasis','2014-09-28','2013-09-22','2013-02-17'])
Y = load_y()

K_fold_Cross_Validate(X,Y,k=5)
K_fold_Test_Each_Data(X,Y,k=5)

