In [84]:
from datetime import datetime as dt
import unicodecsv

# Load the three raw tables; each becomes a list with one dict per CSV row.
# NOTE(review): as laid out here, read_csv is defined in a later cell, so that
# cell must be executed before this one -- a top-to-bottom run on a fresh
# kernel would hit a NameError on this line.
# Re-running this cell replaces the lists with raw, unconverted records, so the
# cells that convert enrollment fields and rename 'acct' to 'account_key' in
# daily_engagement have to be re-run afterwards.
enrollments = read_csv('enrollments.csv')
daily_engagement = read_csv('daily_engagement.csv')
project_submissions = read_csv('project_submissions.csv')

In [63]:
def read_csv(filename):
    """Load a CSV file and return its rows as a list of dicts.

    The file is opened in binary mode and handed to unicodecsv.DictReader;
    the reader is fully consumed before the file is closed.
    """
    with open(filename, 'rb') as csv_file:
        rows = list(unicodecsv.DictReader(csv_file))
    return rows

In [64]:
def parse_date(date):
    """Convert a 'YYYY-MM-DD' string into a Python datetime object.

    An empty string means no date was given, in which case None is returned.
    """
    if date != '':
        return dt.strptime(date, '%Y-%m-%d')
    return None
    
    
def parse_maybe_int(i):
    """Convert a string holding an integer into an int.

    An empty string stands for a missing value and yields None.
    """
    return None if i == '' else int(i)

In [85]:
# Convert the string fields read from the CSV into proper Python types.
# The conversion mutates each record in place. Without a guard, re-running this
# cell on already-converted records fails: parse_date() passes a datetime/None
# value to strptime, which raises TypeError, and the `== 'TRUE'` comparisons
# are only meaningful on the raw strings.
# A record whose 'is_canceled' is already a bool has been converted before and
# is skipped, so the cell can safely be executed more than once.
for enrollment in enrollments:
    if isinstance(enrollment['is_canceled'], bool):
        continue
    enrollment['cancel_date'] = parse_date(enrollment['cancel_date'])
    enrollment['days_to_cancel'] = parse_maybe_int(enrollment['days_to_cancel'])
    enrollment['join_date'] = parse_date(enrollment['join_date'])
    enrollment['is_udacity'] = enrollment['is_udacity'] == 'TRUE'
    # 'is_canceled' doubles as the "already converted" marker, so set it last.
    enrollment['is_canceled'] = enrollment['is_canceled'] == 'TRUE'

enrollments[0]

{u'account_key': u'448',
 u'cancel_date': datetime.datetime(2015, 1, 14, 0, 0),
 u'days_to_cancel': 65,
 u'is_canceled': True,
 u'is_udacity': True,
 u'join_date': datetime.datetime(2014, 11, 10, 0, 0),
 u'status': u'canceled'}

In [66]:
# Rename the 'acct' field of every engagement record to 'account_key', the
# name that get_unique_students() and remove_udacity_accounts() look up.
# This must be re-run whenever daily_engagement is reloaded from the CSV;
# otherwise later cells raise KeyError: 'account_key'.
for engagement_record in daily_engagement:
    # Only rename while 'acct' is still present, so executing the cell a
    # second time is a no-op instead of raising KeyError: 'acct'.
    if 'acct' in engagement_record:
        engagement_record['account_key'] = engagement_record.pop('acct')

In [67]:
def get_unique_students(data):
    """Return the set of distinct 'account_key' values found in `data`.

    `data` is an iterable of dict-like records; every record must have an
    'account_key' entry, otherwise a KeyError is raised.
    """
    return {record['account_key'] for record in data}

In [68]:
# Number of enrollment records (rows), not the number of distinct students.
len(enrollments)

1640

In [69]:
# Distinct account keys that appear in the enrollment records.
unique_enrolled_students = get_unique_students(enrollments)
len(unique_enrolled_students)

1302

In [70]:
# Number of daily engagement records (rows).
len(daily_engagement)

136240

In [71]:
# Distinct account keys that appear in the engagement records.
# Requires the 'acct' -> 'account_key' rename to have been applied already.
unique_engagement_students = get_unique_students(daily_engagement)
len(unique_engagement_students)

1237

In [72]:
# Number of project submission records (rows).
len(project_submissions)

3642

In [73]:
# Distinct account keys that appear in the project submission records.
unique_project_submitters = get_unique_students(project_submissions)
len(unique_project_submitters)

743

In [74]:
# Sanity check: the first engagement record is reachable through
# 'account_key' (i.e. the rename from 'acct' has been applied).
daily_engagement[0]['account_key']


u'0'

In [75]:
# Account keys present in the enrollment table that never show up in the
# engagement table.
diff_student = [
    student
    for student in unique_enrolled_students
    if student not in unique_engagement_students
]

len(diff_student)

65

In [81]:
# Count the enrollment records whose days_to_cancel is not 0 (None included)
# and whose account key is nevertheless missing from the engagement table.
count = sum(
    1
    for enrollment in enrollments
    if enrollment['days_to_cancel'] != 0
    and enrollment['account_key'] not in unique_engagement_students
)
count

3

In [86]:
# Collect the account keys of every enrollment flagged as a Udacity test
# account. One account may own several enrollment records, so a key can be
# printed more than once while the set keeps each key only once.
udacity_test_accounts = set()
for enrollment in enrollments:
    if enrollment['is_udacity']:
        # Parenthesised single-argument print: same output under Python 2's
        # print statement, and valid under Python 3's print function.
        print(enrollment['account_key'])
        udacity_test_accounts.add(enrollment['account_key'])
len(udacity_test_accounts)

448
448
448
448
448
448
448
448
448
1304
1304
312
312
312
312
818
1069
1101


6

In [87]:
def remove_udacity_accounts(data, test_accounts=None):
    """Return the records of `data` that do not belong to a test account.

    Parameters:
        data: iterable of dict-like records; every record must have an
            'account_key' entry (a KeyError is raised otherwise).
        test_accounts: optional collection of account keys to exclude.
            When omitted, the notebook-level `udacity_test_accounts` set is
            used, which is the original behaviour.

    Returns:
        A new list with the remaining records in their original order; the
        input is not modified.
    """
    if test_accounts is None:
        # Resolved at call time so the current notebook-level set is used.
        test_accounts = udacity_test_accounts
    return [
        data_point
        for data_point in data
        if data_point['account_key'] not in test_accounts
    ]

In [88]:
# Drop every record that belongs to a Udacity test account from all three
# tables. remove_udacity_accounts() indexes each record by 'account_key', so
# the daily_engagement records must already have had 'acct' renamed to
# 'account_key'; if the CSVs were reloaded, re-run the rename cell first or
# this cell fails with KeyError: 'account_key'.
non_udacity_enrollments = remove_udacity_accounts(enrollments)
non_udacity_engagements = remove_udacity_accounts(daily_engagement)
non_udacity_submissions = remove_udacity_accounts(project_submissions)

# Parenthesised single-argument print: identical output under Python 2's
# print statement, and valid under Python 3's print function.
print(len(non_udacity_enrollments))
print(len(non_udacity_engagements))
print(len(non_udacity_submissions))

KeyError: 'account_key'