In [None]:
!sudo pip install pandas xlrd

In [3]:
import pandas as pd
from datetime import date
from datetime import datetime

In [13]:
def log(string, alsoPrint):        
    if alsoPrint:
        print(string)

In [51]:
def parseSheet(sheetFile):
    
    # The test file we will be working on
    #sheetName = "Career Studio Takeover.xlsx"
    #try:
        # Load in the excel file using pandas
    file = pd.read_excel(sheetFile, parse_dates=True)
    #except:
    #    log(f'File {sheetName} not found!', debugMode)
    #    return None
    
    #print(file.columns)

    # We can't expect that everyone will format the name exactly as it should be,
    # so we want to identify what each header is actually called
    # Below we have a list of words that should show up in one way or another for each category
    keywords = {'Student ID':["id", 'student', 'ncsu'],
                'Event Name':['event', 'program', 'name'],
                'Student Name':['name', 'first'],
                'Village Points':['village', 'points', 'vp'],
                'Hosts':['host', 'fellow', 'leader'],
                'Date':['date', 'day', 'event', 'program']}
    actualKeywords = {}

    # These are characters that may show up that we don't want to influence the identification process
    arbitraryChars = [':', '(', ')', '[', ']', '{', '}']

    for header in file.columns:
        # Make it lowercase
        currHeader = header.lower()

        # We want to ignore columns that don't have headers
        if 'Unnamed' in header:
            continue

        # Remove semicolons
        for c in arbitraryChars:
            currHeader = currHeader.replace(c, '')

        #print(currHeader)

        qualifierMatches = {}
        i = 0
        for keyword, qualifiers in keywords.items():
            qualifierMatches[keyword] = 0
            for q in qualifiers:
                if q in currHeader:
                    qualifierMatches[keyword] = qualifierMatches[keyword] + 1

            # This is kinda an obscure way of weighting things, but it seems to work for now
            qualifierMatches[keyword] = float(qualifierMatches[keyword]) / (float(len(currHeader.split())) * float(len(keywords[keyword])))
            i = i + 1

        # Now we want to find which index of qualifierMatches has the highest value, which will become the actualKeyword
        key = max(qualifierMatches, key=qualifierMatches.get)
        #print(key)
        actualKeywords[key] = header

        #print(qualifierMatches)

    #print(actualKeywords)

    # We now know where all of our information is, so we should start grabbing it
    
    
    villagePoints = None
    # We now know where all of our information is, so we should start grabbing it
    possibleVillagePoints = file[actualKeywords['Village Points']].dropna().values.tolist()
    #print(possibleVillagePoints)
    
    if isinstance(possibleVillagePoints, list):
        if len(possibleVillagePoints) == 1:
            villagePoints = possibleVillagePoints[0]
            log(f"Found value for village points: {villagePoints} in file {sheetFile[0]} ({sheetFile[2]})", debugMode)
        elif len(possibleVillagePoints) > 1:
            log(f"Warning: more than one populated cell under header {actualKeywords['VillagePoints']}; attempting to take first value", debugMode)
            villagePoints = possibleVillagePoints[0]
            log(f"Found value for village points: {villagePoints} in file {sheetFile[0]} ({sheetFile[2]})", debugMode)
        else:
            log(f"ERROR: Village point value for {sheetFile[0]} ({sheetFile[2]}) not specified, ignoring this file!", debugMode)
            return None
    else:
        villagePoints = possibleVillagePoints
        log(f"Found value for village points: {villagePoints} in file {sheetFile[0]} ({sheetFile[2]})", debugMode)

    eventDate = None
    # Find our possible dates
    
    print(file[actualKeywords['Date']].dropna().dt.strftime('%m/%d/%Y').values.tolist())
    
    possibleDates = file[actualKeywords['Date']].dropna().values.tolist()
    #print(possibleDates)
        
    if isinstance(possibleDates, list):
        if len(possibleDates) == 1:
            eventDate = possibleDates[0]
            log(f"Found value for event date: {eventDate} in file {sheetFile[0]} ({sheetFile[2]})", debugMode)
        elif len(possibleDates) > 1:
            log(f"Warning: more than one populated cell under header {actualKeywords['Date']}; attempting to take first value", debugMode)
            eventDate = possibleDates[0]
            log(f"Found value for event date: {eventDate} in file {sheetFile[0]} ({sheetFile[2]})", debugMode)

        else:
            log(f"ERROR: Event date for {sheetFile[0]} ({sheetFile[2]}) not specified, ignoring this file!", debugMode)
            return None
    else:
        eventDate = possibleDates
        log(f"Found value for event date: {eventDate} in file {sheetFile[0]} ({sheetFile[2]})", debugMode)
    
    
    hosts = None
    possibleHosts = file[actualKeywords['Hosts']].dropna().values.tolist()
    #print(possibleHosts)
    
    if isinstance(possibleHosts, list):
        # We can have more than one host, so this is alright to have more than one cell populated
        if len(possibleHosts) >= 1:
            hosts = possibleHosts[0]
            log(f"Found value for hosts: {hosts} in file {sheetFile[0]} ({sheetFile[2]})", debugMode)
        else:
            log(f"ERROR: Hosts for {sheetFile[0]} ({sheetFile[2]}) not specified, ignoring this file!", debugMode)
            return None
    else:
        hosts = possibleHosts
        log(f"Found value for hosts: {hosts} in file {sheetFile[0]} ({sheetFile[2]})", debugMode)

    eventName = None
    possibleNames = file[actualKeywords['Event Name']].dropna().values.tolist()
    #print(possibleNames)
    
    if isinstance(possibleNames, list):
        if len(possibleNames) == 1:
            eventName = possibleNames[0]
            log(f"Found value for event name: {eventName} in file {sheetFile[0]} ({sheetFile[2]})", debugMode)
        elif len(possibleNames) > 1:
            log(f"Warning: more than one populated cell under header {actualKeywords['Event Name']}; attempting to take first value", debugMode)
            eventName = possibleNames[0]
            log(f"Found value for event date: {eventName} in file {sheetFile[0]} ({sheetFile[2]})", debugMode)
        else:
            log(f"ERROR: Event name for {sheetFile[0]} ({sheetFile[2]}) not specified, ignoring this file!", debugMode)
            return None
    else:
        eventName = possibleNames
        log(f"Found value for event name: {eventName} in file {sheetFile[0]} ({sheetFile[2]})", debugMode)

    print(eventDate)
    print(pd.datetime(eventDate))
        
    # Create a struct for the event
    event = Event(eventName, eventDate, villagePoints, hosts, sheetFile[1])
    
    #print([villagePoints, eventDate, hosts, eventName])

    students = file[actualKeywords['Student ID']].dropna().values.tolist()
    #print(students)
    
    
    # Now that we have all of the information about the event, we should return all of the students who atended and the information about the event
    return [event, students]

In [52]:
debugMode = True
parseSheet('2020-01-29_Career Studio Takeover.xlsx')

Found value for village points: 1.0 in file 2 (2)
['01/01/2020']
Found value for event date: 1577836800000000000 in file 2 (2)
Found value for hosts: Kip/Jack in file 2 (2)
Found value for event name: Career Studio Takeover in file 2 (2)
1577836800000000000


OverflowError: signed integer is greater than maximum

NameError: name 'eventDate' is not defined

In [None]:
? list.remove

In [28]:
print(3424-256)

3168
