### Imports

In [None]:
# Datetime, for logging
from datetime import datetime

# Pandas, for reading excel files
import pandas as pd

# System stuff, for authentication and path stuff
from __future__ import print_function
import pickle
import os.path
import io
from io import BytesIO

# structs to help organize events
from collections import namedtuple

# Google stuff
from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseDownload
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request

### Some constant variable definitions:

In [None]:
# Some important variables that may change over time

# Exact name of the shared Drive folder that belongs to the team
teamFolderName = "HSV Fellows Team Folder"

# Name of the folder (somewhere below the team folder) where attendance records are stored
attendanceFolderName = "Attendance Sheets"

# Where we will store all the output of the program.
# The timestamp is formatted explicitly because str(datetime.now()) contains a space and
# colons, and colons are not allowed in file names on Windows.
logFileName = f"{datetime.now():%Y-%m-%d_%H-%M-%S}_output.log"

# If this is enabled, all of the output will also be printed to the screen (as well as the log file)
debugMode = True

# The credentials file that you download from Google
credentialsFileName = "ncsu_credentials.json"

# Where the credentials will be stored after the first successful login
pickleFileName = "token.pickle"

# OAuth scopes requested when the user has to log in again. The program only lists files
# and exports spreadsheets, so read-only Drive access is sufficient.
SCOPES = ['https://www.googleapis.com/auth/drive.readonly']

# How many levels of directories we should look through to find the attendance folder
attendanceFolderSearchDepth = 2

# Our downloads folder (make sure there isn't a trailing '/')
downloadFolder = 'downloads'

# Struct describing one event: its name, date, village point value, hosts and Drive file ID
Event = namedtuple("Event", "name date villagePoints hosts id")

### Some simple methods that will help us with various things

In [None]:
# This will help us log the output of the program
def log(string, alsoPrint):
    """Append a timestamped message to the log file and optionally echo it.

    Args:
        string: The message to record.
        alsoPrint: When truthy, the same line is also printed to the screen.
    """
    # Build the line once so the log file and the screen show the same timestamp
    line = f"[{datetime.now()}]:  {string}"

    # Explicit UTF-8 so folder/sheet names with non-ASCII characters cannot break
    # logging on platforms whose default encoding is narrower
    with open(logFileName, "a", encoding="utf-8") as logFileBuffer:
        logFileBuffer.write(line + "\n")

    if alsoPrint:
        print(line)

### Method to setup authentication

In [None]:
def setupAuthentication():
    """Load cached Google credentials, refreshing or re-creating them if needed.

    Relies on the module-level names pickleFileName, credentialsFileName and
    SCOPES. Any credentials that had to be refreshed or newly created are
    written back to pickleFileName for the next run.

    Returns:
        The credentials object to hand to the Drive client.
    """
    creds = None

    # The file token.pickle stores the user's access and refresh tokens, and is
    # created automatically when the authorization flow completes for the first
    # time.
    if os.path.exists(pickleFileName):
        # NOTE: unpickling runs arbitrary code from the file, so this must only
        # ever be pointed at a token file written by this program itself.
        with open(pickleFileName, 'rb') as token:
            creds = pickle.load(token)

    # Happy path: the cached credentials are still usable as they are
    if creds and creds.valid:
        log(f"Found credentials from {pickleFileName}", debugMode)
        return creds

    # Say what actually happened: "not found" and "found but unusable" are different problems
    if not creds:
        log(f"Didn't find credentials from {pickleFileName}", debugMode)
    else:
        log(f"Credentials from {pickleFileName} are no longer valid", debugMode)

    if creds and creds.expired and creds.refresh_token:
        # Expired but refreshable: no user interaction required
        creds.refresh(Request())
        log(f"Refreshing credentials via Google", debugMode)
    else:
        # Nothing usable is cached, so let the user log in through the browser
        flow = InstalledAppFlow.from_client_secrets_file(
            credentialsFileName, SCOPES)
        creds = flow.run_local_server(port=0)
        log(f"Refreshing credentials via secrets file {credentialsFileName}", debugMode)

    # Save the credentials for the next run
    with open(pickleFileName, 'wb') as token:
        pickle.dump(creds, token)

    return creds

### Method to find the Team folder ID

In [None]:
def findTeamFolderID(service):
    """Look up the Drive ID of the folder whose name equals teamFolderName.

    Every page of the folder listing is searched. If several folders carry the
    name, each one is logged and the last one listed is returned.

    Args:
        service: Drive v3 client used to list the folders.

    Returns:
        The folder ID, or None when no folder with that name is visible.
    """
    # This query string specifies that we are looking only for files that fit the mimetype in '', which means folders
    queryString = "mimeType = 'application/vnd.google-apps.folder'"

    # We are eventually going to single out the Fellow Team Folder, and we will store its GDrive ID here.
    # This could also be found by looking at the long string in the URL of the drive folder,
    # but we want to find it automatically
    teamFolderID = None
    foundAnything = False

    # The listing is paginated: keep asking until there is no nextPageToken left,
    # otherwise only the first page of folders would ever be searched
    pageToken = None
    while True:
        results = service.files().list(q=queryString, pageSize=300,
                                       fields="nextPageToken, files(name, id)",
                                       pageToken=pageToken).execute()

        for item in results.get('files', []):
            foundAnything = True
            if item['name'] == teamFolderName:
                log(u'Found team folder: ({0}, {1})'.format(item['name'], item['id']), debugMode)
                teamFolderID = item['id']

        pageToken = results.get('nextPageToken')
        if not pageToken:
            break

    if not foundAnything:
        # Hopefully we don't find nothing, but if we do we want to stop
        log('No files found in drive', debugMode)
        return None

    if teamFolderID is None:
        # We didn't find the team folder, which is bad
        log('Team folder not found, try checking your team drive folder name and make sure it is exactly correct.', debugMode)
        return None

    return teamFolderID

### Method to list files in a folder recursively (used in the next two methods)

In [None]:
def recursiveListFilesInFolder(rootFolderID, depth, mimeType, driveService=None):
    """Collect all files of one MIME type below a Drive folder.

    Args:
        rootFolderID: Drive ID of the folder to start from.
        depth: How many levels of sub-folders to descend into. 0 looks only at
            the direct children; a negative value never reaches 0 and therefore
            descends without limit.
        mimeType: Only files with exactly this MIME type are returned.
        driveService: Drive v3 client to use. When omitted, the module-level
            `service` object is used, as before.

    Returns:
        A list of the matching file dicts (name, id, mimeType). Matches found
        inside a sub-folder are placed where that sub-folder was encountered.
    """
    client = service if driveService is None else driveService

    queryString = "'" + rootFolderID + "' in parents"
    allFiles = []

    # Follow nextPageToken so folders with more than one page of children are listed completely
    pageToken = None
    while True:
        folderSearch = client.files().list(q=queryString, pageSize=100,
                                           fields="nextPageToken, files(name, id, mimeType)",
                                           pageToken=pageToken).execute()

        for entry in folderSearch.get('files', []):
            if entry['mimeType'] == mimeType:
                allFiles.append(entry)
            if depth != 0 and entry['mimeType'] == 'application/vnd.google-apps.folder':
                allFiles.extend(recursiveListFilesInFolder(entry['id'], depth - 1, mimeType, client))

        pageToken = folderSearch.get('nextPageToken')
        if not pageToken:
            break

    return allFiles


### Method to find the Attendance folder ID

In [None]:
def findAttendanceFolderID(teamFolderID, service):
    """Find the folder named attendanceFolderName below the team folder.

    The search descends at most attendanceFolderSearchDepth levels. If several
    folders carry the name, each is logged and the last one listed is returned.

    Args:
        teamFolderID: Drive ID of the team folder, or None if it was not found.
        service: Drive v3 client. Kept for the existing call signature; the
            recursive listing helper is called without it.

    Returns:
        The attendance folder's Drive ID, or None when it cannot be found.
    """
    # Without a team folder there is nothing to search; the listing helper would
    # fail while building its query string from None
    if teamFolderID is None:
        log('No team folder ID given, cannot search for the attendance folder', debugMode)
        return None

    # Recursively look for folders in the team folder, so we can search for our attendance one
    files = recursiveListFilesInFolder(teamFolderID, attendanceFolderSearchDepth, 'application/vnd.google-apps.folder')

    attendanceFolderID = None
    # Look for our attendance folder
    if not files:
        log('No files found', debugMode)

    for file in files:
        if file['name'] == attendanceFolderName:
            log(u'Found attendance folder: ({0}, {1})'.format(file['name'], file['id']), debugMode)
            attendanceFolderID = file['id']

    if attendanceFolderID is None:
        # We didn't find the folder where attendance sheets are kept
        log(f'Specified attendance folder not found, make sure that the folder name is exactly correct. You can also try increasing the search depth (current={attendanceFolderSearchDepth})', debugMode)
        return None

    return attendanceFolderID

### Method to list all sheets in the attendance folder

In [None]:
def getSheetIDInFolder(attendanceFolderID):
    """List every Google spreadsheet below the attendance folder.

    Args:
        attendanceFolderID: Drive ID of the attendance folder, or None if it
            was not found.

    Returns:
        A list of [id, name] pairs, one per spreadsheet, or None when there is
        no folder to search or it contains no spreadsheets.
    """
    # Without a folder ID the listing helper would fail while building its query string
    if attendanceFolderID is None:
        log("No attendance folder ID given, cannot list spreadsheets", debugMode)
        return None

    # A depth of -1 never counts down to 0, so the search continues until there are
    # no more directories left, i.e. infinite depth
    spreadsheets = recursiveListFilesInFolder(attendanceFolderID, -1, 'application/vnd.google-apps.spreadsheet')

    if not spreadsheets:
        log(f"No spreadsheets found in attendance folder with ID {attendanceFolderID}", debugMode)
        return None

    for s in spreadsheets:
        log(f"Found sheet {s}", debugMode)

    return [[s["id"], s["name"]] for s in spreadsheets]
    

### Method to download all of the sheets

In [None]:
def downloadSheets(sheetIDList, service):
    """Export every listed spreadsheet as .xlsx into the download folder.

    Args:
        sheetIDList: Sequence of [id, name] pairs to download, or None when no
            sheets were found.
        service: Drive v3 client used for the export requests.

    Returns:
        A list of [localPath, id, name] entries, one per downloaded sheet, in
        the order given. Empty when there was nothing to download.
    """
    # The sheet lister returns None when it finds nothing; treat that as "no work"
    if sheetIDList is None:
        log("No sheets to download", debugMode)
        return []

    # open() does not create missing directories, so make sure the target exists
    os.makedirs(downloadFolder, exist_ok=True)

    log(f"Beginning download for {len(sheetIDList)} files", debugMode)

    fileList = []
    for sheetID in sheetIDList:
        targetPath = f"{downloadFolder}/{sheetID[0]}.xlsx"

        log(f"Downloading file with ID {sheetID[0]} ({sheetID[1]})...", debugMode)

        # Google Sheets have no binary form of their own; ask Drive to export them as Excel
        request = service.files().export_media(fileId=sheetID[0],
                                               mimeType='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet')

        # Collect the chunks in memory first so a failed download never leaves a
        # half-written .xlsx behind; the buffer is released even if a chunk fails
        with io.BytesIO() as fh:
            downloader = MediaIoBaseDownload(fh, request)
            done = False
            while not done:
                status, done = downloader.next_chunk()

            with open(targetPath, 'wb') as file:
                file.write(fh.getvalue())

        fileList.append([targetPath, sheetID[0], sheetID[1]])

        log(f"Download completed for file with ID {sheetID[0]} ({sheetID[1]}): saved to {targetPath}", debugMode)

    log(f"Download for {len(sheetIDList)} files complete", debugMode)

    return fileList

### The method to parse excel sheets for event/student info

In [None]:
def _matchHeaders(columns):
    """Map each information category to the column header that fits it best."""
    # We can't expect that everyone will format the name exactly as it should be,
    # so we want to identify what each header is actually called.
    # Below we have a list of words that should show up in one way or another for each category
    keywords = {'Student ID':["id", 'student', 'ncsu'],
                'Event Name':['event', 'program', 'name'],
                'Student Name':['name', 'first'],
                'Village Points':['village', 'points', 'vp'],
                'Hosts':['host', 'fellow', 'leader'],
                'Date':['date', 'day', 'event', 'program']}

    # These are characters that may show up that we don't want to influence the identification process
    arbitraryChars = [':', '(', ')', '[', ']', '{', '}']

    bestHeader = {}
    bestScore = {}

    for header in columns:
        # Headers are not guaranteed to be strings (a numeric or date cell in the header row is not)
        headerText = str(header)

        # We want to ignore columns that don't have headers
        if 'Unnamed' in headerText:
            continue

        currHeader = headerText.lower()
        for c in arbitraryChars:
            currHeader = currHeader.replace(c, '')

        # A header made only of punctuation has no words left and would divide by zero below
        wordCount = len(currHeader.split())
        if wordCount == 0:
            continue

        # This is kinda an obscure way of weighting things, but it seems to work for now:
        # matched qualifiers, normalised by header length and by the number of qualifiers
        scores = {}
        for keyword, qualifiers in keywords.items():
            matches = sum(1 for q in qualifiers if q in currHeader)
            scores[keyword] = float(matches) / (float(wordCount) * float(len(qualifiers)))

        key = max(scores, key=scores.get)

        # A header that matches nothing must not be assigned to any category; otherwise
        # max() would hand it to the first category and overwrite the real column
        if scores[key] == 0:
            continue

        # Keep the best-scoring header per category (on a tie the later column wins)
        if scores[key] >= bestScore.get(key, 0):
            bestScore[key] = scores[key]
            bestHeader[key] = header

    return bestHeader


def _firstCellValue(frame, header, sheetFile, foundLabel, errorLabel, warnOnMultiple=True):
    """Return the first populated cell under a header, or None (after logging) if there is none."""
    # Series.tolist() keeps dates as timestamps, where .values.tolist() may turn them into raw integers
    values = frame[header].dropna().tolist()

    if not values:
        log(f"ERROR: {errorLabel} for {sheetFile[0]} ({sheetFile[2]}) not specified, ignoring this file!", debugMode)
        return None

    if warnOnMultiple and len(values) > 1:
        log(f"Warning: more than one populated cell under header {header}; attempting to take first value", debugMode)

    log(f"Found value for {foundLabel}: {values[0]} in file {sheetFile[0]} ({sheetFile[2]})", debugMode)
    return values[0]


def parseSheet(sheetFile):
    """Read one attendance workbook and extract the event plus its attendees.

    Args:
        sheetFile: [localPath, driveID, displayName] describing the workbook.

    Returns:
        [event, students], where event is an Event and students is the list of
        populated cells in the student ID column; or None when a required
        column or value is missing, in which case the file is skipped.
    """
    # Load in the excel file using pandas
    file = pd.read_excel(sheetFile[0])

    actualKeywords = _matchHeaders(file.columns)

    # Every one of these columns is read below; report them missing instead of raising KeyError
    requiredColumns = ['Village Points', 'Date', 'Hosts', 'Event Name', 'Student ID']
    missingColumns = [c for c in requiredColumns if c not in actualKeywords]
    if missingColumns:
        log(f"ERROR: No column found for {missingColumns} in {sheetFile[0]} ({sheetFile[2]}), ignoring this file!", debugMode)
        return None

    # We now know where all of our information is, so we should start grabbing it
    villagePoints = _firstCellValue(file, actualKeywords['Village Points'], sheetFile,
                                    'village points', 'Village point value')
    if villagePoints is None:
        return None

    eventDate = _firstCellValue(file, actualKeywords['Date'], sheetFile,
                                'event date', 'Event date')
    if eventDate is None:
        return None

    # We can have more than one host, so it is alright to have more than one cell populated
    hosts = _firstCellValue(file, actualKeywords['Hosts'], sheetFile,
                            'hosts', 'Hosts', warnOnMultiple=False)
    if hosts is None:
        return None

    eventName = _firstCellValue(file, actualKeywords['Event Name'], sheetFile,
                                'event name', 'Event name')
    if eventName is None:
        return None

    # Create a struct for the event
    event = Event(eventName, eventDate, villagePoints, hosts, sheetFile[1])

    students = file[actualKeywords['Student ID']].dropna().tolist()

    # Now that we have all of the information about the event, we should return all of the
    # students who attended and the information about the event
    return [event, students]

### Parsing all of the files (a loop over the previous method) is done inline in the final cell below

### The method to delete all of the downloaded files

In [None]:
def cleanUpDownloadFolder():
    """Delete every file directly inside the download folder.

    Sub-directories are left in place and logged. A missing download folder is
    reported and treated as already clean.
    """
    if not os.path.isdir(downloadFolder):
        log(f'Download folder {downloadFolder} does not exist, nothing to remove', debugMode)
        return

    files = os.listdir(downloadFolder)

    log(f'Removing {len(files)} files from {downloadFolder}', debugMode)

    removedCount = 0
    for f in files:
        path = f'{downloadFolder}/{f}'

        # os.remove cannot delete directories; skip them rather than aborting the clean-up
        if os.path.isdir(path) and not os.path.islink(path):
            log(f'Skipping directory {path}', debugMode)
            continue

        os.remove(path)
        removedCount += 1
        log(f'Removed file {path}', debugMode)

    # Report what was actually deleted, which can be fewer than what was listed
    log(f'Removed {removedCount} files', debugMode)

    return

## Finally, put it all together

In [None]:
# First, we want to setup our credentials
creds = setupAuthentication()

# Create an object to interact with Google Drive
service = build('drive', 'v3', credentials=creds)

# Fetch the team folder ID
teamFolderID = findTeamFolderID(service)

# Find the attendance folder (only possible when the team folder was found)
attendanceFolderID = None
if teamFolderID is not None:
    attendanceFolderID = findAttendanceFolderID(teamFolderID, service)

# List all of the sheets inside the attendance folder
sheetIDList = None
if attendanceFolderID is not None:
    sheetIDList = getSheetIDInFolder(attendanceFolderID)

# Download all of the sheets and get a list of file names
fileList = []
if sheetIDList:
    fileList = downloadSheets(sheetIDList, service)

# Maps each student ID to the list of events that student attended
studentList = {}

for sheetFile in fileList:
    results = parseSheet(sheetFile)
    if results is None:
        # parseSheet already logged why this file is being ignored
        continue

    event, attendees = results
    for s in attendees:
        # IDs come out of the spreadsheet as whatever type the cell had; normalise them to int
        try:
            studentID = int(s)
        except (TypeError, ValueError):
            log(f"Skipping non-numeric student ID {s!r} in {sheetFile[0]} ({sheetFile[2]})", debugMode)
            continue

        # Always keep a flat list of events per student, even for a single event
        studentList.setdefault(studentID, []).append(event)

# Nothing to clean when no download folder was ever created
if os.path.isdir(downloadFolder):
    cleanUpDownloadFolder()

for k, v in studentList.items():
    log(f"Student ID: {k}, Events: {v}", debugMode)

log("Execution completed", debugMode)

In [None]:
# Scratch cell: parse every downloaded sheet again and record each event under its attendees
for sheetFile in fileList:
    results = parseSheet(sheetFile)
    if results is None:
        continue

    event, attendees = results
    for s in attendees:
        # Normalise the ID so the same student is not stored under different keys
        try:
            studentID = int(s)
        except (TypeError, ValueError):
            continue

        previous = studentList.get(studentID)
        if previous is None:
            studentList[studentID] = [event]
        elif isinstance(previous, list):
            previous.append(event)
        else:
            # An Event is a namedtuple, so `previous + event` would concatenate the two
            # tuples field by field; wrap both in a list instead
            studentList[studentID] = [previous, event]

In [None]:
# Scratch cell: run the parser against local test workbooks instead of Drive downloads
for fileName in ['Career Studio Takeover.xlsx', 'Career Studio Takeover2.xlsx']:
    # parseSheet expects [path, Drive ID, display name]; a local file has no Drive ID,
    # so the file name stands in for all three
    test = parseSheet([fileName, fileName, fileName])
    if test is None:
        continue

    event, attendees = test
    for s in attendees:
        try:
            studentID = int(s)
        except (TypeError, ValueError):
            continue

        previous = studentList.get(studentID)
        if previous is None:
            studentList[studentID] = [event]
        elif isinstance(previous, list):
            previous.append(event)
        else:
            # Keep a flat list of events rather than nesting the earlier value
            studentList[studentID] = [previous, event]


print(studentList)

In [None]:
# Print one "<student ID>, <recorded events>" line per student
for studentID in studentList:
    recorded = studentList[studentID]
    print(f"{studentID}, {recorded}")