In [21]:
import pandas as pd
import numpy as np
import sqlalchemy as sa
from datetime import datetime

from collections import defaultdict
from json import JSONEncoder

from simple_salesforce import Salesforce
from os import environ
from dotenv import load_dotenv
load_dotenv()



True

In [2]:
# Every message is echoed to stdout and mirrored into TableETL.log,
# which is truncated each time this cell runs (mode "w").
logfile = open("TableETL.log","w")


def lprint(val):
    """Print ``val`` prefixed with the current timestamp and append the same line to the log file."""
    stamped = "[%s] %s" % (datetime.now(), val)
    print(stamped)
    # Flush right away so the file is up to date even if a later cell fails.
    logfile.write(stamped + "\n")
    logfile.flush()


lprint("Started...")

[2023-07-12 17:07:30.440996] Started...


In [3]:
# Salesforce credentials are read from environment variables; each value is
# None when the corresponding variable is not set.
sfusername, sfpassword, sfsecret, sfinstanceurl = (
    environ.get(name)
    for name in ('sfusername', 'sfpassword', 'sfsecret', 'sfinstanceurl')
)

# Warehouse connection string (an earlier revision read it from "connstr").
connstr = environ.get("KNOS_Datawarehouse")

In [4]:
# Build the SQLAlchemy engine from the connection string taken from the environment.
# NOTE(review): fast_executemany is presumably forwarded to the DB driver (pyodbc?) —
# confirm the connection string targets a driver that accepts it.
lprint("Creating engine")
engine = sa.create_engine(connstr, fast_executemany=True)

[2023-07-12 17:07:30.454959] Creating engine


In [22]:
# Salesforce object names to extract. Each name is used to fetch the object's
# field metadata, to build its SOQL query, and as the suffix of the CSV file
# and SQL table it is written to.
desiredTables = [
    'School_Program__c',
    'ConstituentRole',
    'ContactProfile',
    'Grade_Level__c',
    'Address',
    'AcademicTerm',
    'AcademicYear',
    'Display_School__c',
    'ContactContactRelation',
    'AccountContactRelation',
    'AccountContactRole',
    'AcademicTermEnrollment',
    'Account',
    'Contact'
]


In [6]:
# WHERE clause appended to each object's SOQL query. Objects without an
# explicit entry fall back to excluding deleted rows only.
soqlFilters = defaultdict(lambda: "where IsDeleted = false")
soqlFilters['AcademicTermEnrollment'] = 'where Active__c = true and IsDeleted = false'

In [7]:
# Open the Salesforce API session with the credentials read from the
# environment; the API version is pinned to 57.0.
lprint("Creating sessions...")
sf = Salesforce(username=sfusername, password=sfpassword, security_token=sfsecret, instance_url=sfinstanceurl, version='57.0')
lprint("Session created!")

[2023-07-12 17:07:30.502099] Creating sessions...
[2023-07-12 17:07:31.507475] Session created!


In [8]:
# metaData[table][field] -> {'type': ..., 'length': ...}, taken from each
# object's describe() result.
metaData = {}

for tbl in desiredTables:
    lprint("Getting metadata for %s" % tbl)

    tblDesc = getattr(sf, tbl).describe()

    metaData[tbl] = {
        field['name']: {'type': field['type'], 'length': field['length']}
        for field in tblDesc['fields']
    }
    


[2023-07-12 17:07:31.518033] Getting metadata for Contact


In [9]:
# outputData[table] holds the full query_all() response for that object.
outputData = {}

for tbl in desiredTables:
    lprint("Querying data for %s" % tbl)

    # Select every field known from the metadata, filtered by the table's WHERE clause.
    fieldList = ", ".join(metaData[tbl])
    soql = "select %s from %s %s" % (fieldList, tbl, soqlFilters[tbl])

    lprint("Starting query:  %s" % soql)
    outputData[tbl] = sf.query_all(soql)
    lprint("Finished %s" % tbl)
    

[2023-07-12 17:07:32.143248] Querying data for Contact
[2023-07-12 17:07:32.144247] Starting query:  select Id, IsDeleted, MasterRecordId, AccountId, IsPersonAccount, LastName, FirstName, Salutation, Name, RecordTypeId, OtherStreet, OtherCity, OtherState, OtherPostalCode, OtherCountry, OtherLatitude, OtherLongitude, OtherGeocodeAccuracy, OtherAddress, MailingStreet, MailingCity, MailingState, MailingPostalCode, MailingCountry, MailingLatitude, MailingLongitude, MailingGeocodeAccuracy, MailingAddress, Phone, Fax, MobilePhone, HomePhone, OtherPhone, AssistantPhone, ReportsToId, Email, Title, Department, AssistantName, LeadSource, Birthdate, Description, OwnerId, CreatedDate, CreatedById, LastModifiedDate, LastModifiedById, SystemModstamp, LastActivityDate, LastCURequestDate, LastCUUpdateDate, LastViewedDate, LastReferencedDate, EmailBouncedReason, EmailBouncedDate, IsEmailBounced, PhotoUrl, Jigsaw, JigsawContactId, IndividualId, DeceasedDate, Pronouns, GenderIdentity, Is_Student_in_Termi

In [10]:
# Report how many records each query returned.
for tbl, queryResult in outputData.items():
    lprint("%s totalRecords %d" % (tbl, queryResult['totalSize']))

[2023-07-12 17:07:40.854252] Contact totalRecords 7408


In [11]:
# Build one DataFrame per Salesforce object from the queried records and drop
# the per-record 'attributes' column when the response contains it.
lprint("Turning output data into dataframes")
dataFrames = {tbl: pd.DataFrame.from_dict(outputData[tbl]['records']) for tbl in desiredTables}

for key in dataFrames:

    if 'attributes' in dataFrames[key].columns:

        # Log the frame being modified. The previous message used `tbl`, a
        # leftover global from an earlier loop, so it always named the last
        # table regardless of which frame was being processed.
        lprint("Dropping attributes from %s" % key)
        dataFrames[key] = dataFrames[key].drop(columns=['attributes'])

    lprint("%s shape %s" % (key, dataFrames[key].shape))

[2023-07-12 17:07:40.863027] Turning output data into dataframes
[2023-07-12 17:07:41.184899] Dropping attributes from Contact
[2023-07-12 17:07:41.201858] Contact shape (7408, 93)


In [12]:
# Parse every column whose Salesforce field type is 'date' or 'datetime'.
for key in dataFrames.keys():
    frame = dataFrames[key]
    tableMeta = metaData[key]

    lprint("Converting datetimes for %s..." % key)

    for col in frame.columns:
        if tableMeta[col]['type'] not in ('date', 'datetime'):
            continue

        lprint("Converting %s to datetime..." % col)
        frame[col] = pd.to_datetime(frame[col])
            
    
    

[2023-07-12 17:07:41.211864] Converting datetimes for Contact...
[2023-07-12 17:07:41.211864] Converting Birthdate to datetime...
[2023-07-12 17:07:41.239764] Converting CreatedDate to datetime...
[2023-07-12 17:07:41.267996] Converting LastModifiedDate to datetime...
[2023-07-12 17:07:41.295591] Converting SystemModstamp to datetime...
[2023-07-12 17:07:41.324825] Converting LastActivityDate to datetime...
[2023-07-12 17:07:41.337791] Converting LastCURequestDate to datetime...
[2023-07-12 17:07:41.350864] Converting LastCUUpdateDate to datetime...
[2023-07-12 17:07:41.363419] Converting LastViewedDate to datetime...
[2023-07-12 17:07:41.377336] Converting LastReferencedDate to datetime...
[2023-07-12 17:07:41.391433] Converting EmailBouncedDate to datetime...
[2023-07-12 17:07:41.404353] Converting DeceasedDate to datetime...
[2023-07-12 17:07:41.417354] Converting Date_Promotion_Decision_was_Entered__c to datetime...
[2023-07-12 17:07:41.434443] Converting Re_Entry_Date__c to dateti

In [13]:
# Shared encoder instance; its encode() method is used to turn dict/list cell
# values into JSON strings.
jcoder = JSONEncoder()

In [14]:
# Serialise columns that hold dict/list values (e.g. compound address fields)
# to JSON strings and record the new type and observed maximum length in metaData.
for tbl in dataFrames.keys():
    for col in dataFrames[tbl].columns:

        column = dataFrames[tbl][col]
        isCompound = column.map(lambda a: isinstance(a, (dict, list)))

        if not isCompound.any():
            continue

        lprint("Ordered dict found in %s column %s converting..." % (tbl, col))

        # Encode only the dict/list cells. Encoding every cell turned missing
        # values into the literal strings 'null' / 'NaN' instead of leaving
        # them null for the CSV and SQL outputs.
        dataFrames[tbl][col] = column.map(
            lambda a: jcoder.encode(a) if isinstance(a, (dict, list)) else a
        )

        lprint("Updating type for %s column %s to JSON..." % (tbl, col))
        metaData[tbl][col]['type'] = 'JSON'

        # Longest encoded value; computed once and reused for the log line and
        # the metadata entry.
        fieldLen = int(dataFrames[tbl][col].str.len().max())

        lprint("Setting fieldlen to %d for %s" % (fieldLen, col))
        metaData[tbl][col]['length'] = fieldLen
            
            
            

[2023-07-12 17:07:41.559169] Ordered dict found in Contact column MailingAddress converting...
[2023-07-12 17:07:41.611021] Updating type for Contact column MailingAddress to JSON...
[2023-07-12 17:07:41.617016] Setting fieldlen to 203 for MailingAddress


In [15]:
# Salesforce field types that map directly to a fixed SQLAlchemy column type.
# The value is the type class; it is instantiated when the mapping is applied.
# Field types that are commented out (or absent) are not mapped here and are
# sized as NVARCHAR/TEXT by getSQLTypes instead.
staticFields = {
                 'boolean':sa.Boolean,
                 'date':sa.DATE,
                 'datetime':sa.DATETIME,
                 'double': sa.FLOAT,
                 #'email',
                 #'id',
                 'int':sa.INT,
                 #'multipicklist',
                 #'picklist',
                 #'reference',
                 #'string',
                 'textarea':sa.TEXT,
                 #'JSON':sa.JSON
}

#this is a mess
def getSQLTypes(tbl):
    """Return a ``{field name: SQLAlchemy type instance}`` mapping for ``tbl``.

    Field types listed in ``staticFields`` use the fixed type from that table.
    Every other field is sized from its metadata length:

    * declared length <= 255 -> ``NVARCHAR(declared length)``
    * declared length > 255 but the longest value present in the data
      is <= 255 -> ``NVARCHAR(observed length)``
    * otherwise -> ``TEXT``
    """
    lprint("Getting SQLTypes for %s" % tbl)

    sqlTypes = {}

    for field, desc in metaData[tbl].items():

        fixedType = staticFields.get(desc['type'])
        if fixedType is not None:
            sqlTypes[field] = fixedType()
            continue

        declaredLen = desc['length']
        if declaredLen <= 255:
            sqlTypes[field] = sa.NVARCHAR(declaredLen)
            continue

        # Some custom fields declare an oversized maximum length; when the
        # column has data, size it from the longest value present.
        column = dataFrames[tbl][field]
        if column.notna().any():
            observedLen = int(column.str.len().max())
            if observedLen <= 255:
                sqlTypes[field] = sa.NVARCHAR(observedLen)
                continue

        sqlTypes[field] = sa.TEXT()

    return sqlTypes
            
#getSQLTypes('AcademicTermEnrollment')


In [16]:
# Write one CSV per Salesforce object into the current working directory.
for tbl in desiredTables:
    frame = dataFrames[tbl]
    csvTblName = "SalesForceEduCloud_%s.csv" % tbl

    lprint("Saving %s to %s" % (tbl, csvTblName))
    frame.to_csv(csvTblName, index=False)
    lprint("Finished saving %s!" % tbl)

[2023-07-12 17:07:41.864901] Saving Contact to SalesForceEduCloud_Contact.csv
[2023-07-12 17:07:42.383799] Finished saving Contact!


In [20]:
# Load each non-empty frame into the 'etl' schema, replacing any existing table.
with engine.connect() as conn:

    for tbl in desiredTables:
        frame = dataFrames[tbl]

        # Tables whose query returned no rows are not uploaded.
        if len(frame) == 0:
            lprint("Skipping %s no data!" % tbl)
            continue

        sqlTypes = getSQLTypes(tbl)
        sqlTblName = "SalesForceEduCloud_%s" % tbl

        lprint("Uploading table %s to %s" % (tbl,  sqlTblName))
        frame.to_sql(
            sqlTblName,
            conn,
            schema='etl',
            if_exists='replace',
            index=False,
            dtype=sqlTypes,
            chunksize=1,
        )
        lprint("Finished uploading %s!" % tbl)

[2023-07-12 17:08:57.671670] Getting SQLTypes for Contact
[2023-07-12 17:08:57.688897] Uploading table Contact to testSalesForceEduCloud_Contact
[2023-07-12 17:09:16.601328] Finished uploading Contact!


In [None]:
# Final marker so the log shows the run reached the last step.
lprint("=============DONE!===================")

In [None]:
# Release the log file handle. lprint() writes to this handle, so it must not
# be called after this cell has run.
logfile.close()

In [None]:
# Plain print() rather than lprint(): the log file has already been closed.
print("Log file closed")