In [1]:
import pandas as pd
import numpy  as np

In [137]:
# load in data, specify dtype of ambiguous data
df1 = pd.read_csv("data/JobOper.csv", encoding="latin1", dtype={'JobNum': str, 'PurPoint': str, 'WIName': str})
df2 = pd.read_csv("data/jobHead.csv", encoding="latin1", dtype={'JobNum': str, 'EngineerCode_c': str})
df3 = pd.read_csv("data/LaborDtl.csv", encoding="latin1", dtype={'JobNum': str})

In [138]:
# keep only completed Jobs
df1 = df1[df1.JobComplete == True]
df2 = df2[df2.JobComplete == True]

#### Look at available columns in JobOper Table

In [80]:
cols = sorted(df1.columns)

print('JobOper Table\n------------')
for i in range(0, len(cols) - 5, 5):
    print('{:20}{:20}{:20}{:20}{:20}'.format(*cols[i:i+5]))

JobOper Table
------------
ActBurCost          ActLabCost          ActProdHours        AddedOper           AssemblySeq         
CommentText         Company             DaysOut             Description         DocUnitPrice        
DueDate             DueHour             EstProdHours        EstScrap            EstScrapType        
EstSetHours         EstSetHoursPerMch   EstUnitCost         HoursPerMachine     IUM                 
JobComplete         JobEngineered       JobNum              LaborEntryMethod    LastLaborDate       
LoadDate            LoadHour            Machines            MoveDueDate         MoveDueHour         
OpCode              OpComplete          OpDesc              OprSeq              OpsPerPart          
PartNum             PrimaryProdOpDtl    PrimarySetupOpDtl   ProdBurRate         ProdComplete        
ProdCrewSize        ProdLabRate         ProdStandard        PurPoint            QtyCompleted        
QtyPer              QueStartDate        QueStartHour        Revi

In [87]:
cols = sorted(df3.columns)

print('LaborDtl Table\n------------')
for i in range(0, len(cols) - 5, 5):
    print('{:20}{:20}{:20}{:20}{:20}'.format(*cols[i:i+5]))

LaborDtl Table
------------
ActiveTrans         ClockInDate         ClockInMInute       ClockOutMinute      ClockOutTime        
ClockinTime         Company             Complete            CreateDate          Downtime            
EarnedHrs           JobNum              LaborDtlSeq         LaborEntryMethod    LaborHedSeq         
LaborHrs            LaborQty            LaborRate           LaborType           OpCode              
OpComplete          OprSeq              ResourceGrpID       ResourceID          SetupPctComplete    


In [88]:
cols = sorted(df2.columns)

print('JobHead Table\n------------')
for i in range(0, len(cols) - 5, 5):
    print('{:20}{:20}{:20}{:20}{:20}'.format(*cols[i:i+5]))

JobHead Table
------------
AnalysisCode        BasePartNum         BaseRevisionNum     CallLine            CallNum             
Candidate           CheckOff1           CheckOff2           CheckOff3           CheckOff4           
CheckOff5           CloseMeterReading   ClosedDate          CommentText         Company             
ContractID          CreateDate          CreatedBy           CustID              CustName            
Customer_c          DatePurged          DaysLate            DrawNum             DtlsWithinLeadTime  
DueDate             DueHour             EPMExportLevel      EngineerCode_c      EngineeringApproval_c
EngineeringReview_c EquipID             ExpenseCode         ExportRequested     ExternalMES         
ForeignSysRowID     Forward             GroupSeq            HDCaseNum           IUM                 
InCopyList          InitialReview_c     IsCSRSet            IssueTopicID1       IssueTopicID10      
IssueTopicID2       IssueTopicID3       IssueTopicID4       Iss

### Processing LaborDtl table: Get Actual StartDates

In [139]:
# Get Actual Start Dates from labor details
df3 = df3.loc[:, ['JobNum', 'CreateDate']].groupby('JobNum', as_index=False).apply(min)

### Processing JobHead table: getting scheduled dates

In [159]:
# drop jobs without any schedule
df = df2.copy()
df = df.dropna(axis=0, subset=['StartDate', 'JobCompletionDate'])

df['SchedDays'] = df.apply(lambda x: len(pd.bdate_range(x.StartDate, x.DueDate)), axis=1)

# Simplify df2 to just job dates
df = df.loc[:, ['JobNum', 'SchedDays', 'JobCompletionDate']]

# Join the Actual Start Dates to df2
df = df.merge(df3, how='inner', on='JobNum')
df['ActualDays'] = df.apply(lambda x: len(pd.bdate_range(x.CreateDate, x.JobCompletionDate)), axis=1)
df = df.drop(labels=['JobCompletionDate', 'CreateDate'], axis=1)
df.head()

Unnamed: 0,JobNum,SchedDays,ActualDays
0,70398,26,45
1,80586,44,42
2,68912,33,43
3,80917,23,14
4,80639,16,10


### Create a few features from JobOper 

In [160]:
# Turn these columns into strings
df1['JobNum'] = df1.JobNum.map(str)
df1['AssemblySeq'] = df1.AssemblySeq.map(str)
df1['OprSeq'] = df1.OprSeq.map(str)

In [161]:
df['JobNum'] = df.JobNum.map(str)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4413 entries, 0 to 4412
Data columns (total 3 columns):
JobNum        4413 non-null object
SchedDays     4413 non-null int64
ActualDays    4413 non-null int64
dtypes: int64(2), object(1)
memory usage: 137.9+ KB


In [162]:
# Number of operations on the job
num_operations = df1.groupby('JobNum', as_index=False).size()
num_operations = num_operations.to_frame(name='OpCount').reset_index()

# company category
company = df1[['JobNum', 'Company']].groupby('JobNum', as_index=False).first()

# Number of Production hours to complete
tot_production = df1.loc[:, ['JobNum', 'EstProdHours']].groupby('JobNum', as_index=False).sum().rename({'EstProdHours':'TotalProdHours'})

# Number of setup hours
tot_setupHours = df1.loc[:, ['JobNum', 'EstSetHours']].groupby('JobNum', as_index=False).sum().rename({'EstSetHours':'TotalSetupHours'})

# total lead time for all subcontract operations
tot_subleadtime = df1.loc[:, ['JobNum', 'DaysOut']].groupby('JobNum', as_index=False).sum().rename({'DaysOut':'TotalSubLeadTime'})

# total costs of subcontract operations
tot_subconCosts = df1.loc[:, ['JobNum', 'EstUnitCost']].groupby('JobNum', as_index=False).sum().rename({'EstUnitCost':'TotalSubCost'})

# number of subcontract operations
num_subcontract = df1.loc[:, ['JobNum', 'SubContract']].groupby('JobNum', as_index=False).sum().rename({'SubContract':'TotalSubContractOps'})

### Combine all features

In [163]:
df = df.merge(company, how='left', on='JobNum')
df = df.merge(num_operations, how='left', on='JobNum')
df = df.merge(tot_production, how='left', on='JobNum')
df = df.merge(tot_setupHours, how='left', on='JobNum')
df = df.merge(tot_subleadtime, how='left', on='JobNum')
df = df.merge(tot_subconCosts, how='left', on='JobNum')
df = df.merge(num_subcontract, how='left', on='JobNum')

df.head(10)

Unnamed: 0,JobNum,SchedDays,ActualDays,Company,OpCount,EstProdHours,EstSetHours,DaysOut,EstUnitCost,SubContract
0,70398,26,45,HEM,14.0,21.04,2.55,11.0,0.49593,3.0
1,80586,44,42,HEM,12.0,9.25,3.5,29.0,7.99764,4.0
2,68912,33,43,HEM,13.0,36.47,10.25,3.0,0.0,1.0
3,80917,23,14,HEM,7.0,16.41,2.84,8.0,12.74412,2.0
4,80639,16,10,HEM,4.0,3.5,0.0,10.0,0.35,1.0
5,70116,40,55,HEM,4.0,13.5,3.0,0.0,0.0,0.0
6,021088-1,8,5,SMC,7.0,88.02,0.5,2.0,0.0,1.0
7,021365,10,22,,,,,,,
8,020659,24,82,,,,,,,
9,81725,5,2,HEM,4.0,6.89,2.0,0.0,0.0,0.0


In [164]:
print('Total Rows:', df.shape[0])

Total Rows: 4413


In [165]:
df.describe()

Unnamed: 0,SchedDays,ActualDays,OpCount,EstProdHours,EstSetHours,DaysOut,EstUnitCost,SubContract
count,4413.0,4413.0,3025.0,3025.0,3025.0,3025.0,3025.0,3025.0
mean,27.554725,39.112169,6.72,28.401881,2.383617,10.28958,4.712025,1.246612
std,142.780306,47.654519,4.139189,109.950878,3.112705,22.727979,19.706281,1.311582
min,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,6.0,7.0,3.0,3.25,0.5,0.0,0.0,0.0
50%,15.0,25.0,6.0,8.64,2.0,5.0,0.0,1.0
75%,32.0,54.0,9.0,22.1,3.19,13.0,1.96,2.0
max,9133.0,553.0,32.0,3939.11,102.0,535.0,594.6769,10.0


In [166]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4413 entries, 0 to 4412
Data columns (total 10 columns):
JobNum          4413 non-null object
SchedDays       4413 non-null int64
ActualDays      4413 non-null int64
Company         3025 non-null object
OpCount         3025 non-null float64
EstProdHours    3025 non-null float64
EstSetHours     3025 non-null float64
DaysOut         3025 non-null float64
EstUnitCost     3025 non-null float64
SubContract     3025 non-null float64
dtypes: float64(6), int64(2), object(2)
memory usage: 379.2+ KB


In [167]:
df = df.dropna()

In [168]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3025 entries, 0 to 4412
Data columns (total 10 columns):
JobNum          3025 non-null object
SchedDays       3025 non-null int64
ActualDays      3025 non-null int64
Company         3025 non-null object
OpCount         3025 non-null float64
EstProdHours    3025 non-null float64
EstSetHours     3025 non-null float64
DaysOut         3025 non-null float64
EstUnitCost     3025 non-null float64
SubContract     3025 non-null float64
dtypes: float64(6), int64(2), object(2)
memory usage: 260.0+ KB


In [169]:
df.to_csv('features.csv', sep=',', encoding='utf-8')