In [1]:
import six
import sys
sys.modules['sklearn.externals.six'] = six


In [2]:
import numpy as np
import pandas as pd
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import make_pipeline
from feature_engine import missing_data_imputers as mdi
from feature_engine import categorical_encoders as ce
from statsmodels.graphics.gofplots import qqplot
from sklearn.model_selection import train_test_split

%matplotlib inline

In [3]:
# Load the HR dataset. The `with` block closes the file handle on exit, so no
# explicit close() call is needed afterwards.
with open('HRDataset.csv') as f:
    df = pd.read_csv(f)


In [4]:
df.head()

Unnamed: 0,Employee_Name,EmpID,MarriedID,MaritalStatusID,GenderID,EmpStatusID,DeptID,PerfScoreID,FromDiversityJobFairID,PayRate,...,Department,ManagerName,ManagerID,RecruitmentSource,PerformanceScore,EngagementSurvey,EmpSatisfaction,SpecialProjectsCount,LastPerformanceReview_Date,DaysLateLast30
0,"Brown, Mia",1103024456,1,1,0,1,1,3,1,28.5,...,Admin Offices,Brandon R. LeBlanc,1.0,Diversity Job Fair,Fully Meets,2.04,2,6,1/15/19,0.0
1,"LaRotonda, William",1106026572,0,2,1,1,1,3,0,23.0,...,Admin Offices,Brandon R. LeBlanc,1.0,Website Banner Ads,Fully Meets,5.0,4,4,1/17/19,0.0
2,"Steans, Tyrone",1302053333,0,0,1,1,1,3,0,29.0,...,Admin Offices,Brandon R. LeBlanc,1.0,Internet Search,Fully Meets,3.9,5,5,1/18/19,0.0
3,"Howard, Estelle",1211050782,1,1,0,1,1,3,0,21.5,...,Admin Offices,Brandon R. LeBlanc,1.0,Pay Per Click - Google,Fully Meets,3.24,3,4,,
4,"Singh, Nan",1307059817,0,0,0,1,1,3,0,16.56,...,Admin Offices,Brandon R. LeBlanc,1.0,Website Banner Ads,Fully Meets,5.0,3,5,1/15/19,0.0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 310 entries, 0 to 309
Data columns (total 35 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Employee_Name               310 non-null    object 
 1   EmpID                       310 non-null    int64  
 2   MarriedID                   310 non-null    int64  
 3   MaritalStatusID             310 non-null    int64  
 4   GenderID                    310 non-null    int64  
 5   EmpStatusID                 310 non-null    int64  
 6   DeptID                      310 non-null    int64  
 7   PerfScoreID                 310 non-null    int64  
 8   FromDiversityJobFairID      310 non-null    int64  
 9   PayRate                     310 non-null    float64
 10  Termd                       310 non-null    int64  
 11  PositionID                  310 non-null    int64  
 12  Position                    310 non-null    object 
 13  State                       310 non

## Data Pre-Processing

In [6]:
# Show, per column, the number of distinct values next to the row count.
n_rows = len(df)
for col in df.columns:
    distinct = df[col].nunique()
    print(col, distinct, n_rows)

Employee_Name 310 310
EmpID 310 310
MarriedID 2 310
MaritalStatusID 5 310
GenderID 2 310
EmpStatusID 5 310
DeptID 6 310
PerfScoreID 4 310
FromDiversityJobFairID 2 310
PayRate 94 310
Termd 2 310
PositionID 30 310
Position 32 310
State 28 310
Zip 158 310
DOB 306 310
Sex 2 310
MaritalDesc 5 310
CitizenDesc 3 310
HispanicLatino 4 310
RaceDesc 6 310
DateofHire 99 310
DateofTermination 93 310
TermReason 17 310
EmploymentStatus 5 310
Department 6 310
ManagerName 21 310
ManagerID 23 310
RecruitmentSource 23 310
PerformanceScore 4 310
EngagementSurvey 188 310
EmpSatisfaction 5 310
SpecialProjectsCount 8 310
LastPerformanceReview_Date 42 310
DaysLateLast30 1 310


We can drop Employee_Name, EmpID and DOB as most of the values are unique
We can also drop DaysLateLast30 as the feature only contains one unique value

In [7]:
# Remove the (near-)unique identifier columns and the constant DaysLateLast30 column.
df.drop(columns=['Employee_Name', 'EmpID', 'DOB', 'DaysLateLast30'], inplace=True)

In [8]:
# Each column below duplicates information kept in another feature:
#   MaritalStatusID  -> MaritalDesc
#   EmpStatusID      -> EmploymentStatus
#   DeptID           -> Department
#   GenderID         -> Sex
#   PerformanceScore -> PerfScoreID
#   MarriedID        -> MaritalDesc
redundant_columns = ['MaritalStatusID', 'EmpStatusID', 'DeptID',
                     'GenderID', 'PerformanceScore', 'MarriedID']
df.drop(columns=redundant_columns, inplace=True)

In [9]:
# List the distinct values of every remaining column alongside the row count.
n_rows = len(df)
for col in df.columns:
    distinct_values = df[col].unique()
    print(col, distinct_values, n_rows)

PerfScoreID [3 1 4 2] 310
FromDiversityJobFairID [1 0] 310
PayRate [28.5  23.   29.   21.5  16.56 20.5  55.   56.   55.5  54.   57.   45.
 46.   63.5  65.   43.   48.5  40.1  34.   40.   35.5  41.   42.75 39.55
 42.2  30.2  31.4  60.   62.   21.   63.   64.   28.99 26.   27.49 42.
 37.   39.   27.   47.   28.   49.1  80.   54.5  50.5  51.   53.   38.5
 52.   33.5  16.   20.   18.   24.   15.   22.   24.75 17.   16.75 19.
 19.5  14.   16.76 15.75 21.25 24.5  18.5  19.75 15.25 23.5  15.2  25.
 24.25 26.1  26.39 28.75 22.5  60.25 50.25 52.25 47.6  57.12 55.51 49.25
 48.   45.42 34.95 61.3  58.2  58.5  54.1  56.2  53.8  55.2 ] 310
Termd [0 1] 310
PositionID [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 30 29 28] 310
Position ['Accountant I' 'Administrative Assistant' 'Area Sales Manager'
 'BI Developer' 'BI Director' 'CIO' 'Data Architect'
 'Database Administrator' 'Data Analyst' 'Data Analyst '
 'Director of Operations' 'Director of Sales' 'IT Director

In [10]:
# Relabel the 0/1 flags FromDiversityJobFairID and Termd as 'no'/'yes', and
# normalise the mixed-case labels of HispanicLatino.

diversity_map = {1: 'yes', 0: 'no'}
termd_map = {1: 'yes', 0: 'no'}
hispanic_latino_map = {'No': 'no', 'Yes': 'yes', 'no': 'no', 'yes': 'yes'}

# Assign the result back instead of calling replace(inplace=True) on a column
# selection: the in-place form operates on an intermediate object and is not
# guaranteed to update df (it has no effect under pandas Copy-on-Write).
df['FromDiversityJobFairID'] = df['FromDiversityJobFairID'].replace(diversity_map)
df['Termd'] = df['Termd'].replace(termd_map)
df['HispanicLatino'] = df['HispanicLatino'].replace(hispanic_latino_map)


In [11]:
df[['PositionID', 'Position']].sort_values('PositionID')[50:70]

Unnamed: 0,PositionID,Position
50,9,Data Analyst
51,9,Data Analyst
52,9,Data Analyst
53,10,Director of Operations
54,11,Director of Sales
55,12,IT Director
56,13,IT Manager - DB
57,13,IT Manager - DB
58,13,IT Manager - Infra
59,13,IT Manager - Support


We can drop PositionID as it's just duplicate data from Position.

In [12]:
# PositionID carries the same information as Position, so remove it.
df.drop(columns='PositionID', inplace=True)

In [13]:
df[['ManagerName', 'ManagerID']].sort_values(by='ManagerID').tail(50)

Unnamed: 0,ManagerName,ManagerID
27,Lynn Daneault,21.0
28,Lynn Daneault,21.0
29,Lynn Daneault,21.0
30,Lynn Daneault,21.0
31,Lynn Daneault,21.0
32,Lynn Daneault,21.0
26,Lynn Daneault,21.0
206,Michael Albert,22.0
198,Michael Albert,22.0
199,Michael Albert,22.0


We can drop ManagerID as it's a duplicate of ManagerName; ManagerID also has a few missing values, which ManagerName doesn't have.

In [14]:
# ManagerID duplicates ManagerName, so remove it.
df.drop(columns='ManagerID', inplace=True)

In [15]:
# Some Position and Department values carry leading/trailing spaces; strip them
# so that otherwise identical labels are not counted as separate categories.
for text_col in ['Position', 'Department']:
    df[text_col] = df[text_col].str.strip()

In [16]:
# Parse the three date columns into pandas datetimes.
for date_col in ['DateofHire', 'DateofTermination', 'LastPerformanceReview_Date']:
    df[date_col] = pd.to_datetime(df[date_col])

In [17]:
# Derive calendar-part features from each date column. Every source column
# yields the same six features, named with the prefix paired with it below.
date_sources = [
    ('DateofHire', 'DateofHire'),
    ('DateofTermination', 'DateofTerm'),
    ('LastPerformanceReview_Date', 'LastPerform'),
]
for source_col, prefix in date_sources:
    parts = df[source_col].dt
    df[prefix + '_month'] = parts.month
    df[prefix + '_day'] = parts.day
    df[prefix + '_year'] = parts.year
    df[prefix + '_quarter'] = parts.quarter
    df[prefix + '_day_week'] = parts.day_name()
    # NOTE(review): despite the '_weekday' name, this flag is 'yes' when the
    # date falls on Saturday or Sunday and 'no' otherwise (including missing
    # dates) -- confirm that this is the intended meaning.
    on_sat_or_sun = df[prefix + '_day_week'].isin(['Sunday', 'Saturday'])
    df[prefix + '_weekday'] = np.where(on_sat_or_sun, 'yes', 'no')

# Durations: hire-to-termination, hire-to-today and last-review-to-today.
df['tenure_termed'] = df['DateofTermination'] - df['DateofHire']
df['tenure'] = datetime.datetime.today() - df['DateofHire']
df['days_since_review'] = datetime.datetime.today() - df['LastPerformanceReview_Date']

In [28]:
df.head()

Unnamed: 0,PerfScoreID,FromDiversityJobFairID,PayRate,Termd,Position,State,Zip,Sex,MaritalDesc,CitizenDesc,...,DateofTerm_weekday,LastPerform_month,LastPerform_day,LastPerform_year,LastPerform_quarter,LastPerform_day_week,LastPerform_weekday,tenure_termed,tenure,days_since_review
0,3,yes,28.5,no,Accountant I,MA,1450,F,Married,US Citizen,...,no,1.0,15.0,2019.0,1.0,Tuesday,no,,4351,619.0
1,3,no,23.0,no,Accountant I,MA,1460,M,Divorced,US Citizen,...,no,1.0,17.0,2019.0,1.0,Thursday,no,,2454,617.0
2,3,no,29.0,no,Accountant I,MA,2703,M,Single,US Citizen,...,no,1.0,18.0,2019.0,1.0,Friday,no,,2188,616.0
3,3,no,21.5,yes,Administrative Assistant,MA,2170,F,Married,US Citizen,...,no,,,,,,no,58.0,2048,
4,3,no,16.56,no,Administrative Assistant,MA,2330,F,Single,US Citizen,...,no,1.0,15.0,2019.0,1.0,Tuesday,no,,1974,619.0


In [31]:
# The raw date columns are no longer needed now that their parts have been
# extracted into separate features.
raw_date_columns = ['DateofHire', 'DateofTermination', 'LastPerformanceReview_Date']
df.drop(columns=raw_date_columns, inplace=True)

In [19]:
# Keep only the whole-day count of the days_since_review duration.
# `.dt.days` reads the day component directly, so the result is correct for
# any number of digits (taking the first three characters of the string form
# truncates durations of 1000 days or more). Missing reviews become NaN.
# Assumes the column still holds timedeltas, as created from the date columns.
df['days_since_review'] = df['days_since_review'].dt.days

In [21]:
# Keep only the whole-day count of the tenure duration.
# `.dt.days` avoids depending on the day count having exactly four digits,
# which a fixed-width slice of the string form requires.
# Assumes the column still holds timedeltas, as created from the date columns.
df['tenure'] = df['tenure'].dt.days
  

In [23]:
# Keep only the whole-day count of the hire-to-termination duration.
# Slicing the first two characters of the string form cuts any duration of
# 100 days or more down to its leading two digits; `.dt.days` returns the full
# day count, and NaN where there is no termination date.
# Assumes the column still holds timedeltas, as created from the date columns.
df['tenure_termed'] = df['tenure_termed'].dt.days

In [27]:
# Missing dates/durations that were converted to text appear as the literal
# strings 'NaT' (or 'Na' when cut to two characters) instead of np.nan.
# Turn them into real missing values. Each column is assigned back because
# replace(inplace=True) on a column selection is not guaranteed to update df
# (it has no effect under pandas Copy-on-Write).
for var in df.columns:
    df[var] = df[var].replace(to_replace=['NaT', 'Na'], value=np.nan)
    


In [None]:
# Cardinality: the number of unique values/categories of each feature.
# A feature in which a single category holds 90%+ of the values has too little
# variability and can be removed.
n_rows = len(df)
for var in df.columns:
    shares = df[var].value_counts() / n_rows
    print(var, '\n', shares)

In [29]:
# These features showed too little variability in their values to be worth keeping.
low_variability = [
    'CitizenDesc',
    'DateofHire_weekday',
    'DateofTerm_weekday',
    'LastPerform_quarter',
    'LastPerform_weekday',
    'LastPerform_year',
]
df.drop(columns=low_variability, inplace=True)

In [34]:
# Feature names grouped by the kind of variable they hold.
continuous = ['PayRate', 'EngagementSurvey']

discrete = [
    'SpecialProjectsCount', 'tenure', 'tenure_termed',
    'days_since_review', 'PerfScoreID', 'EmpSatisfaction',
]

categorical = [
    # employee attributes
    'FromDiversityJobFairID', 'Termd', 'Position', 'State', 'Zip', 'Sex',
    'MaritalDesc', 'HispanicLatino', 'RaceDesc', 'TermReason',
    'EmploymentStatus', 'Department', 'ManagerName', 'RecruitmentSource',
    # calendar parts derived from the date columns
    'DateofHire_month', 'DateofHire_day', 'DateofHire_quarter',
    'DateofHire_day_week', 'DateofTerm_month', 'DateofTerm_day',
    'DateofTerm_quarter', 'DateofTerm_day_week', 'LastPerform_month',
    'LastPerform_day', 'LastPerform_day_week', 'DateofHire_year',
    'DateofTerm_year',
]

In [None]:
df.info()

In [None]:
# Let's now examine the share of each category within every categorical feature.
# This shows which features have rare categories that can be grouped together
# under a single 'rare' label.
n_rows = len(df)
for var in categorical:
    shares = df[var].value_counts() / n_rows
    print(var, '\n', shares)

We encode categories of low frequency as 'rare' in order to help with cardinality which will help when encoding our categorical features (especially when using one-hot encoding).  Secondly, sometimes when we split our data into train/test the training set wouldn't 'see' or fit a rare category and when the category appears in the test set an error will be thrown. 
- Position: Anything less than 1% will be considered ‘rare’
- State: Anything less than 1% will be considered ‘rare’
- Zip: Anything less than 1% will be considered ‘rare’
- RaceDesc: Anything less than 2% will be considered ‘rare’
- ManagerName: Anything less than 5% will be considered ‘rare’
- RecruitmentSource: Anything less than 2% will be considered ‘rare’
- DateofHire_day: Anything less than 1% will be considered ‘rare’
- DateofTerm_month: Anything less than 2% will be considered ‘rare’
- DateofTerm_day: Anything less than 1% will be considered ‘rare’
- LastPerform_day: Anything less than 2% will be considered ‘rare’
- LastPerform_day_week: Anything less than 1% will be considered ‘rare’
- DateofHire_year: Anything less than 2% will be considered ‘rare’
- DateofTerm_year: Anything less than 1% will be considered ‘rare’


In [None]:
df.info()

### Missing Values

In [38]:
# Report the fraction of missing values for every column that has any.
for var in df.columns:
    missing_share = df[var].isnull().mean()
    if missing_share > 0:
        print(var, missing_share.round(3))

TermReason 0.003
DateofTerm_month 0.668
DateofTerm_day 0.668
DateofTerm_year 0.668
DateofTerm_quarter 0.668
DateofTerm_day_week 0.668
LastPerform_month 0.332
LastPerform_day 0.332
LastPerform_day_week 0.332
tenure_termed 0.668
days_since_review 0.332


One major difference between categorical and numerical features is whether the magnitudes of the numbers are comparable, i.e., is 2019 bigger than 2018, or is December (12) bigger than March (3)? Not really. While there is a sequential order in these numbers, their magnitudes are not meaningfully comparable. Thus, treating them as categorical values may make more sense.

- TermReason: impute with mode
- DateofTerm_month: impute with 'missing' to create a new category
- DateofTerm_day: impute with 'missing' to create a new category
- DateofTerm_year: impute with 'missing' to create a new category
- DateofTerm_quarter: impute with 'missing' to create a new category
- DateofTerm_day_week: impute with 'missing' to create a new category
- LastPerform_month: impute with 'missing' to create a new category
- LastPerform_day: impute with 'missing' to create a new category
- LastPerform_day_week: impute with 'missing' to create a new category
- tenure_termed: drop due to large number of missing data 
- days_since_review: arbitrary imputation along with a missing indicator feature


In [39]:
# tenure_termed is dropped rather than imputed.
df.drop(columns='tenure_termed', inplace=True)

In [43]:
# converting feature types
# Categorical features are stored with the generic object dtype.
df[categorical] = df[categorical].astype(object)

# The discrete features still present in df are parsed as numbers (entries that
# cannot be parsed become missing) and stored with the nullable Int64 dtype.
discrete_cols = ['SpecialProjectsCount', 'tenure', 'days_since_review',
                 'PerfScoreID', 'EmpSatisfaction']
for col in discrete_cols:
    as_number = pd.to_numeric(df[col], errors='coerce')
    df[col] = as_number.astype(pd.Int64Dtype())

In [51]:
def outlier_treatment(feature, whisker=1.5):
    """Return the (lower, upper) interquartile-range fences for `feature`.

    Values below the lower fence or above the upper fence are the ones
    conventionally flagged as outliers. The function only computes the
    fences; it does not modify or filter `feature`.

    Parameters
    ----------
    feature : array-like of numbers
        The values to summarise (e.g. a DataFrame column).
    whisker : float, default 1.5
        Multiple of the interquartile range placed beyond each quartile.

    Returns
    -------
    tuple
        ``(lower_range, upper_range)`` where
        ``lower_range = q1 - whisker * IQR`` and
        ``upper_range = q3 + whisker * IQR``.
    """
    # np.percentile does not need pre-sorted input, so no separate sort is done.
    q1, q3 = np.percentile(feature, [25, 75])
    iqr = q3 - q1
    lower_range = q1 - (whisker * iqr)
    upper_range = q3 + (whisker * iqr)
    return lower_range, upper_range

In [55]:
lower_range, upper_range = outlier_treatment(df['EngagementSurvey'])

In [56]:
df[(df['EngagementSurvey'] < lower_range) | (df['EngagementSurvey'] > upper_range)]

Unnamed: 0,PerfScoreID,FromDiversityJobFairID,PayRate,Termd,Position,State,Zip,Sex,MaritalDesc,HispanicLatino,...,DateofTerm_month,DateofTerm_day,DateofTerm_year,DateofTerm_quarter,DateofTerm_day_week,LastPerform_month,LastPerform_day,LastPerform_day_week,tenure,days_since_review


In [57]:
# Compute the IQR fences for PayRate once, then show the rows that fall outside them.
lower_range, upper_range = outlier_treatment(df['PayRate'])

df[(df['PayRate'] < lower_range) | (df['PayRate'] > upper_range)]

Unnamed: 0,PerfScoreID,FromDiversityJobFairID,PayRate,Termd,Position,State,Zip,Sex,MaritalDesc,HispanicLatino,...,DateofTerm_month,DateofTerm_day,DateofTerm_year,DateofTerm_quarter,DateofTerm_day_week,LastPerform_month,LastPerform_day,LastPerform_day_week,tenure,days_since_review


In [None]:
# If we want to remove outliers
# identify the outliers in the chosen feature (feature_name)

# outliers_feature_name = np.where(df[feature_name] 
#  > upper_boundary, True, np.where(df[feature_name] < lower_boundary, True, False))

# trim the df of the outliers
# df = df.loc[~(outliers_feature_name)]

In [61]:
# A model that needs no train/test split (e.g. clustering) can have the
# pipeline run on the entire dataset.
# A model that does need a split (e.g. regression/classification) must have the
# pipeline fitted on the training data only and then applied to both the
# training and the test data, so that nothing leaks from the test set.
# Here we assume a regression model with EngagementSurvey as the target.
y = df['EngagementSurvey']
X = df.drop(columns='EngagementSurvey')

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


### Target or Mean Encoding
- Target encoding does not add new features, unlike one-hot encoding, which greatly expands the number of features if you have categorical features with many categories.
- This method replaces the categories with digits from 0 to k-1. We calculate the mean for the target variable for each category for each categorical feature and then the means are replaced with aforementioned digits based on their size.  For example, we have a binary target and the first categorical feature is gender and it has three categories (male, female, not-disclosed).  Let's assume the mean for male is 0.8, female is 0.5 and undisclosed is 0.2.  The encoded values will be males=2, females=1 and undisclosed=0 

# Data Processing Pipeline

In [62]:
# Pre-processing pipeline: impute missing values, flag them, group rare
# categories, then ordinally encode the categorical features by target mean.
process_pipe = make_pipeline(
    # Categorical features with more than 5% missing values get a new category 'missing'
    mdi.CategoricalVariableImputer(variables=['DateofTerm_month', 'DateofTerm_day',
                                              'DateofTerm_quarter', 'DateofTerm_day_week',
                                              'LastPerform_month', 'LastPerform_day',
                                              'LastPerform_day_week', 'DateofTerm_year'],
                                   imputation_method='missing'),
    # Categorical features with less than 5% missing values are imputed with the mode
    mdi.CategoricalVariableImputer(variables=['TermReason'], imputation_method='frequent'),
    # Add a binary indicator marking the records where 'days_since_review' is
    # missing. This step must run BEFORE the value is imputed: once the gaps
    # are filled there is nothing left for the indicator to detect.
    mdi.AddMissingIndicator(variables=['days_since_review']),
    # Fill the missing 'days_since_review' values with an arbitrary sentinel number
    mdi.ArbitraryNumberImputer(arbitrary_number=-99999, variables='days_since_review'),
    # Encoding rare categories (less than 1% & the feature must have at least 5 categories)
    ce.RareLabelCategoricalEncoder(tol=0.01, n_categories=5,
                                   variables=['State']),
    # Encoding rare categories (less than 2% & the feature must have at least 5 categories)
    ce.RareLabelCategoricalEncoder(tol=0.02, n_categories=5,
                                   variables=['Position', 'Zip', 'DateofTerm_day',
                                              'LastPerform_day_week', 'DateofTerm_year',
                                              'RaceDesc', 'TermReason', 'RecruitmentSource',
                                              'DateofHire_day', 'DateofTerm_month',
                                              'LastPerform_day', 'DateofHire_year']),
    # Encoding rare categories (less than 5% & the feature must have at least 5 categories)
    ce.RareLabelCategoricalEncoder(tol=0.05, n_categories=5,
                                   variables=['ManagerName']),
    # Target or Mean encoding for categorical features.
    # 'DateofHire_year' is included (in place of a second 'DateofHire_day'
    # entry) so that the rare-label-encoded hire year is encoded as well.
    ce.OrdinalCategoricalEncoder(encoding_method='ordered',
                                 variables=['FromDiversityJobFairID', 'Termd', 'Position',
                                            'State', 'Zip', 'Sex', 'MaritalDesc',
                                            'HispanicLatino', 'RaceDesc', 'TermReason',
                                            'EmploymentStatus', 'Department', 'ManagerName',
                                            'RecruitmentSource', 'DateofHire_month',
                                            'DateofHire_day', 'DateofHire_year',
                                            'DateofHire_quarter', 'DateofHire_day_week',
                                            'DateofTerm_month', 'DateofTerm_day',
                                            'DateofTerm_year', 'DateofTerm_quarter',
                                            'DateofTerm_day_week', 'LastPerform_month',
                                            'LastPerform_day', 'LastPerform_day_week']),
)

In [63]:
process_pipe.fit(X_train, y_train)

Pipeline(steps=[('categoricalvariableimputer-1',
                 CategoricalVariableImputer(variables=['DateofTerm_month',
                                                       'DateofTerm_day',
                                                       'DateofTerm_quarter',
                                                       'DateofTerm_day_week',
                                                       'LastPerform_month',
                                                       'LastPerform_day',
                                                       'LastPerform_day_week',
                                                       'DateofTerm_year'])),
                ('categoricalvariableimputer-2',
                 CategoricalVariableImputer(imputation_method='frequent',
                                            variables=['Ter...
                                                      'HispanicLatino',
                                                      'RaceDesc', 'TermReason',
    

In [64]:
X_train_clean = process_pipe.transform(X_train)
X_test_clean = process_pipe.transform(X_test)

  "NaN values were introduced in the returned dataframe by the encoder."


In [66]:
X_train_clean.head()

Unnamed: 0,PerfScoreID,FromDiversityJobFairID,PayRate,Termd,Position,State,Zip,Sex,MaritalDesc,HispanicLatino,...,DateofTerm_month,DateofTerm_day,DateofTerm_year,DateofTerm_quarter,DateofTerm_day_week,LastPerform_month,LastPerform_day,LastPerform_day_week,tenure,days_since_review
126,3,1,20.0,0,4,1,3,1,2,0,...,3,2,2,1,3,2,3,3,3405,624
109,3,1,16.0,0,4,1,3,1,1,0,...,3,2,2,1,3,2,11,6,2594,620
247,4,1,22.0,0,2,1,0,0,1,0,...,3,2,2,1,3,2,8,6,2188,606
234,3,1,26.0,0,2,1,3,1,2,0,...,3,2,2,1,3,2,11,6,3098,620
202,3,1,15.25,1,4,1,1,1,2,0,...,0,1,3,0,1,1,6,5,3370,-99999
