In [8]:
from datetime import datetime
import pandas as pd
import pickle
from sklearn.preprocessing import LabelEncoder
import xgboost as xg
from sklearn.model_selection import train_test_split

In [9]:
# Load the FY2019 salary table and normalise the column names to lower case
# so later cells can refer to them consistently.
data = pd.read_csv('baltimore-city-employee-salaries-fy2019.csv')
data.columns = data.columns.str.lower()

# Mean gross pay over all non-missing rows; used as the imputation value below.
# NOTE(review): this mean still includes the 0.0 / 1.0 rows that are replaced
# next (presumably placeholder values) -- confirm whether they should be
# excluded before averaging.
gross_mean = data['gross'].mean()

# Replace 0.0 / 1.0 and missing gross values with the mean. The result is
# assigned back to the column: calling fillna(..., inplace=True) on the
# temporary Series returned by `.loc[:, col]` is not guaranteed to modify
# `data` (and never does under pandas Copy-on-Write).
data['gross'] = data['gross'].replace([0.0, 1.0], gross_mean).fillna(gross_mean)

# Missing hire dates default to 2020-01-01; assigned back for the same reason.
data['hire_dt'] = data['hire_dt'].fillna(datetime(2020, 1, 1))


In [10]:
# Normalise the text columns before encoding.
# Job titles: drop leading/trailing whitespace.
data['jobtitle'] = data['jobtitle'].str.strip()
# Department ids: keep only the first three characters.
data['deptid'] = data['deptid'].str[:3]
# Department descriptions: strip a trailing suffix made of whitespace followed by
#   - a lone "(" or space,
#   - a three-digit code in parentheses, e.g. " (123)", or
#   - an unclosed "(" plus a single digit, e.g. " (1".
# The pattern is a raw string so `\s`, `\(` and `\d` reach the regex engine
# unchanged instead of being treated as (invalid) string escape sequences.
data['descr'] = data['descr'].str.replace(r'(\s+[( ]$|\s+\(\d{3}\)$|\s+\(\d{1}$)', '', regex=True)
data.head()


Unnamed: 0,name,jobtitle,deptid,descr,hire_dt,annual_rt,gross
0,"Aaron,Kareem D",Utilities Inst Repair I,A50,DPW-Water & Waste Water,08/27/2018 12:00:00 AM,32470.0,25743.94
1,"Aaron,Patricia G",Facilities/Office Services II,A03,OED-Employment Dev,10/24/1979 12:00:00 AM,60200.0,57806.13
2,"Abadir,Adam O",Council Technician,A02,City Council,12/12/2016 12:00:00 AM,64823.0,64774.11
3,"Abaku,Aigbolosimuan O",Police Officer,A99,Police Department,04/17/2018 12:00:00 AM,53640.0,59361.55
4,"Abbeduto,Mack",Assistant State's Attorney,A29,States Attorneys Office,05/22/2017 12:00:00 AM,68562.0,61693.59


In [11]:
# One encoder per categorical column so each can be reloaded and applied
# independently at prediction time.
le_jt = LabelEncoder()
le_d = LabelEncoder()

le_jobtitle = le_jt.fit_transform(data['jobtitle'])
le_department = le_d.fit_transform(data['descr'])

# Persist the fitted encoders. `with` guarantees the files are flushed and
# closed even if pickling fails, instead of leaking the open handles.
with open('le_jobtitle.pickle', 'wb') as encoder_file:
    pickle.dump(le_jt, encoder_file)
with open('le_department.pickle', 'wb') as encoder_file:
    pickle.dump(le_d, encoder_file)

# Replace the text columns with their integer codes.
# NOTE: this overwrites the original strings, so re-running this cell without
# re-running the cells above would fit the encoders on the integer codes.
data['jobtitle'] = le_jobtitle
data['descr'] = le_department
data.head()

Unnamed: 0,name,jobtitle,deptid,descr,hire_dt,annual_rt,gross
0,"Aaron,Kareem D",952,A50,11,08/27/2018 12:00:00 AM,32470.0,25743.94
1,"Aaron,Patricia G",353,A03,53,10/24/1979 12:00:00 AM,60200.0,57806.13
2,"Abadir,Adam O",234,A02,5,12/12/2016 12:00:00 AM,64823.0,64774.11
3,"Abaku,Aigbolosimuan O",717,A99,56,04/17/2018 12:00:00 AM,53640.0,59361.55
4,"Abbeduto,Mack",56,A29,65,05/22/2017 12:00:00 AM,68562.0,61693.59


In [12]:
# Derive tenure in whole days, measured up to a fixed reference date.
convert_point = datetime(2020, 1, 1)

# Parse the hire dates once and store the parsed values back on the frame.
hire_dates = pd.to_datetime(data['hire_dt'])
data['hire_dt'] = hire_dates

# Timedelta between the reference date and each hire date, reduced to days.
work_experience = convert_point - hire_dates
data['work_exp'] = work_experience.dt.days

In [13]:
# Features: encoded job title and department, annual rate, and tenure in days.
# Target: gross pay.
X = data[['jobtitle', 'descr', 'annual_rt', 'work_exp']]
y = data.loc[:, 'gross']

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# Small gradient-boosted regressor (10 trees) trained with squared-error loss.
model = xg.XGBRegressor(objective='reg:squarederror', n_estimators=10).fit(X_train, y_train)

# Persist the trained model; `with` closes the file handle deterministically
# instead of leaving it open.
with open("salary-predict.pickle", "wb") as model_file:
    pickle.dump(model, model_file)

##### Проведём тестирование
##### Let's run some tests

In [14]:
# Reload the persisted model and encoders from disk to check that the saved
# artifacts work on their own.
# NOTE: pickle.load can execute arbitrary code; only load files produced by
# this notebook or another trusted source.
with open("salary-predict.pickle", "rb") as model_file:
    model = pickle.load(model_file)
with open('le_jobtitle.pickle', "rb") as encoder_file:
    le_jt = pickle.load(encoder_file)
with open('le_department.pickle', "rb") as encoder_file:
    le_d = pickle.load(encoder_file)

# Encode one sample job title / department with the fitted encoders.
# transform() raises for labels the encoder was not fitted on.
le_jobtitle_feature = le_jt.transform(['Watershed Manager'])
le_department_feature = le_d.transform(['DPW-Water & Waste Water'])

# Remaining numeric features for the sample; work_exp is in days, matching
# the training feature.
annual_rt = 95747.0
work_exp = 9858
print(le_jobtitle_feature[0])
print(le_department_feature[0])
print([[le_jobtitle_feature[0], le_department_feature[0], annual_rt, work_exp]])

# Single-row frame whose column names and order match the training features.
final_data = pd.DataFrame(
    data={
        'jobtitle': le_jobtitle_feature[0],
        'descr': le_department_feature[0],
        'annual_rt': annual_rt,
        'work_exp': work_exp
    },
    index=[0]
)

# predict() expects 2D input; the one-row DataFrame provides it.
result = model.predict(final_data)
print(result)


997
11
[[997, 11, 95747.0, 9858]]
[101473.69]
