In [3]:
# Import our dependencies
%matplotlib inline
from matplotlib import pyplot as plt
from sklearn.datasets import make_classification
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.metrics import classification_report
#import tensorflow as tf
#import seaborn as sns

In [4]:
# Read data from csv
df = pd.read_csv('clean_job.csv')
df.head()

Unnamed: 0,Job Title,Salary Estimate,Rating,Company Name,Location,Headquarters,Size,Type of ownership,Industry,Sector,Revenue
0,"Data Analyst, Center on Immigration and Justic...",$37K-$66K (Glassdoor est.),3.2,Vera Institute of Justice\n3.2,"New York, NY","New York, NY",201 to 500 employees,Nonprofit Organization,Social Assistance,Non-Profit,$100 to $500 million (USD)
1,Quality Data Analyst,$37K-$66K (Glassdoor est.),3.8,Visiting Nurse Service of New York\n3.8,"New York, NY","New York, NY",10000+ employees,Nonprofit Organization,Health Care Services & Hospitals,Health Care,$2 to $5 billion (USD)
2,"Senior Data Analyst, Insights & Analytics Team...",$37K-$66K (Glassdoor est.),3.4,Squarespace\n3.4,"New York, NY","New York, NY",1001 to 5000 employees,Company - Private,Internet,Information Technology,Unknown / Non-Applicable
3,Data Analyst,$37K-$66K (Glassdoor est.),4.1,Celerity\n4.1,"New York, NY","McLean, VA",201 to 500 employees,Subsidiary or Business Segment,IT Services,Information Technology,$50 to $100 million (USD)
4,Reporting Data Analyst,$37K-$66K (Glassdoor est.),3.9,FanDuel\n3.9,"New York, NY","New York, NY",501 to 1000 employees,Company - Private,Sports & Recreation,"Arts, Entertainment & Recreation",$100 to $500 million (USD)


In [5]:
df.shape

(1894, 11)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1894 entries, 0 to 1893
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Job Title          1894 non-null   object 
 1   Salary Estimate    1894 non-null   object 
 2   Rating             1894 non-null   float64
 3   Company Name       1894 non-null   object 
 4   Location           1894 non-null   object 
 5   Headquarters       1894 non-null   object 
 6   Size               1894 non-null   object 
 7   Type of ownership  1894 non-null   object 
 8   Industry           1894 non-null   object 
 9   Sector             1894 non-null   object 
 10  Revenue            1894 non-null   object 
dtypes: float64(1), object(10)
memory usage: 162.9+ KB


In [7]:
df.dtypes

Job Title             object
Salary Estimate       object
Rating               float64
Company Name          object
Location              object
Headquarters          object
Size                  object
Type of ownership     object
Industry              object
Sector                object
Revenue               object
dtype: object

In [8]:
df.keys()

Index(['Job Title', 'Salary Estimate', 'Rating', 'Company Name', 'Location',
       'Headquarters', 'Size', 'Type of ownership', 'Industry', 'Sector',
       'Revenue'],
      dtype='object')

In [9]:
#seperated[["Salary Estimate","Company", "est"]] = df["Salary Estimate"].str.split(' ', expand=True)
#df = df["Salary Estimate"].str.split(' ', expand=True)

In [10]:
df.keys()

Index(['Job Title', 'Salary Estimate', 'Rating', 'Company Name', 'Location',
       'Headquarters', 'Size', 'Type of ownership', 'Industry', 'Sector',
       'Revenue'],
      dtype='object')

In [11]:
# Salary parsing

df['hourly'] = df['Salary Estimate'].apply(lambda x: 1 if 'per hour' in x.lower() else 0 )
df['employer_prodived'] = df['Salary Estimate'].apply(lambda x: 1 if 'employer provided salary' in x.lower() else 0 )

df = df[df['Salary Estimate'] !='-1']
salary = df['Salary Estimate'].apply(lambda x: x.split('(')[0])
minus_Kd = salary.apply(lambda x: x.replace('K', '').replace('$', ''))

minus_hr = minus_Kd.apply(lambda x: x.lower().replace('per hour', '').replace('employer provided salary', '').replace(':', ''))

df['min_salary'] = minus_hr.apply(lambda x: int(x.split('-')[0]))
df['max_salary'] = minus_hr.apply(lambda x: int(x.split('-')[1]))
df['avg_salary'] = (df.min_salary + df.max_salary)/2

In [12]:
# at bryon, trying to use Qcut and binning.all()g to define the rande 
pd.qcut(df.avg_salary, q=6).value_counts()

(68.5, 78.0]      328
(78.0, 92.5]      323
(51.5, 60.5]      322
(33.499, 51.5]    318
(60.5, 68.5]      308
(92.5, 150.0]     295
Name: avg_salary, dtype: int64

In [13]:
df.dropna(inplace=True)

In [14]:
df.keys()

Index(['Job Title', 'Salary Estimate', 'Rating', 'Company Name', 'Location',
       'Headquarters', 'Size', 'Type of ownership', 'Industry', 'Sector',
       'Revenue', 'hourly', 'employer_prodived', 'min_salary', 'max_salary',
       'avg_salary'],
      dtype='object')

In [15]:
df_model = df[['avg_salary','Rating','Location','Size','Type of ownership','Industry','Sector','Revenue']]

df_model

Unnamed: 0,avg_salary,Rating,Location,Size,Type of ownership,Industry,Sector,Revenue
0,51.5,3.2,"New York, NY",201 to 500 employees,Nonprofit Organization,Social Assistance,Non-Profit,$100 to $500 million (USD)
1,51.5,3.8,"New York, NY",10000+ employees,Nonprofit Organization,Health Care Services & Hospitals,Health Care,$2 to $5 billion (USD)
2,51.5,3.4,"New York, NY",1001 to 5000 employees,Company - Private,Internet,Information Technology,Unknown / Non-Applicable
3,51.5,4.1,"New York, NY",201 to 500 employees,Subsidiary or Business Segment,IT Services,Information Technology,$50 to $100 million (USD)
4,51.5,3.9,"New York, NY",501 to 1000 employees,Company - Private,Sports & Recreation,"Arts, Entertainment & Recreation",$100 to $500 million (USD)
...,...,...,...,...,...,...,...,...
1889,91.0,4.1,"Broomfield, CO",51 to 200 employees,Company - Private,Computer Hardware & Software,Information Technology,$25 to $50 million (USD)
1890,91.0,2.5,"Denver, CO",51 to 200 employees,Company - Private,Staffing & Outsourcing,Business Services,Unknown / Non-Applicable
1891,91.0,2.9,"Centennial, CO",10000+ employees,Company - Public,Wholesale,Business Services,$10+ billion (USD)
1892,91.0,3.1,"Centennial, CO",201 to 500 employees,Company - Private,Enterprise Software & Network Solutions,Information Technology,$25 to $50 million (USD)


In [16]:
df_dum=pd.get_dummies(df_model)
X = df_dum.drop('avg_salary', axis=1)
y = df_dum.avg_salary.values

In [17]:
# salary ranges with text removed
#y = df['avg_salary']
y

array([51.5, 51.5, 51.5, ..., 91. , 91. , 91. ])

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=37)

In [20]:
# Train a Random Forest Classifier model (changed to Random ForestRegressor for more accurancy) on the scaled data and print the model score
clf = RandomForestRegressor()

In [21]:
np.mean(cross_val_score(clf,X_train,y_train,scoring = 'neg_mean_absolute_error', cv= 3))

-15.849954614710555

In [22]:
#tune model GridsearchCV
parameters = {'n_estimators':range(100,200,100), 'criterion':('squared_error','absolute_error'), 'max_features':('auto','sqrt','log2')}

gs = GridSearchCV(clf,parameters,scoring='neg_mean_absolute_error',cv=3)
gs.fit(X_train.values,y_train)

gs.best_score_
gs.best_estimator_

RandomForestRegressor(criterion='absolute_error', max_features='sqrt')

In [22]:
# find mean_absolute_error
clf_pred = gs.best_estimator_.predict(X_test.values)

from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_test,clf_pred)


15.929663588390502

In [23]:
# Seralization of the model
import pickle
pickled = {'model': gs.best_estimator_}
pickle.dump(pickled, open('model_clf'+'.p', 'wb'))

In [24]:
# The following is for prediction
gs.best_estimator_.predict(np.array(list(X_test.iloc[1,:])).reshape(1,-1))[0]
list(X_test.iloc[1,:])

[4.1,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0

In [25]:
fname = 'model_clf.p'
with open(fname, 'rb') as pickled:
    data = pickle.load(pickled)
    model= data['model']

model.predict(np.array(list(X_test.iloc[17,:])).reshape(1,-1))[0]

75.3075

In [26]:
print(y_test[17])

63.0


In [27]:
import flask
import json
from flask import Flask, jsonify, request
import numpy as np
import pickle

def load_model():
    fname = 'model_clf.p'
    with open(fname, 'rb') as pickled:
    data = pickle.load(pickled)
    model= data['model']
return model

app=Flask(__name_)

@app.route('/salary', method=['GET'])

def predict_salary():
    request_json = request.get_json()
    x = request_json['input']
    x_in = np.array(x).reshape(1,-1)
    
    model = load_model()
    predicted_salary = model.predict(x_in)[0]
    response = json.dumps({'response': predicted_salaray})
    return response, 200

if __name__=="__main__":
    app.run(debug=True)

IndentationError: expected an indented block (1477315679.py, line 10)