In [249]:
import pandas as pd 
import matplotlib.pyplot as plt 
import numpy as np 


In [250]:
from sklearn.model_selection import train_test_split , GridSearchCV
from sklearn.linear_model import Ridge,LinearRegression,Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.metrics import r2_score,mean_squared_error

In [251]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer


In [252]:
df = pd.read_csv('data/eda_data.csv')

In [253]:
df.sample(5)

Unnamed: 0.1,Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Headquarters,Size,Founded,...,age,python_yn,R_yn,spark,aws,excel,job_simp,seniority,desc_len,num_comp
273,273,Data Scientist,$68K-$112K (Glassdoor est.),Must Have: ? 10+ years of experience in Analyt...,3.5,L&T Infotech\n3.5,"San Ramon, CA","Mumbai, India",10000+ employees,1997,...,23,1,0,0,0,1,data scientist,na,1231,3
311,311,Product Engineer – Data Science,$63K-$101K (Glassdoor est.),Overview\n\n\nThe Product Engineer for applied...,3.5,Esri\n3.5,"Arlington, VA","Redlands, CA",1001 to 5000 employees,1969,...,51,1,0,0,0,0,na,na,4196,1
219,219,Lead Data Scientist,$124K-$204K (Glassdoor est.),"Job Description\n\nSince 1851, MassMutuals com...",3.6,MassMutual\n3.6,"Boston, MA","Springfield, MA",5001 to 10000 employees,1851,...,169,1,0,1,1,0,data scientist,senior,4414,0
205,205,Sr. Data Engineer,$75K-$140K (Glassdoor est.),Position Purpose\n\nAs a Senior Data Engineer ...,3.4,Echo Global Logistics\n3.4,"Chicago, IL","Chicago, IL",1001 to 5000 employees,2005,...,15,0,0,0,0,0,data engineer,senior,4288,3
416,416,Director Data Science,$124K-$199K (Glassdoor est.),We’re looking for a director to drive our data...,3.6,TRANZACT\n3.6,"Fort Lee, NJ","Fort Lee, NJ",1001 to 5000 employees,1989,...,31,1,0,0,1,1,director,na,5121,0


In [None]:
# choose relevant columns 
df.columns

In [None]:
df_model.tail()

In [270]:
X =df_model.drop(['num_comp','Rating','employer_provided','same_state','desc_len','avg_salary','hourly','Sector'],axis = 1) 
X_new =df_model.drop(['num_comp','Rating','employer_provided','same_state','desc_len','avg_salary','hourly','Sector'],axis = 1) 

y=  df_model['avg_salary']

In [271]:
X.columns

Index(['Size', 'Type of ownership', 'Industry', 'Revenue', 'job_state', 'age',
       'python_yn', 'spark', 'aws', 'excel', 'job_simp', 'seniority'],
      dtype='object')

In [256]:
num_features = X.select_dtypes(exclude="object").columns
cat_features = X.select_dtypes(include="object").columns

In [257]:
cat_features  # industry and sector are same 

Index(['Size', 'Type of ownership', 'Industry', 'Revenue', 'job_state',
       'job_simp', 'seniority'],
      dtype='object')

In [258]:
num_features

Index(['age', 'python_yn', 'spark', 'aws', 'excel'], dtype='object')

In [274]:
numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, cat_features),
         ("StandardScaler", numeric_transformer, num_features),        
    ]
)


In [276]:
pickle.dump(preprocessor,open("preprocessor.pkl",'wb'))

In [275]:
X = preprocessor.fit(X,y)

In [277]:
with open('preprocessor.pkl', 'rb') as f:
    preprocessing_pipeline = pickle.load(f)

In [278]:
data = preprocessing_pipeline.transform(X_new)

In [282]:
data.shape

(742, 146)

In [283]:
X = data

In [284]:
X_train, X_test, y_train, y_test = train_test_split(X,y ,test_size=0.2,random_state=42)


In [294]:
params_xgb =  {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 500}
xgb = XGBRegressor(**params_xgb)
xgb.fit(X_train,y_train)
y_pred = xgb.predict(X_test)
evaluate_model(y_test,y_pred)



(18.26235421194835, 0.7959551560415641)

In [295]:
pickle.dump(xgb,open('xgb.pkl','wb'))

In [296]:
model = pickle.load(open('xgb.pkl','rb'))
y_pred = model.predict(X_test)
evaluate_model(y_test,y_pred)

(18.26235421194835, 0.7959551560415641)

In [286]:
X_train.shape

(593, 146)

In [None]:
y_train.head()

In [287]:
def evaluate_model(true, predicted):
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return rmse, r2_square

In [289]:
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    'SVR':SVR(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "XGBRegressor": XGBRegressor(), 
    "AdaBoost Regressor": AdaBoostRegressor()
}
model_list = []
r2_list =[]

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Evaluate Train and Test dataset
    model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)

    model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')
    
    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    r2_list.append(model_test_r2)
    
    print('='*35)
    print('\n')

Linear Regression
Model performance for Training set
- Root Mean Squared Error: 21.1184
- R2 Score: 0.6973
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 26.0579
- R2 Score: 0.5846


Lasso
Model performance for Training set
- Root Mean Squared Error: 28.0758
- R2 Score: 0.4650
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 31.0604
- R2 Score: 0.4098


Ridge
Model performance for Training set
- Root Mean Squared Error: 21.5378
- R2 Score: 0.6852
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 25.0408
- R2 Score: 0.6164


SVR
Model performance for Training set
- Root Mean Squared Error: 34.0675
- R2 Score: 0.2123
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 37.1589
- R2 Score: 0.1552


Decision Tree
Model performance for Training set
- Root Mean Squared Error: 3.3030
- R2 Score: 0.9926
--------------------

In [290]:
pd.DataFrame(list(zip(model_list, r2_list)), columns=['Model Name', 'R2_Score']).sort_values(by=["R2_Score"],ascending=False)


Unnamed: 0,Model Name,R2_Score
5,Random Forest Regressor,0.796535
6,XGBRegressor,0.790733
4,Decision Tree,0.704818
2,Ridge,0.616372
0,Linear Regression,0.584577
7,AdaBoost Regressor,0.52617
1,Lasso,0.409763
3,SVR,0.155232


In [None]:
# we can take top 5 models and build a stacking ensemble and check it's accuracty 
from sklearn.ensemble import StackingRegressor


xg boost will be the output layer that is the final estimator  and then input layers will be of random forest ridge decion Tree 

In [291]:
# hyper prameter tuning of xgboost 
param_grid = {
    'max_depth': [3, 5, 7,9],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [100, 500, 1000]
}
xgb = XGBRegressor()
grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)


In [292]:
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
evaluate_model(y_test,y_pred)

(18.26235421194835, 0.7959551560415641)

In [293]:
print("Best Parameters: ", grid_search.best_params_)
print("Best Score: ", grid_search.best_score_)
#changing parameters for xgb parameters 

Best Parameters:  {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 500}
Best Score:  0.651757574699495


In [None]:
rf = RandomForestRegressor()

param_grid = {
    'max_depth': [None, 5, 10,20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4,8,9]
}
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
evaluate_model(y_test,y_pred)

In [None]:
print("Best Parameters: ", grid_search.best_params_)
print("Best Score: ", grid_search.best_score_)
#changing parameters for random forest 
params_randomforest = {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2}

In [None]:
ridge = Ridge()

# Define the hyperparameter grid
param_grid = {
    'alpha': [0.1,0.01,0.001, 1.0, 2,2.5,3,5,10.0],
    'solver': ['auto', 'lsqr', 'sparse_cg']
}

grid_search = GridSearchCV(estimator=ridge, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
evaluate_model(y_test,y_pred)

In [None]:
print("Best Parameters: ", grid_search.best_params_)
print("Best Score: ", grid_search.best_score_)
params_ridge = {'alpha': 3, 'solver': 'lsqr'}

In [None]:
tree = DecisionTreeRegressor()

param_grid = {
    'max_depth': [None, 5, 10,20],
    'min_samples_split': [2, 5, 10,20,25],
    'min_samples_leaf': [1, 2, 4,6,8,9],
    'max_features': ['auto', 'sqrt', 'log2']
}

grid_search = GridSearchCV(estimator=tree, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
evaluate_model(y_test,y_pred)

In [None]:
print("Best Parameters: ", grid_search.best_params_)
print("Best Score: ", grid_search.best_score_)
params_decisiontree = {'max_depth': 20, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 2}

In [None]:
params_dt = {'max_depth': 20, 'max_features': 1, 'min_samples_leaf': 1, 'min_samples_split': 2}
params_xgb=  {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2,'max_features':1}
params_rdg = {'alpha': 3, 'solver': 'lsqr'}
params_rndt = {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2}

In [None]:
sr_tuple = [
    ('decisiontree',DecisionTreeRegressor(**params_dt)),
    ('ridge',Ridge(**params_rdg)),
    ('randomforest',RandomForestRegressor(**params_rndt)),
] 
final_estimator = XGBRegressor(**params_xgb)

In [None]:
SR = StackingRegressor(sr_tuple,final_estimator=final_estimator)

In [None]:
SR.fit(X_train,y_train).score(X_test,y_test)

In [None]:
import pickle

In [None]:
xgb = XGBRegressor(**params_xgb)
xgb.fit(X_train,y_train)
y_pred = xgb.predict(X_test)
evaluate_model(y_test,y_pred)

pickle.dump(xgb,open('xgb.pickle','wb'))

In [None]:
m = open('xgb.pickle','rb')
model = pickle.load(m)

In [None]:
y_pred = model.predict(X_test)
evaluate_model(y_test,y_pred)

In [None]:
# done 