In [156]:
import pickle
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Lasso
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import time
import numpy as np
from sklearn.metrics import make_scorer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Ridge
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor

In [3]:
# Load the cleaned job postings frame. The context manager closes the file
# handle once the frame has been read.
# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# pickles that come from a trusted source.
with open("jobs_df_cleaned.pkl", "rb") as jobs_file:
    df = pickle.load(jobs_file)

In [168]:
# Raise the row-display cap to 500 before rendering the frame.
pd.options.display.max_rows = 500
df

Unnamed: 0,salary,description,company_size,company_type,industry,revenue,company_rating,recommend_to_a_friend,ceo_approval,interview_difficulty,location_bin
0,144000,TikTok is the leading destination for short-fo...,1001 to 5000 employees,Private,Internet,,4.1,79.0,100.0,2.7,CA
1,107000,THE ROLE\n\nTesla's mission is to accelerate t...,10000+ employees,Public,Transportation Equipment Manufacturing,$2 to $5 billion (CAD) per year,3.5,59.0,75.0,2.9,CA
2,150000,Facebook's mission is to give people the power...,10000+ employees,Public,Internet,$5 to $10 billion (CAD) per year,4.4,90.0,94.0,3.1,CA
3,130000,Job Description\n\nThe New York Times is commi...,1001 to 5000 employees,Public,News Outlets,$1 to $2 billion (CAD) per year,3.8,80.0,95.0,2.9,NY
4,66225,About Cerebri AI Cerebri AI CVX platform uses ...,1 to 50 employees,Private,Enterprise Software & Network Solutions,,3.9,75.0,80.0,2.4,ON
5,102000,Description\n\nSHIFT: Day Job\n\nSCHEDULE: Ful...,10000+ employees,Public,Insurance Operators,$10+ billion (CAD) per year,3.4,63.0,66.0,2.7,GA
6,116000,Job Overview\n\nJob Summary\n\nThe Research Da...,10000+ employees,College / University,Colleges & Universities,$50 to $100 million (CAD) per year,4.2,82.0,89.0,2.8,CA
7,63600,"Location:\n1 Presidents Choice Circle, Brampto...",10000+ employees,Public,Vehicle Dealers,$10+ billion (CAD) per year,3.5,69.0,80.0,2.2,ON
8,66000,Job Summary\n\nThe UBC Data Science Institute ...,10000+ employees,College / University,Colleges & Universities,$2 to $5 billion (CAD) per year,4.2,87.0,82.0,2.7,BC
9,85000,Job Number: R0091474\n\nData Scientist\n\nThe ...,10000+ employees,Public,Consulting,$5 to $10 billion (CAD) per year,3.9,76.0,93.0,2.7,VA


In [4]:
def scoring_metric_(y_true, y_predicted):
    """Asymmetric regression loss: mean of squared error plus under-prediction.

    For every observation the loss is ``(y_true - y_predicted) ** 2`` plus the
    residual itself whenever the prediction is below the true value (positive
    residual). Over-predictions contribute only the squared term.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_predicted : array-like
        Predicted target values, in the same order as ``y_true``.

    Returns
    -------
    float
        Average per-observation loss (lower is better).
    """
    # Coerce both inputs to plain float arrays so the comparison is positional:
    # lists/tuples work, and two pandas Series with different indexes are not
    # label-aligned into NaNs.
    residual = np.asarray(y_true, dtype=float) - np.asarray(y_predicted, dtype=float)

    squared_error = residual ** 2

    # Extra penalty only where the model predicted too low (residual > 0).
    under_prediction = np.maximum(residual, 0)

    return np.average(squared_error + under_prediction)

In [5]:
# Wrap the custom loss as a scikit-learn scorer. With greater_is_better=False
# the scorer reports the negated loss, which is why the best_score_ values
# shown later in this notebook are negative.
scoring_metric = make_scorer(scoring_metric_, greater_is_better = False)

In [6]:
# Split the frame positionally: the first column is the target and every
# remaining column is a feature (the frame displayed above has `salary` first).
X = df.iloc[:, 1:]
Y = df.iloc[:, 0]

# Free-text column. It is given as a scalar column name (not a list) so the
# ColumnTransformer hands the vectorizer a 1-D column of strings.
# The token pattern keeps '+', '-' and '.' inside tokens.
text_columns = 'description'
text_pipe = Pipeline(steps = [
    ("tfidf", TfidfVectorizer(token_pattern = r"[a-zA-Z0-9\+\-\.]+")),
])

# Numeric company ratings; the imputation strategy is tuned in the grid below.
continuous_columns = ['company_rating', 'recommend_to_a_friend', 'ceo_approval', 'interview_difficulty']
continuous_pipe = Pipeline(steps = [("impute_continuous", SimpleImputer())])

# Categorical company attributes: fill gaps with the mode, then one-hot encode.
# handle_unknown='ignore' keeps categories unseen during fit from raising.
categorical_columns = ['company_size', 'company_type', 'industry', 'revenue', 'location_bin']
categorical_pipe = Pipeline(steps = [
    ("impute_categorical", SimpleImputer(strategy = 'most_frequent')),
    ('one_hot', OneHotEncoder(handle_unknown = 'ignore')),
])

lasso_preprocess = ColumnTransformer(
    [
        ("text", text_pipe, text_columns),
        ("num", continuous_pipe, continuous_columns),
        ("cat", categorical_pipe, categorical_columns),
    ],
    remainder = 'passthrough',
)

# NOTE(review): the `normalize` argument was removed from scikit-learn's linear
# models in release 1.2, so this cell needs an older scikit-learn — confirm the
# pinned version.
lasso_pipe = Pipeline(steps = [
    ("lasso_preprocess", lasso_preprocess),
    ("lasso", Lasso(normalize = True, warm_start = True)),
])

lasso_parameters = {
    'lasso_preprocess__text__tfidf__ngram_range' : [(1,2), (1,3)],
    'lasso_preprocess__text__tfidf__min_df' : [0.0, 0.05, 0.10],
    'lasso_preprocess__text__tfidf__max_df' : [0.80, 0.85, 0.90],
    'lasso_preprocess__num__impute_continuous__strategy': ['mean', 'median'],
    'lasso_preprocess__num__impute_continuous__add_indicator' : [True, False],
    'lasso_preprocess__cat__impute_categorical__add_indicator' : [True, False],
    "lasso__alpha" : [1.0, 2.0, 4.0],
}

# Build (but do not fit) the search here: later cells call lasso_gs.fit, pickle
# lasso_gs and read lasso_gs.best_score_, so the name has to exist.
lasso_gs = GridSearchCV(lasso_pipe, lasso_parameters, scoring = scoring_metric, return_train_score = True, n_jobs = -1, verbose = 10)

In [149]:
# Ridge counterpart of the Lasso search: same three preprocessing branches,
# different final estimator and a wider alpha grid.
ridge_preprocess = ColumnTransformer(
    [
        ("text", text_pipe, text_columns),
        ("num", continuous_pipe, continuous_columns),
        ("cat", categorical_pipe, categorical_columns),
    ],
    remainder = 'passthrough',
)
# NOTE(review): the `normalize` argument was removed from scikit-learn's linear
# models in release 1.2 — confirm the pinned version.
ridge_pipe = Pipeline(steps = [
    ("ridge_preprocess", ridge_preprocess),
    ("ridge", Ridge(normalize = True)),
])
ridge_parameters = {
    'ridge_preprocess__text__tfidf__ngram_range' : [(1,2), (1,3)],
    'ridge_preprocess__text__tfidf__min_df' : [0.0, 0.05, 0.10],
    'ridge_preprocess__text__tfidf__max_df' : [0.80, 0.85, 0.90],
    'ridge_preprocess__num__impute_continuous__strategy': ['mean', 'median'],
    'ridge_preprocess__num__impute_continuous__add_indicator' : [True, False],
    'ridge_preprocess__cat__impute_categorical__add_indicator' : [True, False],
    "ridge__alpha" : [1.0, 2.0, 4.0, 8.0],
}
# Reuse the notebook-wide scorer rather than rebuilding an identical one.
ridge_gs = GridSearchCV(ridge_pipe, ridge_parameters, scoring = scoring_metric, return_train_score = True, n_jobs = -1, verbose = 10)

ridge_gs.fit(X, Y)















Fitting 5 folds for each of 576 candidates, totalling 2880 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    3.0s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    4.5s
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    5.3s
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:    6.4s
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:    7.4s
[Parallel(n_jobs=-1)]: Done  61 tasks      | elapsed:    8.6s
[Parallel(n_jobs=-1)]: Done  74 tasks      | elapsed:    9.9s
[Parallel(n_jobs=-1)]: Done  89 tasks      | elapsed:   11.4s
[Parallel(n_jobs=-1)]: Done 104 tasks      | elapsed:   12.8s
[Parallel(n_jobs=-1)]: Done 121 tasks      | elapsed:   14.3s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:   16.0s
[Parallel(n_jobs=-1)]: Done 157 tasks      | elapsed:   17.9s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:   19.6s
[Parallel(n_jobs=-1)]: Done 197 tasks      | elapsed:  

GridSearchCV(estimator=Pipeline(steps=[('ridge_preprocess',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('text',
                                                                         Pipeline(steps=[('tfidf',
                                                                                          TfidfVectorizer(token_pattern='[a-zA-Z0-9\\+\\-\\.]+'))]),
                                                                         'description'),
                                                                        ('num',
                                                                         Pipeline(steps=[('impute_continuous',
                                                                                          SimpleImputer())]),
                                                                         ['company_rating',
                                              

In [150]:
ridge_gs.best_score_

-632986257.9196513

In [None]:
lasso_gs.fit(X, Y)

In [None]:
# Persist the fitted search; the context manager flushes and closes the file.
with open("lasso_gs.pkl", "wb") as lasso_file:
    pickle.dump(lasso_gs, lasso_file)

In [None]:
lasso_gs.cv_results_

In [None]:
# Constant baseline: one copy of the mean salary per row. The length is taken
# from the frame instead of a hard-coded row count.
arr = np.full((len(df),), df['salary'].mean())

In [None]:
# Evaluate the custom loss for the constant array `arr` against the observed
# salaries, then display it.
score = scoring_metric_(df['salary'], arr)
score

In [None]:
lasso_gs.best_score_

In [None]:
# Hold out 20% of the rows. The fixed seed keeps the split, and everything
# derived from it, reproducible across kernel restarts.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 42)
#best_lasso_pipe = lasso_gs.best_estimator_

# Preprocessing with hand-set hyper-parameters for the Lasso model.
text_pipe_best_lasso = Pipeline(steps = [("tfidf", TfidfVectorizer(token_pattern =  r"[a-zA-Z0-9\+\-\.]+", ngram_range = (1,3), min_df = 0.05, max_df = 0.9))])
continuous_pipe_best_lasso = Pipeline(steps = [("impute_continuous", SimpleImputer(strategy = 'median', add_indicator = True ))])
categorical_pipe_best_lasso = Pipeline(steps = [("impute_categorical", SimpleImputer(strategy = 'most_frequent', add_indicator = True)), ('one_hot', OneHotEncoder(handle_unknown = 'ignore'))])
# sparse_threshold = 0 requests a dense output matrix from the transformer.
best_lasso_preprocess = ColumnTransformer([("text", text_pipe_best_lasso, text_columns), ("num", continuous_pipe_best_lasso, continuous_columns), ("cat", categorical_pipe_best_lasso, categorical_columns)], remainder = 'passthrough', sparse_threshold = 0)

#X_train_processed = best_lasso_preprocess.fit_transform(X_train)
#X_test_processed = best_lasso_preprocess.transform(X_test)

#lasso_residual_plot = ResidualsPlot(model = Lasso(normalize = True, alpha = 4.0, max_iter = 2000), hist=False, qqplot = True)
#lasso_residual_plot.fit(X_train_processed, Y_train)
#lasso_residual_plot.score(X_test_processed, Y_test)
#lasso_residual_plot.show()

# Fit the Lasso on the training split and predict the hold-out rows. The cells
# further down read `best_lasso_predictions`, so it must be assigned here. The
# pipeline uses `best_lasso_preprocess` (the hand-set configuration above), not
# the untuned `lasso_preprocess`.
# NOTE(review): the `normalize` argument was removed from scikit-learn's linear
# models in release 1.2 — confirm the pinned version.
best_lasso_pipe = Pipeline(steps = [("best_lasso_preprocess", best_lasso_preprocess), ("lasso", Lasso(normalize = True, alpha = 4.0, max_iter = 2000))])

best_lasso_pipe.fit(X_train, Y_train)
best_lasso_predictions = best_lasso_pipe.predict(X_test)

In [None]:
# NOTE(review): X_processed is assigned in the cell that follows this one, so
# this cell fails on a fresh kernel unless that cell has been run first.
X_processed.shape

In [None]:
# Fit the hand-tuned preprocessing on the full feature frame and keep the
# resulting design matrix for the influence diagnostic below.
X_processed = best_lasso_preprocess.fit_transform(X)
# NOTE(review): CooksDistance is not imported in this notebook's import cell —
# presumably yellowbrick.regressor.CooksDistance; confirm and add the import.
cd = CooksDistance()
cd.fit(X_processed, Y)
cd.show()

In [None]:
X_processed

In [None]:
best_lasso_predictions

In [None]:
# Hold-out residuals: observed target minus predicted target.
residuals = Y_test - best_lasso_predictions

In [None]:
# Residuals-vs-predictions scatter.
# NOTE(review): `plt` is not imported in this notebook's import cell; the
# `plt.pyplot` access suggests `import matplotlib as plt` was intended — confirm.
plt.pyplot.scatter(best_lasso_predictions, residuals)

In [153]:
# Column groups routed to the three preprocessing branches.
text_columns = 'description'
continuous_columns = ['company_rating', 'recommend_to_a_friend', 'ceo_approval', 'interview_difficulty']
categorical_columns = ['company_size', 'company_type', 'industry', 'revenue', 'location_bin']

# Observed (non-missing) levels of every categorical column, in column order.
cats_for_cat_vars = [list(df[column].dropna().unique()) for column in categorical_columns]
size_cats, type_cats, industry_cats, revenue_cats, location_cats = cats_for_cat_vars

# One small pipeline per branch.
text_pipe_rf = Pipeline(steps = [
    ("tfidf", TfidfVectorizer(token_pattern = r"[a-zA-Z0-9\+\-\.]+")),
])
continuous_pipe_rf = Pipeline(steps = [
    ("impute_continuous", SimpleImputer()),
])
categorical_pipe_rf = Pipeline(steps = [
    ("impute_categorical", SimpleImputer(strategy = 'most_frequent')),
    ('one_hot', OneHotEncoder(handle_unknown = 'ignore')),
])

rf_preprocess = ColumnTransformer(
    [
        ("text", text_pipe_rf, text_columns),
        ("num", continuous_pipe_rf, continuous_columns),
        ("cat", categorical_pipe_rf, categorical_columns),
    ],
    remainder = 'passthrough',
)

# Preprocessing followed by a 500-tree random forest.
rf_pipe = Pipeline(steps = [
    ("rf_preprocess", rf_preprocess),
    ("rf", RandomForestRegressor(n_estimators = 500, oob_score = True, n_jobs = -1)),
])

# Search space: text-vectorizer settings, numeric imputation strategy and the
# number of features considered per split.
rf_parameters = {
    'rf_preprocess__text__tfidf__ngram_range' : [(1,2), (1,3)],
    'rf_preprocess__text__tfidf__min_df' : [0.0, 0.05, 0.10],
    'rf_preprocess__text__tfidf__max_df' : [0.80, 0.85, 0.90],
    'rf_preprocess__num__impute_continuous__strategy': ['mean', 'median'],
    'rf_preprocess__num__impute_continuous__add_indicator' : [True],
    'rf_preprocess__cat__impute_categorical__add_indicator' : [True],
    'rf__max_features': [175, 350, 700, 1400],
}

rf_gs = GridSearchCV(
    rf_pipe,
    rf_parameters,
    scoring = scoring_metric,
    return_train_score = True,
    n_jobs = -1,
    verbose = 10,
)










In [None]:
rf_gs.fit(X, Y)

In [None]:
rf_gs.best_estimator_

In [None]:
rf_gs.best_estimator_

In [None]:
rf_gs.best_score_

In [None]:
rf_gs.cv_results_

In [154]:
# Extra-trees variant that shares the random-forest preprocessing branch
# (hence the "rf_preprocess" step name and parameter prefixes).
et_pipe = Pipeline(steps = [
    ("rf_preprocess", rf_preprocess),
    ("et", ExtraTreesRegressor(n_estimators = 500, bootstrap = True, oob_score = True, n_jobs = -1)),
])
et_parameters = {
    'rf_preprocess__text__tfidf__ngram_range' : [(1,2), (1,3)],
    'rf_preprocess__text__tfidf__min_df' : [0.0, 0.05, 0.10],
    'rf_preprocess__text__tfidf__max_df' : [0.80, 0.85, 0.90],
    'rf_preprocess__num__impute_continuous__strategy': ['mean', 'median'],
    'rf_preprocess__num__impute_continuous__add_indicator' : [True],
    'rf_preprocess__cat__impute_categorical__add_indicator' : [True],
    'et__max_features': [175, 350, 700, 1400],
}
# Reuse the notebook-wide scorer rather than rebuilding an identical one.
et_gs = GridSearchCV(et_pipe, et_parameters, scoring = scoring_metric, return_train_score = True, n_jobs = -1, verbose = 10)
et_gs.fit(X, Y)


Fitting 5 folds for each of 144 candidates, totalling 720 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    9.3s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:   12.2s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   17.1s
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   21.7s
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:   30.6s
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:   37.0s
[Parallel(n_jobs=-1)]: Done  61 tasks      | elapsed:   44.0s
[Parallel(n_jobs=-1)]: Done  74 tasks      | elapsed:   51.7s
[Parallel(n_jobs=-1)]: Done  89 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 104 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 121 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 157 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 197 tasks      | elapsed:  

GridSearchCV(estimator=Pipeline(steps=[('rf_preprocess',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('text',
                                                                         Pipeline(steps=[('tfidf',
                                                                                          TfidfVectorizer(token_pattern='[a-zA-Z0-9\\+\\-\\.]+'))]),
                                                                         'description'),
                                                                        ('num',
                                                                         Pipeline(steps=[('impute_continuous',
                                                                                          SimpleImputer())]),
                                                                         ['company_rating',
                                                 

In [155]:
et_gs.best_score_

-420524545.37193096

In [157]:
# Gradient-boosting variant that shares the random-forest preprocessing branch
# (hence the "rf_preprocess" step name and parameter prefixes).
gb_pipe = Pipeline(steps = [
    ("rf_preprocess", rf_preprocess),
    ("gb", GradientBoostingRegressor()),
])
gb_parameters = {
    'rf_preprocess__text__tfidf__ngram_range' : [(1,2), (1,3)],
    'rf_preprocess__text__tfidf__min_df' : [0.0, 0.05, 0.10],
    'rf_preprocess__text__tfidf__max_df' : [0.80, 0.85, 0.90],
    'rf_preprocess__num__impute_continuous__strategy': ['median'],
    'rf_preprocess__num__impute_continuous__add_indicator' : [True],
    'rf_preprocess__cat__impute_categorical__add_indicator' : [True],
    'gb__learning_rate': [0.1, 0.05, 0.025],
    'gb__n_estimators' : [100, 200, 400],
    'gb__max_features' : [None, 'sqrt'],
    'gb__max_depth' : [1,2,3],
}
# Reuse the notebook-wide scorer rather than rebuilding an identical one.
gb_gs = GridSearchCV(gb_pipe, gb_parameters, scoring = scoring_metric, return_train_score = True, n_jobs = -1, verbose = 10)
gb_gs.fit(X, Y)

Fitting 5 folds for each of 972 candidates, totalling 4860 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    4.0s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    6.0s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    7.4s
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    8.9s
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:   11.5s
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:   13.4s
[Parallel(n_jobs=-1)]: Done  61 tasks      | elapsed:   16.9s
[Parallel(n_jobs=-1)]: Done  74 tasks      | elapsed:   19.0s
[Parallel(n_jobs=-1)]: Done  89 tasks      | elapsed:   22.2s
[Parallel(n_jobs=-1)]: Done 104 tasks      | elapsed:   27.3s
[Parallel(n_jobs=-1)]: Done 121 tasks      | elapsed:   33.2s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:   37.4s
[Parallel(n_jobs=-1)]: Done 157 tasks      | elapsed:   44.8s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:   49.8s
[Parallel(n_jobs=-1)]: Done 197 tasks      | elapsed:  

GridSearchCV(estimator=Pipeline(steps=[('rf_preprocess',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('text',
                                                                         Pipeline(steps=[('tfidf',
                                                                                          TfidfVectorizer(token_pattern='[a-zA-Z0-9\\+\\-\\.]+'))]),
                                                                         'description'),
                                                                        ('num',
                                                                         Pipeline(steps=[('impute_continuous',
                                                                                          SimpleImputer())]),
                                                                         ['company_rating',
                                                 

In [159]:
gb_gs.best_score_

-390555067.43253165

In [161]:
gb_gs.best_params_

{'gb__learning_rate': 0.1,
 'gb__max_depth': 1,
 'gb__max_features': None,
 'gb__n_estimators': 200,
 'rf_preprocess__cat__impute_categorical__add_indicator': True,
 'rf_preprocess__num__impute_continuous__add_indicator': True,
 'rf_preprocess__num__impute_continuous__strategy': 'median',
 'rf_preprocess__text__tfidf__max_df': 0.85,
 'rf_preprocess__text__tfidf__min_df': 0.1,
 'rf_preprocess__text__tfidf__ngram_range': (1, 3)}

In [162]:
gb_gs.cv_results_

{'mean_fit_time': array([ 2.74525433,  5.92115731,  0.79287896,  0.98755779,  0.62472825,
         0.85251832,  2.6113904 ,  6.20207639,  0.74341054,  1.02485757,
         0.68177576,  0.8981966 ,  2.97304506,  6.2251771 ,  0.77472758,
         0.94387374,  0.63869071,  0.83098302,  4.68946662, 10.81214523,
         1.21897531,  1.46467214,  1.00600624,  1.22088079,  4.6879128 ,
        10.89780736,  1.24928832,  1.46941051,  0.99949222,  1.18572574,
         4.70933819, 10.85712547,  1.2521987 ,  1.46931472,  1.02210436,
         1.20624471,  8.81419892, 20.3124999 ,  2.1478477 ,  2.46974325,
         1.67167559,  1.91227589,  8.78211689, 21.2258389 ,  2.14912806,
         2.49367728,  1.78645325,  2.1204114 ,  9.50795755, 22.39375019,
         2.26687255,  2.60246286,  1.80105486,  2.02647538,  0.67329006,
         1.40782938,  0.44701328,  0.58363829,  0.3560473 ,  0.55556359,
         0.68583689,  1.49637151,  0.47865143,  0.59309883,  0.3624681 ,
         0.56748209,  0.70280819, 

In [167]:
# Persist the refit best pipeline; the context manager flushes and closes the
# file.
with open("gb_pipeline.pkl", "wb") as gb_file:
    pickle.dump(gb_gs.best_estimator_, gb_file)

In [None]:
# Second random-forest search: 1000 trees and a narrowed grid that mainly
# varies the TF-IDF max_df cut-off and max_features.
best_text_pipe_rf = Pipeline(steps = [
    ("tfidf", TfidfVectorizer(token_pattern = r"[a-zA-Z0-9\+\-\.]+")),
])
best_continuous_pipe_rf = Pipeline(steps = [
    ("impute_continuous", SimpleImputer()),
])
best_categorical_pipe_rf = Pipeline(steps = [
    ("impute_categorical", SimpleImputer(strategy = 'most_frequent')),
    ('one_hot', OneHotEncoder(handle_unknown = 'ignore')),
])

best_rf_preprocess = ColumnTransformer(
    [
        ("text", best_text_pipe_rf, text_columns),
        ("num", best_continuous_pipe_rf, continuous_columns),
        ("cat", best_categorical_pipe_rf, categorical_columns),
    ],
    remainder = 'passthrough',
)

best_rf_pipe = Pipeline(steps = [
    ("best_rf_preprocess", best_rf_preprocess),
    ("rf", RandomForestRegressor(n_estimators = 1000, oob_score = True, n_jobs = -1)),
])

best_rf_parameters = {
    'best_rf_preprocess__text__tfidf__ngram_range' : [(1,3)],
    'best_rf_preprocess__text__tfidf__min_df' : [0.05],
    'best_rf_preprocess__text__tfidf__max_df' : [0.10, 0.15, 0.20, 0.25, 0.80],
    'best_rf_preprocess__num__impute_continuous__strategy': ['median'],
    'best_rf_preprocess__num__impute_continuous__add_indicator' : [True],
    'best_rf_preprocess__cat__impute_categorical__add_indicator' : [True],
    'rf__max_features': [350, 700, 1400],
}

best_rf_gs = GridSearchCV(
    best_rf_pipe,
    best_rf_parameters,
    scoring = scoring_metric,
    return_train_score = True,
    n_jobs = -1,
    verbose = 10,
)


In [None]:
best_rf_gs.fit(X, Y)

In [None]:
best_rf_gs.best_estimator_

In [None]:
best_rf_gs.best_score_

In [None]:
# Experiment: swap SimpleImputer for IterativeImputer in the random-forest
# preprocessing ("II_" prefix).
II_text_pipe_rf =Pipeline(steps = [("tfidf", TfidfVectorizer(token_pattern =  r"[a-zA-Z0-9\+\-\.]+"))])


# NOTE(review): `min_value` and `max_value` are not assigned anywhere in the
# visible notebook, so this line raises NameError on a fresh kernel — define
# the intended bounds first.
II_continuous_pipe_rf =Pipeline(steps = [("impute_continuous", IterativeImputer(add_indicator = True, min_value = min_value, max_value = max_value))])

# NOTE(review): this imputer would receive the raw categorical columns, which
# hold strings in the frame displayed above; a later scratch cell shows
# IterativeImputer failing with "could not convert string to float" —
# presumably this needs encoding before imputation; confirm.
II_categorical_pipe_rf =Pipeline(steps = [("impute_categorical", IterativeImputer(add_indicator = True, initial_strategy = 'most_frequent')), ('one_hot', OneHotEncoder( handle_unknown = 'ignore'))])


# NOTE(review): the transformer below is assembled from the best_* pipelines,
# not from the II_* pipelines defined in this cell, so the IterativeImputer
# variants above are never used — looks like a copy-paste slip; confirm.
II_rf_preprocess= ColumnTransformer([("text", best_text_pipe_rf, text_columns), ("num", best_continuous_pipe_rf, continuous_columns), ("cat", best_categorical_pipe_rf, categorical_columns)], remainder = 'passthrough')


#best_rf_pipe = Pipeline(steps = [("best_rf_preprocess", best_rf_preprocess), ("rf", RandomForestRegressor(n_estimators = 1000, oob_score = True, n_jobs = -1))])

#best_rf_parameters = {'best_rf_preprocess__text__tfidf__ngram_range' : [(1,3)], 'best_rf_preprocess__text__tfidf__min_df' : [0.05], 'best_rf_preprocess__text__tfidf__max_df' : [0.80], 'rf__max_features': [1400]}

#best_rf_gs = GridSearchCV(best_rf_pipe, best_rf_parameters, scoring = scoring_metric, return_train_score = True, n_jobs = -1, verbose=10)    


In [33]:
# One-hot encode the categorical columns by hand. dummy_na=True adds a
# "<column>_nan" indicator per column that flags rows where the value was
# missing.
X_one_hot = pd.get_dummies(
    X,
    dummy_na = True,
    columns = [
        'company_size',
        'company_type',
        'industry',
        'revenue',
        'location_bin',
    ],
)
X_one_hot

Unnamed: 0,description,company_rating,recommend_to_a_friend,ceo_approval,interview_difficulty,company_size_1 to 50 employees,company_size_10000+ employees,company_size_1001 to 5000 employees,company_size_201 to 500 employees,company_size_5001 to 10000 employees,company_size_501 to 1000 employees,company_size_51 to 200 employees,company_size_nan,company_type_College / University,company_type_Government,company_type_Hospital,company_type_Non-profit Organisation,company_type_Private,company_type_Public,company_type_Subsidiary or Business Segment,company_type_nan,industry_Accounting,industry_Advertising & Marketing,industry_Aerospace & Defence,industry_Architectural & Engineering Services,industry_Banks & Building Societies,industry_Biotech & Pharmaceuticals,industry_Chemical Manufacturing,industry_Colleges & Universities,industry_Computer Hardware & Software,industry_Consulting,industry_Consumer Product Hire,industry_Consumer Products Manufacturing,"industry_Department, Clothing, & Shoe Shops",industry_Education Training Services,industry_Energy,industry_Enterprise Software & Network Solutions,industry_Express Delivery Services,industry_Food & Drink Manufacturing,industry_General Merchandise & Superstores,industry_Government Agencies,industry_Grocery Shops & Supermarkets,industry_Health Charities,"industry_Health, Beauty & Fitness",industry_Healthcare Services & Hospitals,industry_IT Services,industry_Industrial Manufacturing,industry_Insurance Operators,industry_Internet,industry_Investment Banking & Asset Management,industry_Legal,industry_Lending,industry_Logistics & Supply Chain,industry_Mining,industry_Miscellaneous Manufacturing,industry_News Outlets,industry_Other Retail Shops,industry_Pet & Pet Supply Shops,industry_Pharmacies & Health Shops,industry_Regional & County Councils,industry_Research & Development,industry_Social Services,industry_Staffing & Outsourcing,industry_Telecommunications Services,industry_Transportation Equipment 
Manufacturing,industry_Transportation Management,industry_Utilities,industry_Vehicle Dealers,industry_Video Games,industry_nan,revenue_$1 to $2 billion (CAD) per year,revenue_$1 to $5 million (CAD) per year,revenue_$10 to $25 million (CAD) per year,revenue_$10+ billion (CAD) per year,revenue_$100 to $500 million (CAD) per year,revenue_$2 to $5 billion (CAD) per year,revenue_$5 to $10 billion (CAD) per year,revenue_$5 to $10 million (CAD) per year,revenue_$50 to $100 million (CAD) per year,revenue_$500 million to $1 billion (CAD) per year,revenue_Less than $1 million (CAD) per year,revenue_nan,location_bin_AZ,location_bin_BC,location_bin_CA,location_bin_CO,location_bin_D.C.,location_bin_England,location_bin_GA,location_bin_IL,location_bin_Ireland,location_bin_MA,location_bin_MD,location_bin_MI,location_bin_NC,location_bin_NJ,location_bin_NY,location_bin_ON,location_bin_OR,location_bin_Other,location_bin_QC,location_bin_Scotland,location_bin_TX,location_bin_VA,location_bin_WA,location_bin_nan
0,TikTok is the leading destination for short-fo...,4.1,79.0,100.0,2.7,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,THE ROLE\n\nTesla's mission is to accelerate t...,3.5,59.0,75.0,2.9,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Facebook's mission is to give people the power...,4.4,90.0,94.0,3.1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Job Description\n\nThe New York Times is commi...,3.8,80.0,95.0,2.9,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
4,About Cerebri AI Cerebri AI CVX platform uses ...,3.9,75.0,80.0,2.4,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
5,Description\n\nSHIFT: Day Job\n\nSCHEDULE: Ful...,3.4,63.0,66.0,2.7,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,Job Overview\n\nJob Summary\n\nThe Research Da...,4.2,82.0,89.0,2.8,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,"Location:\n1 Presidents Choice Circle, Brampto...",3.5,69.0,80.0,2.2,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
8,Job Summary\n\nThe UBC Data Science Institute ...,4.2,87.0,82.0,2.7,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,Job Number: R0091474\n\nData Scientist\n\nThe ...,3.9,76.0,93.0,2.7,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [41]:
# Rows whose company size was missing, as flagged by the dummy_na indicator.
missing_company_size_indices = X_one_hot[(X_one_hot['company_size_nan'] == 1)].index

# Set every company_size dummy to NaN on those rows so they register as
# missing values instead of as all-zero encodings. Rows are selected by label
# and columns by name prefix, so this does not depend on the frame having a
# default RangeIndex or on the exact column positions.
company_size_dummies = [
    column for column in X_one_hot.columns
    if column.startswith('company_size_') and column != 'company_size_nan'
]
X_one_hot.loc[missing_company_size_indices, company_size_dummies] = np.nan

In [44]:
# Remove the company_size missing-value indicator column.
# Re-running this cell raises KeyError because the column is already gone.
X_one_hot = X_one_hot.drop(columns = 'company_size_nan')

In [53]:
# Rows whose revenue band was missing, as flagged by the dummy_na indicator.
missing_revenue_indices = X_one_hot[(X_one_hot['revenue_nan'] == 1)].index
# Set every revenue dummy to NaN on those rows so they register as missing
# values. Selecting by label and name prefix avoids a positional slice that is
# only valid for one particular column layout.
revenue_dummies = [
    column for column in X_one_hot.columns
    if column.startswith('revenue_') and column != 'revenue_nan'
]
X_one_hot.loc[missing_revenue_indices, revenue_dummies] = np.nan

In [54]:
# Remove the revenue missing-value indicator column.
# Re-running this cell raises KeyError because the column is already gone.
X_one_hot = X_one_hot.drop(columns = 'revenue_nan')

In [82]:
# Remove the remaining missing-value indicator columns. Unlike company_size and
# revenue, the dummies of these three columns are not set to NaN beforehand.
X_one_hot = X_one_hot.drop(columns = ['company_type_nan', 'industry_nan', 'location_bin_nan'])

In [142]:
# Preprocessing for the hand-encoded frame: every column after `description`
# goes through a random-forest-driven IterativeImputer, and `description` goes
# through TF-IDF with fixed settings.
ct = ColumnTransformer(
    [
        ("impute", IterativeImputer(estimator = RandomForestRegressor(n_estimators = 500, n_jobs = -1), add_indicator = True), slice(1, len(X_one_hot.columns))),
        ("text", TfidfVectorizer(token_pattern =  r"[a-zA-Z0-9\+\-\.]+", ngram_range = (1,3), min_df = 0.05, max_df = 0.8), 'description'),
    ],
    remainder = 'passthrough',
)

# NOTE: rf_pipe, rf_parameters and rf_gs rebind the names used by the earlier
# random-forest search in this notebook.
rf_pipe = Pipeline(steps = [("ct", ct), ("rf", RandomForestRegressor(n_estimators = 500, oob_score = True, n_jobs = -1, max_features = 1400))])

# Only the imputer's inner forest is tuned here.
rf_parameters = {'ct__impute__estimator__max_features' : [25, 50, 75, 100]}

# Reuse the notebook-wide scorer rather than rebuilding an identical one.
rf_gs = GridSearchCV(rf_pipe, rf_parameters, scoring = scoring_metric, return_train_score = True, n_jobs = -1, verbose = 10)

# NOTE(review): the only visible assignment of `rf_cv` is the commented-out
# line below, so the cells that display rf_cv fail on a fresh kernel.
#rf_cv = cross_val_score(rf_pipe, X_one_hot, Y, scoring = make_scorer(scoring_metric_, greater_is_better = False), verbose = 10, n_jobs = -1)


In [143]:
rf_gs.fit(X_one_hot, Y)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  20 | elapsed: 18.5min remaining: 104.9min
[Parallel(n_jobs=-1)]: Done   6 out of  20 | elapsed: 18.5min remaining: 43.2min
[Parallel(n_jobs=-1)]: Done   9 out of  20 | elapsed: 18.5min remaining: 22.7min
[Parallel(n_jobs=-1)]: Done  12 out of  20 | elapsed: 18.5min remaining: 12.4min
[Parallel(n_jobs=-1)]: Done  15 out of  20 | elapsed: 18.6min remaining:  6.2min
[Parallel(n_jobs=-1)]: Done  18 out of  20 | elapsed: 26.6min remaining:  3.0min
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed: 26.7min finished


GridSearchCV(estimator=Pipeline(steps=[('ct',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('impute',
                                                                         IterativeImputer(add_indicator=True,
                                                                                          estimator=RandomForestRegressor(n_estimators=500,
                                                                                                                          n_jobs=-1)),
                                                                         slice(1, 101, None)),
                                                                        ('text',
                                                                         TfidfVectorizer(max_df=0.8,
                                                                                         min_df=0.05,
                        

In [145]:
rf_gs.best_score_

-415603156.3486622

In [129]:
# NOTE(review): the only visible assignment of rf_cv is a commented-out
# cross_val_score call in an earlier cell, so this fails on a fresh kernel.
rf_cv

array([-4.19182336e+08, -5.79896435e+08, -5.47206881e+08, -2.59441840e+08,
       -2.63256642e+08])

In [130]:
np.average(rf_cv)

-413796826.8424664

In [85]:
# Impute every column after `description` with a default IterativeImputer
# (plus missing-value indicators) and pass the remaining column through
# untouched.
ct = ColumnTransformer(
    [
        (
            "impute",
            IterativeImputer(add_indicator = True),
            slice(1, len(X_one_hot.columns)),
        ),
    ],
    remainder = 'passthrough',
)
lap = ct.fit_transform(X_one_hot)



In [88]:
# Wrap the transformed matrix in a frame and report, per column, whether any
# missing value is left.
lap1 = pd.DataFrame(data = lap)
lap1.isna().any()

0      False
1      False
2      False
3      False
4      False
5      False
6      False
7      False
8      False
9      False
10     False
11     False
12     False
13     False
14     False
15     False
16     False
17     False
18     False
19     False
20     False
21     False
22     False
23     False
24     False
25     False
26     False
27     False
28     False
29     False
30     False
31     False
32     False
33     False
34     False
35     False
36     False
37     False
38     False
39     False
40     False
41     False
42     False
43     False
44     False
45     False
46     False
47     False
48     False
49     False
50     False
51     False
52     False
53     False
54     False
55     False
56     False
57     False
58     False
59     False
60     False
61     False
62     False
63     False
64     False
65     False
66     False
67     False
68     False
69     False
70     False
71     False
72     False
73     False
74     False
75     False
76     False

In [106]:
faggot3=ColumnTransformer([("faggot", IterativeImputer(), slice(0,len(faggot2.columns)))])
faggot4 = faggot3.fit_transform(faggot2)

ValueError: could not convert string to float: 'a'

In [77]:
faggot4

array([[ 2.,  3.,  4.],
       [ 6.,  7.,  8.],
       [10., 11., 12.]])

In [75]:
len(faggot2.columns)

4

In [99]:
# Stand-alone TF-IDF fit on the description column, using the same settings as
# the "text" branch of the imputer-based transformer.
tfidf = TfidfVectorizer(
    token_pattern = r"[a-zA-Z0-9\+\-\.]+",
    ngram_range = (1,3),
    min_df = 0.05,
    max_df = 0.8,
)
corpus = tfidf.fit_transform(X_one_hot['description'])

In [113]:
# NOTE(review): `ct` is assigned in more than one cell of this notebook; which
# definition was live when this ran cannot be told from the execution counts —
# confirm before relying on the shape shown below.
o = ct.fit_transform(X_one_hot)



In [114]:
o

<205x2190 sparse matrix of type '<class 'numpy.float64'>'
	with 61010 stored elements in Compressed Sparse Row format>