In [85]:
import pandas as pd
import string
import re
import nltk
import matplotlib.pyplot as plt
import numpy as np
import warnings
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
nltk.download('stopwords')
nltk.download('wordnet')
stopword = nltk.corpus.stopwords.words("english")
ps = nltk.PorterStemmer()
wn = nltk.WordNetLemmatizer()
warnings.filterwarnings("ignore", category=DeprecationWarning)
%matplotlib inline

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ckkoc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ckkoc\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [33]:
data = pd.read_csv("job_list_all.csv", sep = ",")
data.head()

Unnamed: 0,job_title,company_name,location,summary,job search
0,Data Analyst - YouTube,Webhelp Americas,"San Bruno, CAâ€¢Temporarily Remote",Strong grasp on data structure and data integr...,data analyst
1,Marketing Data Analyst (REMOTE),Molina Healthcare,"Long Beach, CA 90802â€¢Remote",Designs and implements processes and solutions...,data analyst
2,Entry Level Data Analyst,Insight Global,"Carlsbad, CA 92008",They will be responsible for data administrati...,data analyst
3,Entry Level - Data Engineer / Data Analyst (STEM),PCS Global Tech,"Poway, CAâ€¢Temporarily Remote","Provides plan with data, reporting, and analys...",data analyst
4,Data Analyst/Data Engineer,MT Global US INC,"Foster City, CA 94404","\* Owner of the core company data pipeline, re...",data analyst


In [34]:
def remove_punct(text):
    text_nopunct = "".join([char for char in text if char not in string.punctuation])
    return text_nopunct
data["summary_clean"] = data["summary"].apply(lambda x: remove_punct(x))
data.head()

Unnamed: 0,job_title,company_name,location,summary,job search,summary_clean
0,Data Analyst - YouTube,Webhelp Americas,"San Bruno, CAâ€¢Temporarily Remote",Strong grasp on data structure and data integr...,data analyst,Strong grasp on data structure and data integr...
1,Marketing Data Analyst (REMOTE),Molina Healthcare,"Long Beach, CA 90802â€¢Remote",Designs and implements processes and solutions...,data analyst,Designs and implements processes and solutions...
2,Entry Level Data Analyst,Insight Global,"Carlsbad, CA 92008",They will be responsible for data administrati...,data analyst,They will be responsible for data administrati...
3,Entry Level - Data Engineer / Data Analyst (STEM),PCS Global Tech,"Poway, CAâ€¢Temporarily Remote","Provides plan with data, reporting, and analys...",data analyst,Provides plan with data reporting and analyses...
4,Data Analyst/Data Engineer,MT Global US INC,"Foster City, CA 94404","\* Owner of the core company data pipeline, re...",data analyst,Owner of the core company data pipeline respo...


In [35]:
def tokenize(text):
    tokens = re.split('\W+', text)
    return tokens
data["tokens"] = data["summary_clean"].apply(lambda x: tokenize(x.lower()))
data.head()

Unnamed: 0,job_title,company_name,location,summary,job search,summary_clean,tokens
0,Data Analyst - YouTube,Webhelp Americas,"San Bruno, CAâ€¢Temporarily Remote",Strong grasp on data structure and data integr...,data analyst,Strong grasp on data structure and data integr...,"[strong, grasp, on, data, structure, and, data..."
1,Marketing Data Analyst (REMOTE),Molina Healthcare,"Long Beach, CA 90802â€¢Remote",Designs and implements processes and solutions...,data analyst,Designs and implements processes and solutions...,"[designs, and, implements, processes, and, sol..."
2,Entry Level Data Analyst,Insight Global,"Carlsbad, CA 92008",They will be responsible for data administrati...,data analyst,They will be responsible for data administrati...,"[they, will, be, responsible, for, data, admin..."
3,Entry Level - Data Engineer / Data Analyst (STEM),PCS Global Tech,"Poway, CAâ€¢Temporarily Remote","Provides plan with data, reporting, and analys...",data analyst,Provides plan with data reporting and analyses...,"[provides, plan, with, data, reporting, and, a..."
4,Data Analyst/Data Engineer,MT Global US INC,"Foster City, CA 94404","\* Owner of the core company data pipeline, re...",data analyst,Owner of the core company data pipeline respo...,"[, owner, of, the, core, company, data, pipeli..."


In [36]:
def remove_stopwords(tokens):
    text = [word for word in tokens if word not in stopword]
    return text
data["no_stop"] = data["tokens"].apply(lambda x: remove_stopwords(x))
data.head()

Unnamed: 0,job_title,company_name,location,summary,job search,summary_clean,tokens,no_stop
0,Data Analyst - YouTube,Webhelp Americas,"San Bruno, CAâ€¢Temporarily Remote",Strong grasp on data structure and data integr...,data analyst,Strong grasp on data structure and data integr...,"[strong, grasp, on, data, structure, and, data...","[strong, grasp, data, structure, data, integri..."
1,Marketing Data Analyst (REMOTE),Molina Healthcare,"Long Beach, CA 90802â€¢Remote",Designs and implements processes and solutions...,data analyst,Designs and implements processes and solutions...,"[designs, and, implements, processes, and, sol...","[designs, implements, processes, solutions, as..."
2,Entry Level Data Analyst,Insight Global,"Carlsbad, CA 92008",They will be responsible for data administrati...,data analyst,They will be responsible for data administrati...,"[they, will, be, responsible, for, data, admin...","[responsible, data, administration, tasks, pul..."
3,Entry Level - Data Engineer / Data Analyst (STEM),PCS Global Tech,"Poway, CAâ€¢Temporarily Remote","Provides plan with data, reporting, and analys...",data analyst,Provides plan with data reporting and analyses...,"[provides, plan, with, data, reporting, and, a...","[provides, plan, data, reporting, analyses, en..."
4,Data Analyst/Data Engineer,MT Global US INC,"Foster City, CA 94404","\* Owner of the core company data pipeline, re...",data analyst,Owner of the core company data pipeline respo...,"[, owner, of, the, core, company, data, pipeli...","[, owner, core, company, data, pipeline, respo..."


In [38]:
def stemming(tnp):
    text = [ps.stem(word) for word in tnp]
    return text
data["stems"] = data["no_stop"].apply(lambda x: stemming(x))
data.head()

Unnamed: 0,job_title,company_name,location,summary,job search,summary_clean,tokens,no_stop,stems
0,Data Analyst - YouTube,Webhelp Americas,"San Bruno, CAâ€¢Temporarily Remote",Strong grasp on data structure and data integr...,data analyst,Strong grasp on data structure and data integr...,"[strong, grasp, on, data, structure, and, data...","[strong, grasp, data, structure, data, integri...","[strong, grasp, data, structur, data, integr, ..."
1,Marketing Data Analyst (REMOTE),Molina Healthcare,"Long Beach, CA 90802â€¢Remote",Designs and implements processes and solutions...,data analyst,Designs and implements processes and solutions...,"[designs, and, implements, processes, and, sol...","[designs, implements, processes, solutions, as...","[design, implement, process, solut, associ, wi..."
2,Entry Level Data Analyst,Insight Global,"Carlsbad, CA 92008",They will be responsible for data administrati...,data analyst,They will be responsible for data administrati...,"[they, will, be, responsible, for, data, admin...","[responsible, data, administration, tasks, pul...","[respons, data, administr, task, pull, data, i..."
3,Entry Level - Data Engineer / Data Analyst (STEM),PCS Global Tech,"Poway, CAâ€¢Temporarily Remote","Provides plan with data, reporting, and analys...",data analyst,Provides plan with data reporting and analyses...,"[provides, plan, with, data, reporting, and, a...","[provides, plan, data, reporting, analyses, en...","[provid, plan, data, report, analys, enabl, da..."
4,Data Analyst/Data Engineer,MT Global US INC,"Foster City, CA 94404","\* Owner of the core company data pipeline, re...",data analyst,Owner of the core company data pipeline respo...,"[, owner, of, the, core, company, data, pipeli...","[, owner, core, company, data, pipeline, respo...","[, owner, core, compani, data, pipelin, respon..."


In [87]:
def lemmatizing(tnp):
    text = [wn.lemmatize(word) for word in tnp]
    return text
data["lemmatize"] = data["no_stop"].apply(lambda x: lemmatizing(x))
data.to_csv("preprocessed_data.csv", sep=',')
data.head()

Unnamed: 0.1,Unnamed: 0,job_title,company_name,location,summary,job search,summary_clean,tokens,no_stop,stems,lemmatize
0,0,Data Analyst - YouTube,Webhelp Americas,"San Bruno, CAâ€¢Temporarily Remote",Strong grasp on data structure and data integrity issues. Attention to detail and work ethic for...,data analyst,Strong grasp on data structure and data integrity issues Attention to detail and work ethic for ...,"['strong', 'grasp', 'on', 'data', 'structure', 'and', 'data', 'integrity', 'issues', 'attention'...","['strong', 'grasp', 'data', 'structure', 'data', 'integrity', 'issues', 'attention', 'detail', '...","['strong', 'grasp', 'data', 'structur', 'data', 'integr', 'issu', 'attent', 'detail', 'work', 'e...","[[, ', s, t, r, o, n, g, ', ,, , ', g, r, a, s, p, ', ,, , ', d, a, t, a, ', ,, , ', s, t, r,..."
1,1,Marketing Data Analyst (REMOTE),Molina Healthcare,"Long Beach, CA 90802â€¢Remote",Designs and implements processes and solutions associated with a wide variety of data sets used ...,data analyst,Designs and implements processes and solutions associated with a wide variety of data sets used ...,"['designs', 'and', 'implements', 'processes', 'and', 'solutions', 'associated', 'with', 'a', 'wi...","['designs', 'implements', 'processes', 'solutions', 'associated', 'wide', 'variety', 'data', 'se...","['design', 'implement', 'process', 'solut', 'associ', 'wide', 'varieti', 'data', 'set', 'use', '...","[[, ', d, e, s, i, g, n, s, ', ,, , ', i, m, p, l, e, m, e, n, t, s, ', ,, , ', p, r, o, c, e,..."
2,2,Entry Level Data Analyst,Insight Global,"Carlsbad, CA 92008",They will be responsible for data administration tasks such as pulling data from internal system...,data analyst,They will be responsible for data administration tasks such as pulling data from internal system...,"['they', 'will', 'be', 'responsible', 'for', 'data', 'administration', 'tasks', 'such', 'as', 'p...","['responsible', 'data', 'administration', 'tasks', 'pulling', 'data', 'internal', 'systems', 'sc...","['respons', 'data', 'administr', 'task', 'pull', 'data', 'intern', 'system', 'scrub', 'sort', '1...","[[, ', r, e, s, p, o, n, s, i, b, l, e, ', ,, , ', d, a, t, a, ', ,, , ', a, d, m, i, n, i, s,..."
3,3,Entry Level - Data Engineer / Data Analyst (STEM),PCS Global Tech,"Poway, CAâ€¢Temporarily Remote","Provides plan with data, reporting, and analyses that enable data-driven decision-making. Experi...",data analyst,Provides plan with data reporting and analyses that enable datadriven decisionmaking Experience ...,"['provides', 'plan', 'with', 'data', 'reporting', 'and', 'analyses', 'that', 'enable', 'datadriv...","['provides', 'plan', 'data', 'reporting', 'analyses', 'enable', 'datadriven', 'decisionmaking', ...","['provid', 'plan', 'data', 'report', 'analys', 'enabl', 'datadriven', 'decisionmak', 'experi', '...","[[, ', p, r, o, v, i, d, e, s, ', ,, , ', p, l, a, n, ', ,, , ', d, a, t, a, ', ,, , ', r, e,..."
4,4,Data Analyst/Data Engineer,MT Global US INC,"Foster City, CA 94404","\* Owner of the core company data pipeline, responsible for scaling up data processing flow to m...",data analyst,Owner of the core company data pipeline responsible for scaling up data processing flow to meet...,"['', 'owner', 'of', 'the', 'core', 'company', 'data', 'pipeline', 'responsible', 'for', 'scaling...","['', 'owner', 'core', 'company', 'data', 'pipeline', 'responsible', 'scaling', 'data', 'processi...","['', 'owner', 'core', 'compani', 'data', 'pipelin', 'respons', 'scale', 'data', 'process', 'flow...","[[, ', ', ,, , ', o, w, n, e, r, ', ,, , ', c, o, r, e, ', ,, , ', c, o, m, p, a, n, y, ', ,,..."


In [88]:
pd.set_option('display.max_colwidth', 100) 
data = pd.read_csv("preprocessed_data.csv", sep=',')

In [89]:
def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [ps.stem(word) for word in tokens if word not in stopword]
    return text

In [90]:
count_vect = CountVectorizer(analyzer=clean_text)
X_counts = count_vect.fit_transform(data['summary'])
print(X_counts.shape)
print(count_vect.get_feature_names())

(2559, 2960)
['', '1', '10', '1015', '10tb', '10â', '11000', '1159', '12', '12week', '13', '14', '15', '150170k', '1553', '180', '1st', '1stparti', '2', '20', '21st', '23', '24', '247', '25', '2year', '2â', '3', '30', '311', '34', '35', '36', '360degre', '362', '3d', '3year', '3ã', '4', '40', '401k', '41621', '430', '46', '48', '5', '510', '510â', '515', '57', '5g', '6', '6ã', '7', '70', '710', '730', '8', '810', 'ab', 'abil', 'abl', 'abreast', 'abstract', 'abstractâ', 'abus', 'academ', 'academia', 'academicwork', 'acceler', 'accentur', 'accept', 'access', 'accomplish', 'accord', 'account', 'accredit', 'accrual', 'accur', 'accuraci', 'achiev', 'achieveâ', 'acl', 'acquir', 'acquisit', 'across', 'acrossâ', 'act', 'action', 'activ', 'activevideo', 'activis', 'actuari', 'acumen', 'acycl', 'ad', 'adam', 'adapt', 'addit', 'address', 'adept', 'adequ', 'adher', 'adhoc', 'adjust', 'administr', 'adob', 'adobeomnitur', 'adobeâ', 'adopt', 'advanc', 'advancedâ', 'advantag', 'advertis', 'advertising

In [91]:
X_counts_df = pd.DataFrame(X_counts.toarray(), columns=count_vect.get_feature_names())
X_counts_df.head()

Unnamed: 0,Unnamed: 1,1,10,1015,10tb,10â,11000,1159,12,12week,...,zone,â,å,æ,ç,œbig,œcleanâ,œdata,œoper,œstoryâ
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,1,2,1,0,0,0,0,0


In [92]:
ngram_vect = CountVectorizer(ngram_range=(2,2),analyzer=clean_text)
X_counts = ngram_vect.fit_transform(data['summary'])
print(X_counts.shape)
print(ngram_vect.get_feature_names())

(2559, 2960)
['', '1', '10', '1015', '10tb', '10â', '11000', '1159', '12', '12week', '13', '14', '15', '150170k', '1553', '180', '1st', '1stparti', '2', '20', '21st', '23', '24', '247', '25', '2year', '2â', '3', '30', '311', '34', '35', '36', '360degre', '362', '3d', '3year', '3ã', '4', '40', '401k', '41621', '430', '46', '48', '5', '510', '510â', '515', '57', '5g', '6', '6ã', '7', '70', '710', '730', '8', '810', 'ab', 'abil', 'abl', 'abreast', 'abstract', 'abstractâ', 'abus', 'academ', 'academia', 'academicwork', 'acceler', 'accentur', 'accept', 'access', 'accomplish', 'accord', 'account', 'accredit', 'accrual', 'accur', 'accuraci', 'achiev', 'achieveâ', 'acl', 'acquir', 'acquisit', 'across', 'acrossâ', 'act', 'action', 'activ', 'activevideo', 'activis', 'actuari', 'acumen', 'acycl', 'ad', 'adam', 'adapt', 'addit', 'address', 'adept', 'adequ', 'adher', 'adhoc', 'adjust', 'administr', 'adob', 'adobeomnitur', 'adobeâ', 'adopt', 'advanc', 'advancedâ', 'advantag', 'advertis', 'advertising

In [93]:
X_counts_df = pd.DataFrame(X_counts.toarray(), columns=ngram_vect.get_feature_names())
X_counts_df.head()

Unnamed: 0,Unnamed: 1,1,10,1015,10tb,10â,11000,1159,12,12week,...,zone,â,å,æ,ç,œbig,œcleanâ,œdata,œoper,œstoryâ
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,1,2,1,0,0,0,0,0


In [94]:
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidf_vect.fit_transform(data['summary'])
print(X_tfidf.shape)
print(tfidf_vect.get_feature_names())

(2559, 2960)
['', '1', '10', '1015', '10tb', '10â', '11000', '1159', '12', '12week', '13', '14', '15', '150170k', '1553', '180', '1st', '1stparti', '2', '20', '21st', '23', '24', '247', '25', '2year', '2â', '3', '30', '311', '34', '35', '36', '360degre', '362', '3d', '3year', '3ã', '4', '40', '401k', '41621', '430', '46', '48', '5', '510', '510â', '515', '57', '5g', '6', '6ã', '7', '70', '710', '730', '8', '810', 'ab', 'abil', 'abl', 'abreast', 'abstract', 'abstractâ', 'abus', 'academ', 'academia', 'academicwork', 'acceler', 'accentur', 'accept', 'access', 'accomplish', 'accord', 'account', 'accredit', 'accrual', 'accur', 'accuraci', 'achiev', 'achieveâ', 'acl', 'acquir', 'acquisit', 'across', 'acrossâ', 'act', 'action', 'activ', 'activevideo', 'activis', 'actuari', 'acumen', 'acycl', 'ad', 'adam', 'adapt', 'addit', 'address', 'adept', 'adequ', 'adher', 'adhoc', 'adjust', 'administr', 'adob', 'adobeomnitur', 'adobeâ', 'adopt', 'advanc', 'advancedâ', 'advantag', 'advertis', 'advertising

In [95]:
X_tfidf_df = pd.DataFrame(X_tfidf.toarray(), columns=tfidf_vect.get_feature_names())
X_tfidf_df.head()

Unnamed: 0,Unnamed: 1,1,10,1015,10tb,10â,11000,1159,12,12week,...,zone,â,å,æ,ç,œbig,œcleanâ,œdata,œoper,œstoryâ
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.084265,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.260476,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.090396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.057043,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.286558,0.573116,0.286558,0.0,0.0,0.0,0.0,0.0


In [96]:
print(X_tfidf_df.loc[(X_tfidf_df!=0).any(axis=1)])

                       1   10  1015  10tb  10â  11000  1159   12  12week  ...  \
0     0.000000  0.000000  0.0   0.0   0.0  0.0    0.0   0.0  0.0     0.0  ...   
1     0.084265  0.000000  0.0   0.0   0.0  0.0    0.0   0.0  0.0     0.0  ...   
2     0.000000  0.260476  0.0   0.0   0.0  0.0    0.0   0.0  0.0     0.0  ...   
3     0.090396  0.000000  0.0   0.0   0.0  0.0    0.0   0.0  0.0     0.0  ...   
4     0.057043  0.000000  0.0   0.0   0.0  0.0    0.0   0.0  0.0     0.0  ...   
...        ...       ...  ...   ...   ...  ...    ...   ...  ...     ...  ...   
2554  0.000000  0.000000  0.0   0.0   0.0  0.0    0.0   0.0  0.0     0.0  ...   
2555  0.078563  0.000000  0.0   0.0   0.0  0.0    0.0   0.0  0.0     0.0  ...   
2556  0.000000  0.000000  0.0   0.0   0.0  0.0    0.0   0.0  0.0     0.0  ...   
2557  0.080581  0.000000  0.0   0.0   0.0  0.0    0.0   0.0  0.0     0.0  ...   
2558  0.066091  0.000000  0.0   0.0   0.0  0.0    0.0   0.0  0.0     0.0  ...   

      zone    â         å  

In [101]:
def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [ps.stem(word) for word in tokens if word not in stopword]
    return text

# TF-IDF
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidf_vect.fit_transform(data['summary'])
X_tfidf_feat = pd.DataFrame(X_tfidf.toarray())

# CountVectorizer
count_vect = CountVectorizer(analyzer=clean_text)
X_count = count_vect.fit_transform(data['summary'])
X_count_feat = pd.DataFrame(X_count.toarray())

In [102]:
rf = RandomForestClassifier()
param = {'n_estimators': [10, 150, 300],
        'max_depth': [30, 60, 90, None]}

gs = GridSearchCV(rf, param, cv=5, n_jobs=-1)# n_jobs=-1 for parallelizing search
gs_fit = gs.fit(X_count_feat, data['job search'])
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False).head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
8,28.508678,2.736939,0.366097,0.074781,90.0,300,"{'max_depth': 90, 'n_estimators': 300}",0.826172,0.851562,0.884766,0.830078,0.861057,0.850727,0.021426,1
11,19.549322,0.984116,0.176985,0.040474,,300,"{'max_depth': None, 'n_estimators': 300}",0.830078,0.865234,0.875,0.820312,0.853229,0.848771,0.020679,2
10,13.738385,0.387122,0.250753,0.01925,,150,"{'max_depth': None, 'n_estimators': 150}",0.818359,0.859375,0.875,0.822266,0.857143,0.846429,0.022228,3
7,13.835127,0.176865,0.244027,0.014902,90.0,150,"{'max_depth': 90, 'n_estimators': 150}",0.824219,0.865234,0.876953,0.830078,0.833659,0.846029,0.021016,4
4,14.422146,1.010972,0.272405,0.045487,60.0,150,"{'max_depth': 60, 'n_estimators': 150}",0.820312,0.853516,0.859375,0.835938,0.851272,0.844083,0.014184,5


In [103]:
gs = GridSearchCV(rf, param, cv=5, n_jobs=-1)# n_jobs=-1 for parallelizing search
gs_fit = gs.fit(X_tfidf_feat, data['job search'])
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False).head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
7,18.488348,0.926705,0.305064,0.030659,90.0,150,"{'max_depth': 90, 'n_estimators': 150}",0.814453,0.845703,0.873047,0.839844,0.853229,0.845255,0.019048,1
11,28.782107,2.145447,0.208467,0.086192,,300,"{'max_depth': None, 'n_estimators': 300}",0.814453,0.849609,0.878906,0.826172,0.847358,0.8433,0.022134,2
8,35.477613,1.961275,0.553704,0.069227,90.0,300,"{'max_depth': 90, 'n_estimators': 300}",0.814453,0.849609,0.873047,0.832031,0.845401,0.842908,0.019429,3
10,16.861695,1.091813,0.309244,0.066631,,150,"{'max_depth': None, 'n_estimators': 150}",0.818359,0.849609,0.869141,0.832031,0.827789,0.839386,0.018003,4
4,15.313545,1.465484,0.232212,0.007614,60.0,150,"{'max_depth': 60, 'n_estimators': 150}",0.8125,0.847656,0.859375,0.832031,0.843444,0.839001,0.015879,5


In [None]:
def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [ps.stem(word) for word in tokens if word not in stopword]
    return text

# TF-IDF
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidf_vect.fit_transform(data['summary'])
X_tfidf_feat = pd.DataFrame(X_tfidf.toarray())

# CountVectorizer
count_vect = CountVectorizer(analyzer=clean_text)
X_count = count_vect.fit_transform(data['summary'])
X_count_feat = pd.DataFrame(X_count.toarray())

In [None]:
gs_predict = gs.predict(data['job search'])