In [48]:
# library needed
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import re
from sklearn import preprocessing
import scipy
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import sklearn

# import data

In [49]:
dat = pd.read_csv("./job_training_data.csv")

In [50]:
stop_words = {".",'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 're',
 's',
 'same',
 'shan',
 "shan't",
 'she',
 "she's",
 'should',
 "should've",
 'shouldn',
 "shouldn't",
 'so',
 'some',
 'such',
 't',
 'than',
 'that',
 "that'll",
 'the',
 'their',
 'theirs',
 'them',
 'themselves',
 'then',
 'there',
 'these',
 'they',
 'this',
 'those',
 'through',
 'to',
 'too',
 'under',
 'until',
 'up',
 've',
 'very',
 'was',
 'wasn',
 "wasn't",
 'we',
 'were',
 'weren',
 "weren't",
 'what',
 'when',
 'where',
 'which',
 'while',
 'who',
 'whom',
 'why',
 'will',
 'with',
 'won',
 "won't",
 'wouldn',
 "wouldn't",
 'y',
 'you',
 "you'd",
 "you'll",
 "you're",
 "you've",
 'your',
 'yours',
 'yourself',
 'yourselves',"us"}

# Main Parsing function

1. get number of words
2. bag-of-words (token count) encode the descriptive text columns and the categorical columns
3. include a 0/1 missing-value (NA) indicator for each column

In [51]:
def parsing(dat, stop_words):
    """Turn the raw job-posting frame into a single numeric feature matrix.

    The returned columns are concatenated in this order:

    1. token counts of ``company_profile``, ``description``, ``requirements``
       and ``benefits`` (each column gets its own ``CountVectorizer`` vocabulary);
    2. the number of space-separated words in each of those four columns;
    3. token counts of the remaining text/categorical columns, joined into
       one string per row;
    4. the ``telecommuting``, ``has_company_logo`` and ``has_questions`` columns;
    5. a 0/1 missing-value indicator for every column of ``dat``.

    Missing text is replaced by a placeholder string before vectorizing, so the
    placeholder itself becomes a token.

    Parameters
    ----------
    dat : pandas.DataFrame
        Raw data containing the columns named above.
    stop_words : set, list, 'english' or None
        Tokens to ignore while vectorizing. A set is converted to a sorted
        list because scikit-learn documents this parameter as a list and newer
        releases reject other containers.

    Returns
    -------
    pandas.DataFrame
        Feature matrix with integer column labels and a 0..n-1 row index.

    Notes
    -----
    The vectorizers are fitted inside this function, so the vocabulary (and
    therefore the column layout) depends on the rows passed in.
    """
    # CountVectorizer documents stop_words as {'english'}, list or None.
    if isinstance(stop_words, (set, frozenset)):
        stop_words = sorted(stop_words)

    # from_spmatrix always produces a 0..n-1 row index; give ``dat`` the same
    # index so the column-wise concat below lines rows up by position.
    dat = dat.reset_index(drop=True)

    # The four descriptive columns
    descriptive_columns = ['company_profile', 'description', 'requirements', 'benefits']
    # Fill missing text once per column instead of once per row.
    filled_text = {column: dat[column].fillna('NaN') for column in descriptive_columns}

    # Token counts, one independently fitted vocabulary per descriptive column.
    onehot_frames = [
        pd.DataFrame.sparse.from_spmatrix(
            CountVectorizer(stop_words=stop_words).fit_transform(filled_text[column])
        )
        for column in descriptive_columns
    ]
    print('fitting done')

    # Text length (number of single-space-separated pieces) per descriptive column.
    progress_messages = ['done with profile', 'done with description',
                         'done with requirements', 'text length done']
    count_series = []
    for column, message in zip(descriptive_columns, progress_messages):
        count_series.append(
            filled_text[column].map(lambda document: len(document.split(' ')))
        )
        print(message)

    # Other columns: join them into one string per row and count tokens.
    categorical_columns = ['location', 'department', 'salary_range', 'employment_type',
                           'required_experience', 'required_education', 'industry', 'function.']
    text = dat[categorical_columns].fillna('NAN').agg(' '.join, axis=1)
    text_onehot = pd.DataFrame.sparse.from_spmatrix(
        CountVectorizer(stop_words=stop_words).fit_transform(text)
    )

    # Concatenate everything; ignore_index relabels the columns 0..k-1.
    data = pd.concat(
        onehot_frames
        + count_series
        + [text_onehot,
           dat[['telecommuting', 'has_company_logo', 'has_questions']],
           dat.isna().astype('int')],
        axis=1, ignore_index=True)
    return data

In [52]:
# Build the feature matrix from every column except the last, then standardize each
# feature to zero mean and unit variance.
# NOTE(review): the dropped last column is presumably the `fraudulent` label — confirm against the CSV.
data_normalized = preprocessing.scale(parsing(dat.iloc[:,:-1],stop_words=stop_words),with_mean=True,with_std=True)

fitting done
done with profile
done with description
done with requirements
text length done




In [53]:
# np.savetxt("data_normalized.csv",data_normalized ,delimiter=",")

# PCA

In [54]:
from sklearn.decomposition import PCA

In [55]:
pca = PCA()
pca.fit(data_normalized)

PCA()

In [56]:
# Keep the leading principal components whose cumulative explained-variance
# ratio first exceeds 0.9, then project the standardized data onto them.
cumulative_ratio = pca.explained_variance_ratio_.cumsum()
n_components_kept = np.flatnonzero(cumulative_ratio > 0.9)[0] + 1
components = pca.components_[:n_components_kept, :]
pca_features = data_normalized @ components.T

In [57]:
#pd.DataFrame(pca_features).to_csv('pca_features.csv')

# Trees

In [58]:
import time

In [59]:
rf = RandomForestClassifier()

In [60]:
# Hyper-parameter grid for the random forest: two max_features settings x three
# ccp_alpha values (6 candidates). n_estimators and n_jobs each list a single
# value, so they are fixed rather than searched.
param_grid = {'max_features':['sqrt', 'log2'],
             'n_estimators':[200],
             'ccp_alpha':[0,0.3,0.5],
             'n_jobs':[6]}
# Every candidate is scored with 10-fold cross-validation.
clf = GridSearchCV(rf, param_grid,cv=10)

In [61]:
start = time.time()
clf.fit(pca_features,dat.fraudulent)
print(time.time()-start)

886.3657009601593


In [63]:
clf.best_params_

{'ccp_alpha': 0, 'max_features': 'sqrt', 'n_estimators': 200, 'n_jobs': 6}

In [64]:
# Fraction of rows predicted correctly on the same data the grid search was fitted on
# (in-sample accuracy, not a held-out estimate).
sum(clf.predict(pca_features) == dat.fraudulent)/len(dat.fraudulent)

1.0

In [65]:
# confusion_matrix takes (y_true, y_pred): rows are the true labels, columns the predictions.
# The predictions are made on the rows the model was fitted on, so this is an in-sample matrix.
sklearn.metrics.confusion_matrix(dat.fraudulent, clf.predict(pca_features))

array([[5103,    0],
       [   0,  259]], dtype=int64)

In [22]:
start = time.time()
print()
rf = RandomForestClassifier(n_jobs =6 , max_features='log2', n_estimators = 200)
rf.fit(pca_features,dat.fraudulent)
print(time.time()-start)


6.176079750061035


# Code to Deal with Salary

In [None]:
# see if character is in text
def alpha_in_text(text):
    """Return True when at least one character of ``text`` is alphabetic."""
    for character in text:
        if character.isalpha():
            return True
    return False

# see how many dashes are in text
def number_of_dashes(text):
    """Count the items produced by iterating ``text`` that contain a '-'.

    For a string this is simply the number of dash characters.
    """
    dash_total = 0
    for item in text:
        if '-' in item:
            dash_total += 1
    return dash_total

# extract smallest salary range value
def salary_extract_first(text):
    """Return the lower bound of a ``'low-high'`` salary range as a float.

    Returns -1 when ``text`` is missing or is not of the form
    ``<digits>-<anything>`` with exactly one dash, and -2 when ``text``
    contains any alphabetic character.
    """
    return _salary_bound(text, 0)


# largest salary range value
def salary_extract_second(text):
    """Return the upper bound of a ``'low-high'`` salary range as a float.

    Returns -1 when ``text`` is missing or is not of the form
    ``<anything>-<digits>`` with exactly one dash, and -2 when ``text``
    contains any alphabetic character.
    """
    return _salary_bound(text, 1)


def _salary_bound(text, position):
    """Parse one side (0 = left, 1 = right) of a single-dash salary range."""
    if pd.isna(text):
        return -1

    if any(character.isalpha() for character in text):
        return -2

    # Exactly one dash <=> splitting on '-' yields exactly two parts.
    parts = text.split('-')
    if len(parts) != 2:
        return -1

    candidate = parts[position]
    # isdecimal rather than isdigit: isdigit also accepts characters such as
    # '²' that float() cannot parse, which would raise ValueError here.
    if candidate.isdecimal():
        return float(candidate)
    return -1

# convert numeric salary to category
def salary_category_first(number):
    """Map a lower-bound salary (or a -1 / -2 sentinel) to a category label.

    Labels are strings: '1' for the -1 sentinel, '2' for the -2 sentinel,
    '3' for values up to the first cut point, '4'..'10' for the
    left-open/right-closed intervals between consecutive cut points, and
    '11' for everything above the second-to-last cut point.
    """
    percentile = [60.0, 14000.0, 20000.0, 30000.0, 35000.0, 44374.4, 55000.0, 70000.0, 90000.0]
    return _salary_category(number, percentile)


def salary_category_second(number):
    """Map an upper-bound salary (or a -1 / -2 sentinel) to a category label.

    Same labelling scheme as ``salary_category_first`` but with the cut
    points used for the upper end of the salary range.
    """
    percentile = [120, 20000.0, 30000.0, 40000.0, 50000.0, 65000.0, 80000.0, 100000.0, 130000.0]
    return _salary_category(number, percentile)


def _salary_category(number, percentile):
    """Return the string category of ``number`` for the given ascending cut points.

    Always returns a ``str`` so that downstream dummy encoding sees a single
    label type. Every numeric input receives a label: a value equal to the
    first cut point falls in the lowest bin, and values between the last two
    cut points fall in the top bin together with values above the last one.
    """
    if number == -1:
        return str(1)

    if number == -2:
        return str(2)

    # Number of cut points (excluding the last) strictly below ``number``:
    # 0 -> at or below the first cut, k -> in (cut[k-1], cut[k]],
    # len - 1 -> above the second-to-last cut.
    bin_index = sum(number > cut for cut in percentile[:-1])
    return str(bin_index + 3)
            
def convert_salary_to_category(series):
    """Dummy-encode the lower and upper salary categories of a salary-range series.

    Each element is parsed into a lower and an upper bound, each bound is
    mapped to its category label, and each label series is dummy-encoded with
    the first level dropped. The two dummy frames are returned side by side
    with integer column labels.
    """
    pipelines = (
        (salary_extract_first, salary_category_first),    # first column
        (salary_extract_second, salary_category_second),  # second column
    )
    dummy_frames = []
    for extract, categorize in pipelines:
        labels = series.apply(extract).apply(categorize)
        dummy_frames.append(pd.get_dummies(labels, drop_first=True))

    return pd.concat(dummy_frames, axis=1, ignore_index=True)

In [None]:
# Example:
convert_salary_to_category(dat.salary_range)

# Below is test code (ignore)

In [None]:
test = dat.salary_range.apply(salary_extract_second).apply(salary_category_second)

In [None]:
dummy = pd.get_dummies(test,drop_first=True)

In [None]:
low = dat.salary_range.apply(salary_extract_first)

In [None]:
up = dat.salary_range.apply(salary_extract_second)

In [None]:
print([np.percentile(low[low>=0],i*10) for i in range(1,10)])

In [None]:
print([np.percentile(up[up>=0],i*10) for i in range(1,10)])

In [None]:
max(low)

In [None]:
text = '1---,-asdfasdf109-'
sum([1 for i in text if '-' in i])

In [None]:
re.split('-','35000-40000')

In [None]:
dat.loc[dat.salary_range=='800000000-1200000000',:]

In [None]:
dat.shape[1]

In [None]:
[len(pd.unique(dat.iloc[:,i])) for i in range(dat.shape[1])]

In [None]:
# main parsing function
# dat: the raw pandas dataframe
# stop_words: a set of words to omit when parsing
def parsing(dat, stop_words):
    """Vectorize all text columns together and append NA indicators and numeric columns.

    NOTE(review): this redefines ``parsing`` and therefore shadows the earlier
    definition of the same name once this cell is run.

    Parameters
    ----------
    dat : pandas.DataFrame
        The raw data frame.
    stop_words : set, list, 'english' or None
        Tokens to ignore while vectorizing. A set is converted to a sorted
        list because scikit-learn documents this parameter as a list and newer
        releases reject other containers.

    Returns
    -------
    pandas.DataFrame
        Token counts, then a 0/1 missing-value indicator per column of ``dat``,
        then three of the int64 columns, with integer column labels.
    """
    # CountVectorizer documents stop_words as {'english'}, list or None.
    if isinstance(stop_words, (set, frozenset)):
        stop_words = sorted(stop_words)

    # from_spmatrix yields a 0..n-1 row index; align ``dat`` with it so the
    # column-wise concat below matches rows by position.
    dat = dat.reset_index(drop=True)

    # let character rows be merged into one single text (only for columns of type 'object')
    data = dat.fillna('NaN')
    data_object_colnames = data.columns[data.dtypes == 'object']
    text = data[data_object_colnames].agg(' '.join, axis=1)

    # results for parsing
    vectorizer = CountVectorizer(stop_words=stop_words)
    result = pd.DataFrame.sparse.from_spmatrix(vectorizer.fit_transform(text))

    # merge parsed results, number of NA for each field, numeric columns (less fraud and job id)
    # NOTE(review): the iloc[:, 1:4] slice picks int64 columns by position, so it
    # relies on the column order of ``dat`` — confirm it selects the intended ones.
    cleaned_data = pd.concat([result,
                              dat.isna().astype('int'),
                              dat.loc[:,dat.dtypes == 'int64'].iloc[:,1:4]], axis = 1, ignore_index=True)
    return cleaned_data
    

In [None]:
stop_words = {"a", "the"}
result = parsing(dat, stop_words)

In [None]:
result

# Below is code from the thought process (can ignore)

In [None]:
# takes in a text
# gets rid of punctuation
def ridpunctuation(text):
    """Return ``text`` with every character that is neither a word character nor whitespace removed."""
    punctuation_pattern = re.compile(r'[^\w\s]')
    return punctuation_pattern.sub('', text)

In [None]:
# Method 1
# let character rows be merged into one single text
data = dat.fillna(' NaN ')
data_object_colnames = data.columns[data.dtypes == 'object']
text = data[data_object_colnames[0]]
for col in range(1,len(data_object_colnames)):
    text = text + data[data_object_colnames[col]]

In [None]:
# Method 2
data = dat.fillna('NaN')
data_object_colnames = data.columns[data.dtypes == 'object']
text = data[data_object_colnames].agg(' '.join, axis=1)

In [None]:
text

In [None]:
# results for parsing
vectorizer = CountVectorizer()
result = pd.DataFrame.sparse.from_spmatrix(vectorizer.fit_transform(text))

In [None]:
import numpy as np

In [None]:
# NA info 
cleaned_data = pd.concat([result ,dat.isna().astype('int'),dat.loc[:,dat.dtypes == 'int64'].iloc[:,1:4]],axis = 1,ignore_index=True)

# Below is scratch code (can ignore)

In [None]:
test_series = pd.DataFrame({"a":['hello"s , hello hello,hi you are',"hello hello today's\xa0Document Hello"]})

In [None]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(handle_unknown='ignore')

In [None]:
enc.fit(test_series)

In [None]:
enc.transform(test_series)

In [None]:
# save results of parsing
vectorizer = CountVectorizer()
result = vectorizer.fit_transform(test_series['a'])

In [None]:
pd.DataFrame.sparse.from_spmatrix(result)

In [None]:
vectorizer.vocabulary_

In [None]:
# Token counts for the second column of `data`, displayed as a sparse frame.
# (A stray reference to an undefined name was removed; it raised NameError.)
test = CountVectorizer().fit_transform(data.iloc[:,1])
pd.DataFrame.sparse.from_spmatrix(test)

In [None]:
def parsing_series(series):
    """Vectorize a series of documents into a sparse token-count DataFrame.

    Uses the notebook-level ``stop_words``; a set is converted to a sorted list
    because scikit-learn documents that parameter as a list.

    Parameters
    ----------
    series : iterable of str
        The documents to vectorize.

    Returns
    -------
    pandas.DataFrame
        One row per document and one sparse column per vocabulary token.
    """
    words = sorted(stop_words) if isinstance(stop_words, (set, frozenset)) else stop_words
    sparse_parse = CountVectorizer(stop_words=words).fit_transform(series)
    # The module is imported as ``pd``, and the conversion has to be called and returned.
    return pd.DataFrame.sparse.from_spmatrix(sparse_parse)


In [None]:
data

In [None]:
def parsing(col):
    

In [None]:
# concat na to 
pd.concat([data ,data.isna().astype('int')],axis = 1,ignore_index=True)

In [None]:
pd.concat()