In [1]:
import tensorflow as tf
import pandas as pd
import spacy
import re
import os
import numpy as np

from spacy.lang.en import English
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences




# Read the data

In [2]:
curr_path = os.getcwd()
file_path = curr_path + '/dataset/fake_job_postings.csv'
df = pd.read_csv(file_path)

Since this dataset is pure text dataset, so its totally fine to fill up the null value with ' ' space.

In [3]:
df.fillna(' ',inplace=True)

Then we add on every text columns together.

In [4]:
df['text']=df['title']+" " + df['department'] + " " + df['company_profile'] + " " + df['description'] + " " + df['requirements'] + " " + df['benefits'] + " " 

Preview the dataset

In [5]:
df.head()

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent,text
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0,"Marketing Intern Marketing We're Food52, and w..."
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0,Customer Service - Cloud Video Production Succ...
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0,Commissioning Machinery Assistant (CMA) Valo...
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0,Account Executive - Washington DC Sales Our pa...
4,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0,Bill Review Manager SpotSource Solutions LLC...


In [6]:
#Delete all the un-necessary Columns
delete_list=['job_id','title','location','telecommuting','has_company_logo','has_questions','department','salary_range','company_profile','description','requirements','benefits','employment_type','required_experience','required_education','industry','function']

for val in delete_list:
    del df[val]
df.head()

Unnamed: 0,fraudulent,text
0,0,"Marketing Intern Marketing We're Food52, and w..."
1,0,Customer Service - Cloud Video Production Succ...
2,0,Commissioning Machinery Assistant (CMA) Valo...
3,0,Account Executive - Washington DC Sales Our pa...
4,0,Bill Review Manager SpotSource Solutions LLC...


In [7]:
df_1 = df.copy()
df_1.head()

Unnamed: 0,fraudulent,text
0,0,"Marketing Intern Marketing We're Food52, and w..."
1,0,Customer Service - Cloud Video Production Succ...
2,0,Commissioning Machinery Assistant (CMA) Valo...
3,0,Account Executive - Washington DC Sales Our pa...
4,0,Bill Review Manager SpotSource Solutions LLC...


# Data Cleaning

1. Data cleanups
2. LEMMANIZATION
3. TOKENIZATION AND PADDING

This code block is designed to preprocess text data in the 'text' column of a DataFrame by removing special characters, converting to lowercase, eliminating unnecessary whitespaces, and removing stopwords. The re module is used for regular expressions, and spaCy is used for natural language processing, specifically for stopword removal.

1. Removing Special Characters and Whitespaces

2. Removing Unwanted Texts and Special Characters

3. Converting Text to Lowercase

4. Removing Unnecessary Whitespaces

5. Removing Stop Words

In [8]:
#Data Cleanups

df['text']=df['text'].str.replace('\n','')
df['text']=df['text'].str.replace('\r','')
df['text']=df['text'].str.replace('\t','')
  
  #This removes unwanted texts
df['text'] = df['text'].apply(lambda x: re.sub(r'[0-9]','',x))
df['text'] = df['text'].apply(lambda x: re.sub(r'[/(){}\[\]\|@,;.:-]',' ',x))
  
  #Converting all upper case to lower case
df['text']= df['text'].apply(lambda s:s.lower() if type(s) == str else s)
  
  #Remove un necessary white space
df['text']=df['text'].str.replace('  ',' ')

  #Remove Stop words
nlp=spacy.load("en_core_web_sm") # python -m spacy download en_core_web_sm 
# These lines use the spaCy library to load the 
# English language model (en_core_web_sm). It then iterates through the 
# words in each 'text' column, 
# removing stopwords 
# (common words that often don't carry much meaning) 
# using nlp.vocab[word].is_stop.

df['text'] =df['text'].apply(lambda x: ' '.join([word for word in x.split() if nlp.vocab[word].is_stop==False ]))

LEMMANIZATION

This code block performs lemmatization on the text data in the 'text' column using spaCy. Lemmatization helps reduce words to their base or root form, making it easier to analyze and understand the underlying meaning of the text. The lemmatized sentences are then stored in a new column named 'processed' in the same DataFrame.

In [9]:
sp = spacy.load('en_core_web_sm')
output=[]

for sentence in df['text']:
    sentence=sp(str(sentence))
    s=[token.lemma_ for token in sentence]
    output.append(' '.join(s))
df['processed']=pd.Series(output)

In [10]:
df.head()

Unnamed: 0,fraudulent,text,processed
0,0,marketing intern marketing we're food we've cr...,market intern marketing we be food we have cre...
1,0,customer service cloud video production succes...,customer service cloud video production succes...
2,0,commissioning machinery assistant cma valor se...,commission machinery assistant cma valor servi...
3,0,account executive washington dc sales passion ...,account executive washington dc sale passion i...
4,0,bill review manager spotsource solutions llc g...,bill review manager spotsource solution llc gl...


TOKENIZATION AND PADDING

This code block sets up parameters for tokenization, initializes a Tokenizer, fits it on the processed text data, and then retrieves the word indices using the word_index attribute of the tokenizer. This process is a crucial step in preparing text data for training neural networks, particularly for natural language processing tasks like sentiment analysis or text classification.

In [11]:
vocab_size = 100000
embedding_dim = 64
max_length = 250
trunc_type='post'
padding_type='post'
oov_tok = "<OOV>"
training_size = 20000
#Tokenization

tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(df['processed'].values)
word_index = tokenizer.word_index
print(len(word_index))

96956


Prepare for the model building

In [12]:
X = tokenizer.texts_to_sequences(df['processed'].values)    # Tokenize the dataset
X = pad_sequences(X, maxlen=max_length)   # Padding the dataset
Y = df['fraudulent']       #Assign the value of y  
print(Y.shape)

(17880,)


### Save the word_index & tokenizer

In [13]:
import pickle

# 保存 Tokenizer
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

# 保存 word_index
with open('word_index.pickle', 'wb') as handle:
    pickle.dump(word_index, handle, protocol=pickle.HIGHEST_PROTOCOL)


Split the dataset for the model building

In [14]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size = 0.20,random_state=41)

In [15]:
def read_tokenizer_wordIndex():
    with open('tokenizer.pickle', 'rb') as handle:
        loaded_tokenizer = pickle.load(handle)
    # 加载 word_index
    with open('word_index.pickle', 'rb') as handle:
        loaded_word_index = pickle.load(handle)
        
    return loaded_tokenizer, loaded_word_index

In [16]:
loaded_tokenizer, loaded_word_index = read_tokenizer_wordIndex()

## Model

random forest

In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [43]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)

Accuracy: 0.9686800894854586
Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.98      3414
           1       1.00      0.31      0.47       162

    accuracy                           0.97      3576
   macro avg       0.98      0.65      0.73      3576
weighted avg       0.97      0.97      0.96      3576



To save the random forest model

In [19]:
with open('random_forest.pickle', 'wb') as handle:
    pickle.dump(rf_model, handle, protocol=pickle.HIGHEST_PROTOCOL)

Logistic Regression

In [20]:
from sklearn.linear_model import LogisticRegression

In [21]:
lr_model = LogisticRegression()

lr_model.fit(X_train, y_train)

lr_predict = lr_model.predict(X_test)

# 评估模型性能
accuracy = accuracy_score(y_test, lr_predict)
report = classification_report(y_test, lr_predict)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)

Accuracy: 0.9527404921700223
Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98      3414
           1       0.18      0.01      0.02       162

    accuracy                           0.95      3576
   macro avg       0.57      0.50      0.50      3576
weighted avg       0.92      0.95      0.93      3576



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


To save logistic regression model

In [22]:
with open('logistic_regression.pickle', 'wb') as handle:
    pickle.dump(lr_model, handle, protocol=pickle.HIGHEST_PROTOCOL)

XGboost

In [23]:
import xgboost as xgb

In [24]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

In [73]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# 创建XGBoost模型
xgb_model = xgb.XGBClassifier(
    max_depth=5,
    learning_rate=0.3,
    objective='binary:logistic',
    n_estimators=100
)

# 训练XGBoost模型
xgb_model.fit(X_train, y_train)

# 在测试集上进行预测
xgb_pred = xgb_model.predict(X_test)

accuracy = accuracy_score(y_test, xgb_pred)
print(f"Accuracy: {accuracy}")

# 输出分类报告
print("\nClassification Report:")
print(classification_report(y_test, xgb_pred))

Accuracy: 0.9720357941834452

Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.99      3414
           1       0.96      0.40      0.57       162

    accuracy                           0.97      3576
   macro avg       0.96      0.70      0.78      3576
weighted avg       0.97      0.97      0.97      3576



To save the XGboost model

In [26]:
with open('xgboost.pickle', 'wb') as handle:
    pickle.dump(xgb_model, handle, protocol=pickle.HIGHEST_PROTOCOL)

Ensemble Learning

In [27]:
from sklearn.ensemble import VotingClassifier

In [28]:
ensemble_model = VotingClassifier(
    estimators=[('logistic', lr_model), ('xgboost', xgb_model)],
    voting='hard'  # 使用软投票，即综合概率
)

In [74]:
ensemble_model = VotingClassifier(
    estimators=[('1', xgb_model), ('2', rf_model)],
    voting='soft'  # 使用软投票，即综合概率
)

In [75]:
ensemble_model.fit(X_train, y_train)

# 在测试集上进行预测
ensemble_pred = ensemble_model.predict(X_test)

# 计算准确率
accuracy = accuracy_score(y_test, ensemble_pred)
print(f"Accuracy: {accuracy}")

# 输出分类报告
print("\nClassification Report:")
print(classification_report(y_test, ensemble_pred))

Accuracy: 0.9711968680089486

Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.99      3414
           1       1.00      0.36      0.53       162

    accuracy                           0.97      3576
   macro avg       0.99      0.68      0.76      3576
weighted avg       0.97      0.97      0.96      3576



To save the ensemble model

In [76]:
with open('ensemble_model.pickle', 'wb') as handle:
    pickle.dump(ensemble_model, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Try to use own data to predict

In [32]:
#predict_data = ['data science, bachelor degree, 1-3 years work experience']
#predict_data = ["Marketing Intern Marketing We're Food52, and we've created a groundbreaking and award-winning cooking site. We support, connect, and celebrate home cooks, and give them everything they need in one place.We have a top editorial, business, and engineering team. We're focused on using technology to find new and better ways to connect people around their specific food interests, and to offer them superb, highly curated information about food and cooking. We attract the most talented home cooks and contributors in the country; we also publish well-known professionals like Mario Batali, Gwyneth Paltrow, and Danny Meyer. And we have partnerships with Whole Foods Market and Random House.Food52 has been named the best food website by the James Beard Foundation and IACP, and has been featured in the New York Times, NPR, Pando Daily, TechCrunch, and on the Today Show.We're located in Chelsea, in New York City. Food52, a fast-growing, James Beard Award-winning online food community and crowd-sourced and curated recipe hub, is currently interviewing full- and part-time unpaid interns to work in a small team of editors, executives, and developers in its New York City headquarters.Reproducing and/or repackaging existing Food52 content for a number of partner sites, such as Huffington Post, Yahoo, Buzzfeed, and more in their various content management systemsResearching blogs and websites for the Provisions by Food52 Affiliate ProgramAssisting in day-to-day affiliate program support, such as screening affiliates and assisting in any affiliate inquiriesSupporting with PR &amp; Events when neededHelping with office administrative work, such as filing, mailing, and preparing for meetingsWorking with developers to document bugs and suggest improvements to the siteSupporting the marketing and executive staff Experience with content management systems a major plus (any blogging counts!)Familiar with the Food52 editorial voice and aestheticLoves food, appreciates the importance of home cooking and cooking with the seasonsMeticulous editor, perfectionist, obsessive attention to detail, maddened by typos and broken links, delighted by finding and fixing themCheerful under pressureExcellent communication skillsA+ multi-tasker and juggler of responsibilities big and smallInterested in and engaged with social media like Twitter, Facebook, and PinterestLoves problem-solving and collaborating to drive Food52 forwardThinks big picture but pitches in on the nitty gritty of running a small company (dishes, shopping, administrative support)Comfortable with the realities of working for a startup: being on call on evenings and weekends, and working long hours"]
predict_data = ["asdwuqhnodiqhishocihiowhoin>?wdqd qwdqwdqdqwdq >"] 

df_predict = pd.DataFrame({'text' : predict_data})
df_predict.head()

Unnamed: 0,text
0,asdwuqhnodiqhishocihiowhoin>?wdqd qwdqwdqdqwdq >


In [33]:
def data_cleanups(df: pd.DataFrame):
    #Data Cleanups

    df['text']=df['text'].str.replace('\n','')
    df['text']=df['text'].str.replace('\r','')
    df['text']=df['text'].str.replace('\t','')
  
  #This removes unwanted texts
    df['text'] = df['text'].apply(lambda x: re.sub(r'[0-9]','',x))
    df['text'] = df['text'].apply(lambda x: re.sub(r'[/(){}\[\]\|@,;.:-]',' ',x))
  
  #Converting all upper case to lower case
    df['text']= df['text'].apply(lambda s:s.lower() if type(s) == str else s)
  
  #Remove un necessary white space
    df['text']=df['text'].str.replace('  ',' ')

  #Remove Stop words
    nlp=spacy.load("en_core_web_sm") # python -m spacy download en_core_web_sm 
    df['text'] =df['text'].apply(lambda x: ' '.join([word for word in x.split() if nlp.vocab[word].is_stop==False ]))
    
    sp = spacy.load('en_core_web_sm')
    output=[]

    for sentence in df['text']:
        sentence=sp(str(sentence))
        s=[token.lemma_ for token in sentence]
        output.append(' '.join(s))
    df['processed']=pd.Series(output)
    
    return df

In [34]:
to_predict = data_cleanups(df_predict)
to_predict

Unnamed: 0,text,processed
0,asdwuqhnodiqhishocihiowhoin>?wdqd qwdqwdqdqwdq >,asdwuqhnodiqhishocihiowhoin>?wdqd qwdqwdqdqwdq >


In [35]:
values_to_predict = tokenizer.texts_to_sequences(to_predict['processed'].values)    # Tokenize the dataset
values_to_predict = pad_sequences(values_to_predict, maxlen=max_length)  
values_to_predict

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0]])

In [36]:
values_to_predict_loaded = loaded_tokenizer.texts_to_sequences(to_predict['processed'].values)    # Tokenize the dataset
values_to_predict_loaded = pad_sequences(values_to_predict_loaded, maxlen=max_length)  
values_to_predict_loaded

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0]])

In [37]:
prediction = rf_model.predict(values_to_predict)
prediction

array([1], dtype=int64)

In [38]:
prediction_loaded = rf_model.predict(values_to_predict_loaded)
prediction_loaded

array([1], dtype=int64)

-------------------------------------------------------------------