In [None]:
import pandas as pd
import numpy as np
import gensim
import gensim.downloader

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
 
import string

from keras.layers import LSTM, Activation, Dropout, Dense, Input, Bidirectional
from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.models import Model
from keras.callbacks import ModelCheckpoint

from keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

from tqdm import tqdm


In [None]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Data exploration

In [None]:
df = pd.read_csv("train.csv")
df = df.drop(["Unnamed: 0"],axis=1)

df_val = pd.read_csv("dev.csv")
df_val = df_val.drop(["Unnamed: 0"],axis=1)
df.head()

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,11284,Maintenance Mechanic,"US, NY, Niagara Falls",Maintenance,,"Niacet is a leading producer of organic salts,...",DEPARTMENT: MaintenanceREPORTS TO: ...,,,0,1,0,Full-time,Mid-Senior level,,Chemicals,Manufacturing,0
1,11511,LS17 6DJ Customer Service Apprenticeship Avail...,"GB, , Leeds",,,Established on the principles that full time e...,This is fantastic opportunity for someone want...,Government funding is only available for 16-18...,Future prospects,0,1,1,,,,,,0
2,5663,Technical Support Representative,"US, OH, Westerville",,,Working for Status Solutions means that you ar...,A Customer Support Representative at Status So...,"TasksMonitor the Network Operations Center, ca...",Starting salary based on experience and educat...,0,1,1,Full-time,Not Applicable,Unspecified,Computer Software,Customer Service,0
3,16816,Ruby on Rails Specialist,"US, CA, Long Beach",,,,Ruby on Rails Web Engineer (RoR)Now Hiring Rub...,,,0,0,0,Full-time,Mid-Senior level,,Information Technology and Services,,0
4,3733,Caregiver - Bridgman,"US, MI, Bridgman",,,"""Our mission to our clients is to preserve the...",Home Sweet Home In-Home Care is one of the fas...,,Competitive compensation with performance revi...,0,1,1,,,High School or equivalent,Hospital & Health Care,Health Care Provider,0


#### Missing values

In [None]:
df_fake = df.loc[df["fraudulent"]==1]
df_true = df.loc[df["fraudulent"]==0]
df_fake

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
52,6528,Client Audit Coordinator,"US, TX, Austin",COMPLIANCE UNIT 701,26000-30000,The Accion story began more than 100 years ag...,Description SUMMARY: The Client Audit Coordina...,High school diploma or equivalent; Bachelor's ...,We offer a competitive and comprehensive range...,0,1,1,Full-time,Associate,High School or equivalent,Financial Services,Customer Service,1
75,9842,Client Services Manager - SM1,"US, CA, San Mateo",Client Services,,#URL_ddb080358fa5eecf5a67c649cfb4ffc343c484389...,"Client Services Manager - SM1San Mateo, CA#URL...",Basic Qualifications:Proven experience in mana...,Our core values drive our culture. This is wha...,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Marketing and Advertising,Information Technology,1
77,17806,Receptionist,"GB, ,",,,,We the Montcalm Hotel are looking for the posi...,Good Requirement,Package: Basic salary ranges from GBP2500 - 60...,0,0,0,Full-time,Entry level,,Hospitality,,1
84,9258,CAD Operator,"US, TX, Houston",Engineering,,,Corporate overviewAker Solutions is a global p...,Qualifications &amp; personal attributes Educa...,We offer• Friendly colleagues in an industry w...,0,0,0,,,,Construction,Design,1
96,5784,Position Admin Assistant,"AU, NSW, Sydney",,22000-28000,,We require an experienced Admin Assistant with...,• Excellent time management &amp; superb prese...,Part Time – 15 to 20 hours per week – you choo...,0,0,1,Part-time,Internship,Unspecified,Executive Office,Administrative,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14204,5751,"Agency Sales Managers $150-$175,000/yr","US, NY, Elmira",,,We have aggressive growth plans in place for t...,We provide No Credit Check Financing to custom...,Experience and Skills Required- Minimum of 3 ...,We Provide- Complete online training and offic...,0,1,0,Full-time,,High School or equivalent,Financial Services,Sales,1
14228,17677,Director of Engineering: Applications,"US, ,",Engineering,,...,Director of...,What you bring to the table:BS / MS in compute...,"What We Offer: Competitive salary, excellent b...",0,1,1,Full-time,Director,Master's Degree,Ranching,Engineering,1
14253,10483,Senior System Test Engineer,"US, CA, San Jose",Engineering,,Aptitude Staffing Solutions has redesigned the...,"Senior System Test Engineer | San Jose, CAAs a...",Skills and RequirementsBS in Computer Science ...,"Broad responsibility, autonomy and visibility ...",0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Computer Networking,Engineering,1
14254,14129,Adminstrative/Data Entry,"US, IL, Chicago",,63000-63000,,We are a network of national standards bodies....,"No requirements, Apply if you are in the force...","The job includes benefits such as health, dent...",0,0,0,Full-time,Entry level,Unspecified,Human Resources,Human Resources,1


In [None]:
def getNaNRatio(df):
    rows = len(df)
    for column in df.columns:
        print(f"{column}: {df[column].isna().sum()/rows}")

In [None]:
getNaNRatio(df_fake)

job_id: 0.0
title: 0.0
location: 0.01925722145804677
department: 0.6107290233837689
salary_range: 0.7359009628610729
company_profile: 0.6685006877579092
description: 0.001375515818431912
requirements: 0.16506189821182943
benefits: 0.40440165061898214
telecommuting: 0.0
has_company_logo: 0.0
has_questions: 0.0
employment_type: 0.28060522696011003
required_experience: 0.49105914718019256
required_education: 0.515818431911967
industry: 0.3177441540577717
function: 0.38514442916093533
fraudulent: 0.0


In [None]:
getNaNRatio(df_true)

job_id: 0.0
title: 0.0
location: 0.01922368711792001
department: 0.6455770788834058
salary_range: 0.8439272298740517
company_profile: 0.16144951020107534
description: 0.0
requirements: 0.15150622376077189
benefits: 0.4015614642409958
telecommuting: 0.0
has_company_logo: 0.0
has_questions: 0.0
employment_type: 0.19179494733740884
required_experience: 0.3928702953524343
required_education: 0.45223539809972746
industry: 0.2739191279369522
function: 0.36083081682256757
fraudulent: 0.0


#### Correlation (features with binary value)

In [None]:
# Correlation
df[["telecommuting","fraudulent"]].corr()

Unnamed: 0,telecommuting,fraudulent
telecommuting,1.0,0.03296
fraudulent,0.03296,1.0


In [None]:
df[["has_company_logo","fraudulent"]].corr()

Unnamed: 0,has_company_logo,fraudulent
has_company_logo,1.0,-0.264599
fraudulent,-0.264599,1.0


-0.26 is significant correlation for keeping this feature

In [None]:
df[["has_questions","fraudulent"]].corr()

Unnamed: 0,has_questions,fraudulent
has_questions,1.0,-0.092613
fraudulent,-0.092613,1.0


## Preprocessing

In [None]:
# Lowercase, removing stop words, punctuation, numbers, stemming

# Input -> array of docs
def preprocess_texts(texts):
    preprocessed = []
    stop_words = stopwords.words('english')
    punct=string.punctuation
    # stemmer=PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    
    print("Preprocessing..")
    for text in tqdm(texts):
        words = word_tokenize(text)
        words = [''.join([i for i in word if not i.isdigit()]) for word in words] # Removing numbers
        # All in one :)
        words = [lemmatizer.lemmatize(word.lower()) for word in words if (word not in stop_words) and (word not in punct)]
        preprocessed.append(words)
        
    print("Done")
    return preprocessed
        

## Fill nan

In [None]:
df = df.fillna("none")
df_val = df_val.fillna("none")

## Small features embeddings

In [55]:
def merge(df):
    # df = df.assign(small_features=lambda df: df["title"]+" "+df["location"]+" "+df["department"]+
    #           df["employment_type"]+" "+df["required_experience"]+" "+df["required_education"]+" "+
    #            df["industry"]+" "+df["function"])
    
    # df = df.assign(small_features=lambda df: df["description"])
    
    df = df.assign(small_features=lambda df: df["title"]+" "+df["location"]+" "+df["department"]+
              df["employment_type"]+" "+df["required_experience"]+" "+df["required_education"]+" "+
               df["industry"]+" "+df["function"]+" "+df["description"])
    

    return df[["small_features","fraudulent"]]

In [56]:
df_small = merge(df)
df_val_small = merge(df_val)
df_small.head()

Unnamed: 0,small_features,fraudulent
0,"Maintenance Mechanic US, NY, Niagara Falls Mai...",0
1,LS17 6DJ Customer Service Apprenticeship Avail...,0
2,"Technical Support Representative US, OH, Weste...",0
3,"Ruby on Rails Specialist US, CA, Long Beach no...",0
4,"Caregiver - Bridgman US, MI, Bridgman nonenon...",0


In [57]:
features_train = np.array(df_small["small_features"])
target_train = np.array(df_small["fraudulent"])

features_val = np.array(df_val_small["small_features"])
target_val = np.array(df_val_small["fraudulent"])

In [58]:
preprocessed_train = preprocess_texts(features_train)
preprocessed_val = preprocess_texts(features_val)
#print(preprocessed[:3])

Preprocessing..


100%|██████████| 14304/14304 [00:33<00:00, 422.10it/s]


Done
Preprocessing..


100%|██████████| 1788/1788 [00:04<00:00, 408.86it/s]

Done





#### GloVe

In [None]:
# Download
glove_vectors = gensim.downloader.load('glove-wiki-gigaword-50') # 50 dimensions embeddings
embedding_dim = 50



In [None]:
# Example

print(glove_vectors["cat"])
glove_vectors.most_similar("cat",topn=10)

[ 0.45281  -0.50108  -0.53714  -0.015697  0.22191   0.54602  -0.67301
 -0.6891    0.63493  -0.19726   0.33685   0.7735    0.90094   0.38488
  0.38367   0.2657   -0.08057   0.61089  -1.2894   -0.22313  -0.61578
  0.21697   0.35614   0.44499   0.60885  -1.1633   -1.1579    0.36118
  0.10466  -0.78325   1.4352    0.18629  -0.26112   0.83275  -0.23123
  0.32481   0.14485  -0.44552   0.33497  -0.95946  -0.097479  0.48138
 -0.43352   0.69455   0.91043  -0.28173   0.41637  -1.2609    0.71278
  0.23782 ]


[('dog', 0.9218005537986755),
 ('rabbit', 0.8487821221351624),
 ('monkey', 0.804108202457428),
 ('rat', 0.7891963124275208),
 ('cats', 0.7865270376205444),
 ('snake', 0.7798910737037659),
 ('dogs', 0.7795814871788025),
 ('pet', 0.7792249917984009),
 ('mouse', 0.773166835308075),
 ('bite', 0.7728800177574158)]

In [59]:
max_input_length = max([len(doc) for doc in preprocessed_train]) 
print(max_input_length)

811


In [60]:
def texts2embeddings(texts,max_length,em_dim):
    zero_vector = np.zeros(em_dim)
    embeddings = []
    total_words = 0
    missing_words = 0
    for text in tqdm(texts):
        embedded_text = []
        for word in text:
            total_words += 1
            try:
                embedded_text.append(glove_vectors[word])
            except:
                embedded_text.append(zero_vector)
                missing_words += 1
        embeddings.append(embedded_text)
    embeddings = pad_sequences(embeddings, maxlen=max_length, padding='post',value=zero_vector,dtype="float32")
    print("Missing words:",missing_words/total_words)
    return embeddings

In [61]:
embedded_features_train = texts2embeddings(preprocessed_train,max_input_length,embedding_dim)
embedded_features_val = texts2embeddings(preprocessed_val,max_input_length,embedding_dim)

100%|██████████| 14304/14304 [00:02<00:00, 5609.76it/s]


Missing words: 0.08806345082975225


100%|██████████| 1788/1788 [00:00<00:00, 4530.07it/s]


Missing words: 0.08926859527434726


In [None]:
# with open('embedded_features_train.npy', 'wb') as f:
#     np.save(f, embedded_features_train)
# with open('embedded_features_val.npy', 'wb') as f:
#     np.save(f, embedded_features_val)

In [None]:
# with open('embedded_features_train.npy', 'rb') as f:
#     embedded_features_train = np.load(f)
# with open('embedded_features_val.npy', 'rb') as f:
#     embedded_features_val = np.load(f)

In [62]:
from functools import lru_cache
# Building model

checkpoint_filepath = './checkpoint'
model_checkpoint_callback = ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True)


model = Sequential()
model.add(Bidirectional(LSTM(100, input_shape=(max_input_length, embedding_dim))))
model.add(Dropout(0.2)) # Regularization
model.add(Dense(1,activation="sigmoid")) 
model.compile(loss='binary_crossentropy', optimizer='adam',metrics=["accuracy"])
history = model.fit(embedded_features_train, target_train, epochs=30, batch_size=10, verbose=2,
                    validation_data=(embedded_features_val,target_val), callbacks=[model_checkpoint_callback])


# The model weights (that are considered the best) are loaded into the model.
model.load_weights(checkpoint_filepath)

Epoch 1/30
1431/1431 - 140s - loss: 0.1713 - accuracy: 0.9514 - val_loss: 0.1158 - val_accuracy: 0.9636 - 140s/epoch - 98ms/step
Epoch 2/30
1431/1431 - 83s - loss: 0.1405 - accuracy: 0.9555 - val_loss: 0.1060 - val_accuracy: 0.9698 - 83s/epoch - 58ms/step
Epoch 3/30
1431/1431 - 83s - loss: 0.1095 - accuracy: 0.9664 - val_loss: 0.0915 - val_accuracy: 0.9709 - 83s/epoch - 58ms/step
Epoch 4/30
1431/1431 - 83s - loss: 0.0872 - accuracy: 0.9734 - val_loss: 0.0837 - val_accuracy: 0.9760 - 83s/epoch - 58ms/step
Epoch 5/30
1431/1431 - 83s - loss: 0.0721 - accuracy: 0.9776 - val_loss: 0.0758 - val_accuracy: 0.9782 - 83s/epoch - 58ms/step
Epoch 6/30
1431/1431 - 84s - loss: 0.0615 - accuracy: 0.9804 - val_loss: 0.0700 - val_accuracy: 0.9782 - 84s/epoch - 58ms/step
Epoch 7/30
1431/1431 - 84s - loss: 0.0473 - accuracy: 0.9860 - val_loss: 0.0658 - val_accuracy: 0.9793 - 84s/epoch - 58ms/step
Epoch 8/30
1431/1431 - 83s - loss: 0.0381 - accuracy: 0.9874 - val_loss: 0.0652 - val_accuracy: 0.9787 - 83s/

KeyboardInterrupt: ignored

In [63]:
print("Baseline:",(1-sum(target_val)/len(target_val)))

Baseline: 0.9619686800894854


In [64]:
predicted = model.predict(embedded_features_val)

# As labels
treshold = 0.5
predicted_labels = []
for p in predicted:
    if p >= treshold:
        predicted_labels.append([1])
    else:
        predicted_labels.append([0])

print("F1:")
print("Micro:",f1_score(target_val,predicted_labels,average="micro"))
print("Macro:",f1_score(target_val,predicted_labels,average="macro"))
print("Weighted:",f1_score(target_val,predicted_labels,average="weighted"))
print("For fraudulent:",f1_score(target_val,predicted_labels,average="binary"))

F1:
Micro: 0.9804250559284117
Macro: 0.8592642068262932
Weighted: 0.9799138287170104
For fraudulent: 0.7286821705426356


Seems gooood!=)

In [65]:
df_val['fraudulent'].value_counts(), sum(predicted > 0.5), sum(predicted <= 0.5)

(0    1720
 1      68
 Name: fraudulent, dtype: int64, array([61]), array([1727]))