In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import classification_report
from sklearn.utils import shuffle
import pandas as pd
import numpy as np
import nltk
import re

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/artjom/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
def merge(df):
    df = df.assign(merged_features=lambda df: 
                   df["title"]+" "
                   +df["location"]+" "
                   +df["department"]+" "
                   +df["employment_type"]+" "
                   +df["required_experience"]+" "
                   +df["required_education"]+" "
                   +df["industry"]+" "
                   +df["function"]+" "
                   +df["description"]+" "
                   +df["requirements"]+" "
                   +df["benefits"]
                   )
    return df[["merged_features","fraudulent"]]

In [9]:
df_train = pd.read_csv("data/train.csv")
df_train = df_train.fillna("none")

df_test = pd.read_csv("data/dev.csv")
df_test = df_test.fillna("none")

df_aug = pd.read_csv("augmented.csv", sep='\t')
df_aug['fraudulent'] = np.ones(len(df_aug), dtype=int)
df_aug = df_aug.replace(" ",np.nan)
df_aug = df_aug.fillna("none")

df = pd.concat([df_aug, df_train])
df = shuffle(df)
df = df.reset_index()

df_merged = merge(df)
df_test_merged = merge(df_test)

X_train = np.array(df_merged["merged_features"])
y_train = np.array(df_merged["fraudulent"])

X_test = np.array(df_test_merged["merged_features"])
y_test = np.array(df_test_merged["fraudulent"])

stopwords = nltk.corpus.stopwords.words('english')
lemmatizer = WordNetLemmatizer()

In [10]:
# Params description
# data - array of sentences
def preprocessing(data: list):
    output = []
    for sentence in data:
        review = re.sub('[^a-zA-Z]', ' ', sentence)
        review = review.lower()
        review = review.split()
        review = [lemmatizer.lemmatize(word) for word in review if not word in set(stopwords)]
        review = ' '.join(review)
        output.append(review)
    return output

X_train = preprocessing(X_train)

#### TF-IDF Vectorizer

In [11]:
tf_idf = TfidfVectorizer()
X_train_tf = tf_idf.fit_transform(X_train)
X_test_tf = tf_idf.transform(X_test)

#### Naive Bayes Classifier

In [12]:
naive_bayes_classifier = MultinomialNB()
naive_bayes_classifier.fit(X_train_tf, y_train)
y_pred = naive_bayes_classifier.predict(X_test_tf)

In [13]:
print(classification_report(y_test, y_pred, target_names=['Not fraudulent', 'Fraudulent']))

                precision    recall  f1-score   support

Not fraudulent       0.97      1.00      0.99      1720
    Fraudulent       0.91      0.29      0.44        68

      accuracy                           0.97      1788
     macro avg       0.94      0.65      0.72      1788
  weighted avg       0.97      0.97      0.97      1788

