In [9]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import preprocessing
import matplotlib.pyplot as plt

In [10]:
data=pd.read_csv('fake_job_postings.csv',encoding='utf-16')

UnicodeError: UTF-16 stream does not start with BOM

In [None]:
data.head()

In [None]:
data.shape

In [None]:
data.info()

In [None]:
data.isnull().sum()

# Data Pre-processing

In [None]:
# Deal with missing values and drop unneccessary columns

# Location missing values will be assigned none
data['location'] = data.location.fillna('none')

# department missing values will be assigned not specified
data['department'] = data.department.fillna('not specified')

# drop salary range, benefits, telecommuting, has_questions (not compulsory) in the context of Nigeria
data.drop(['salary_range', 'benefits','telecommuting','has_questions'],
             axis=1, inplace=True)

# Company profile missing values will be assigned none
data['company_profile'] = data.company_profile.fillna('none')

# Company profile missing values will be assigned not specified
data['requirements'] = data.requirements.fillna('not specified')

# employment_type missing values will be assigned not specified
data['employment_type'] = data.employment_type.fillna('not specified')

# required_experience missing values will be assigned not specified
data['required_experience'] = data.required_experience.fillna('not specified')

# required_education missing values will be assigned not specified
data['required_education'] = data.required_education.fillna('not specified')

# industry missing values will be assigned not specified
data['industry'] = data.industry.fillna('not specified')

# function missing values will be assigned not specified
data['function'] = data.function.fillna('not specified')

In [None]:
data.isnull().sum()

In [None]:
data.head()

In [None]:
data.columns

In [None]:
print('Data set:')
for col_name in data.columns:
    if data[col_name].dtypes == 'object' :
        unique_cat = len(data[col_name].unique())
        print("Feature '{col_name}' has {unique_cat} categories".format(col_name=col_name, unique_cat=unique_cat))

print()

In [None]:
df = data[['title', 'location','company_profile', 'requirements', 'employment_type',
       'required_experience', 'required_education', 'industry', 'function',
       'fraudulent']]

In [None]:
# Check if there is any null value
df.isna().apply(pd.value_counts)

In [None]:
#Check for number of null values
df.isnull().sum()

In [None]:
df_num = df[['fraudulent']]
df_cat = df[['title', 'location','company_profile', 'requirements','employment_type',
       'required_experience', 'required_education', 'industry', 'function']]

In [None]:
# Checking for Outliers in numerical data
plt.figure(figsize=[16,8])
sns.boxplot(data = df_num)
plt.show()

In [None]:
#Removing Outliers from columns
df_num = df_num[df_num['fraudulent'] < 0.9 ]

In [None]:
df.shape


In [None]:
#fig, axes = plt.subplots(ncols=2, figsize=(17, 5), dpi=100)
#plt.tight_layout()

#df["fraudulent"].value_counts().plot(kind='pie', ax=axes[0], labels=['Real Post (95%)', 'Fake Post (5%)'])
#temp = df["fraudulent"].value_counts()
#sns.barplot(temp.index, temp, ax=axes[1])

#axes[0].set_ylabel(' ')
#axes[1].set_ylabel(' ')
#axes[1].set_xticklabels(["Real Post (17014) [0's]", "Fake Post (866) [1's]"])

#axes[0].set_title('Target Distribution in Dataset', fontsize=13)
#axes[1].set_title('Target Count in Dataset', fontsize=13)

#plt.show()

In [None]:
cat_cols = ["employment_type", "required_experience", "required_education",]
# visualizating catagorical variable by target
import matplotlib.gridspec as gridspec # to do the grid of plots
grid = gridspec.GridSpec(3, 3, wspace=0.5, hspace=0.5) # The grid of chart
plt.figure(figsize=(15,25)) # size of figure

# loop to get column and the count of plots
for n, col in enumerate(df[cat_cols]): 
    ax = plt.subplot(grid[n]) # feeding the figure of grid
    sns.countplot(x=col, data=df, hue='fraudulent', palette='Set2') 
    ax.set_ylabel('Count', fontsize=12) # y axis label
    ax.set_title(f'{col} Distribution by Target', fontsize=15) # title label
    ax.set_xlabel(f'{col} values', fontsize=12) # x axis label
    xlabels = ax.get_xticklabels() 
    ylabels = ax.get_yticklabels() 
    ax.set_xticklabels(xlabels,  fontsize=10)
    ax.set_yticklabels(ylabels,  fontsize=10)
    plt.legend(fontsize=8)
    plt.xticks(rotation=90) 
    total = len(df)
    sizes=[] # Get highest values in y
    for p in ax.patches: # loop to all objects
        height = p.get_height()
        sizes.append(height)
        ax.text(p.get_x()+p.get_width()/2.,
                height + 3,
                '{:1.2f}%'.format(height/total*100),
                ha="center", fontsize=10) 
    ax.set_ylim(0, max(sizes) * 1.15) #set y limit based on highest heights


plt.show()

In [None]:
fig,(ax1,ax2)= plt.subplots(ncols=2, figsize=(17, 5), dpi=100)
length=df[df["fraudulent"]==1]['requirements'].str.len()
ax1.hist(length,bins = 20,color='orangered')
ax1.set_title('Fake Post')
length=df[df["fraudulent"]==0]['requirements'].str.len()
ax2.hist(length, bins = 20)
ax2.set_title('Real Post')
fig.suptitle('Characters in description')
plt.show()

In [None]:
fig,(ax1,ax2)= plt.subplots(ncols=2, figsize=(17, 5), dpi=100)
num=df[df["fraudulent"]==1]['company_profile'].str.split().map(lambda x: len(x))
ax1.hist(num,bins = 20,color='orangered')
ax1.set_title('Fake Post')
num=df[df["fraudulent"]==0]['company_profile'].str.split().map(lambda x: len(x))
ax2.hist(num, bins = 20)
ax2.set_title('Real Post')
fig.suptitle('Words in company profile')
plt.show()

In [None]:
df['fraudulent'].values

In [None]:
fraud = df[df['fraudulent']== 1]
fraud.shape

In [None]:
not_fraud = df[df['fraudulent']== 0]
not_fraud.shape

In [None]:
fraud = fraud.sample(1403, replace=True)
fraud.shape, not_fraud.shape

In [None]:
df = fraud.append(not_fraud)
df.reset_index()

In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['title'] = le.fit_transform(df['title'])
df['location'] = le.fit_transform(df['location'])
df['company_profile'] = le.fit_transform(df['company_profile'])
df['requirements'] = le.fit_transform(df['requirements'])
df['employment_type'] = le.fit_transform(df['employment_type'])
df['required_experience'] = le.fit_transform(df['required_experience'])
df['required_education'] = le.fit_transform(df['required_education'])
df['industry'] = le.fit_transform(df['industry'])
df['function'] = le.fit_transform(df['function'])

In [None]:
df = df.reset_index()
df.head()

In [None]:
from sklearn.model_selection import train_test_split

X = df[['index', 'title', 'location', 'company_profile', 'requirements',
       'employment_type','required_experience', 'required_education', 'industry', 'function']].values
Y = df[['fraudulent']].values


In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y)

In [None]:
X_train.shape, X_test.shape, Y_train.shape, Y_test.shape

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
import sklearn.metrics as metrics
from sklearn.metrics import accuracy_score

In [None]:
import warnings
warnings.filterwarnings('ignore')


In [None]:
### Logistic Regression

clf=LogisticRegression()
clf.fit(X_train, Y_train)
preds=clf.predict(X_test)
print('accuracy with Logistic Regression:',accuracy_score(Y_test, preds), '%')

### Random Forest

clf=RandomForestClassifier()
clf.fit(X_train, Y_train)
preds=clf.predict(X_test)
print('accuracy with Random Forest:',accuracy_score(Y_test, preds), '%')

### Support Vector Machine

clf=SVC()
clf.fit(X_train, Y_train)
preds=clf.predict(X_test)
print('accuracy with Support Vector Machine:',accuracy_score(Y_test, preds), '%')

### Decision Tree

clf=DecisionTreeClassifier()
clf.fit(X_train, Y_train)
preds=clf.predict(X_test)
print('accuracy with Decision Tree:',accuracy_score(Y_test, preds), '%')

### K-Nearest Neighbors

clf=KNeighborsClassifier()
clf.fit(X_train, Y_train)
preds=clf.predict(X_test)
print('accuracy with K-Nearest Neighbors :',accuracy_score(Y_test, preds), '%')

### Naive Bayes

clf=GaussianNB()
clf.fit(X_train, Y_train)
preds=clf.predict(X_test)
print('accuracy with Naive Bayes:',accuracy_score(Y_test, preds), '%')


In [None]:
test_vector = np.reshape(np.asarray([17614,5362,1393,1669,11417,1,7,13,75,37]),(1,10))
p = int(clf.predict(test_vector)[0])

if p==0:
    print('Job profile is Real')
else:
    print('Job profile is fake')
