# Indeed dataset

Context

- The original dataset was created by PromptCloud and DataStock. It contains around 30K records.
- The dataset can be downloaded from DataStock (https://app.datastock.shop/?site_name=Indeed%20Job%20Posting) and should be saved in the `Data` folder.

### Libraries

In [None]:
#Exploratory 
import pandas as pd
import numpy as np
import missingno


#Data Visualization 
import seaborn as sns
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from IPython.display import Image

#Data Modeling
import nltk
from nltk.corpus import gutenberg, stopwords
from nltk import FreqDist
from nltk import word_tokenize
import string
import re

#Data Evaluation 

#Spacy
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

#Ignore any warnings
import warnings;
warnings.filterwarnings('ignore')

#Mellissas utils
from utils import *


## Business Problem

In [None]:
# Render the image stored at ../Images/Image4.png in the cell output.
Image('../Images/Image4.png')

From : https://www.fox2detroit.com/news/indeed-com-scam-steals-personal-info-from-woman-with-fake-job-posting

## Dataset

In [None]:
# Load the raw CSV export from the Data folder into a DataFrame.
df = pd.read_csv('../Data/20191031__30k_data.csv')

# Preview the first rows to get a feel for the columns.
df.head()

In [None]:
# Column names, dtypes and non-null counts of the raw data.
df.info()

### Drop all columns with 0 non-null values, since these won't provide any information

In [None]:
# Discard every column that contains nothing but missing values;
# dropna returns a new frame, which replaces df.
df = df.dropna(axis='columns', how='all')
df.head(2)

In [None]:
# Re-check the column summary after dropping the all-null columns.
df.info()

In [None]:
# Count missing (True) vs. present (False) values for every column.
# Uses Series.value_counts because the top-level pd.value_counts helper
# is deprecated in recent pandas releases; the resulting table is the same.
df.isna().apply(pd.Series.value_counts)

In [None]:
# Inspect the 'Job Title' column.
df['Job Title']

In [None]:
# Inspect the 'Location' column.
df.Location

In [None]:
# Inspect the 'Apply Url' column.
df['Apply Url']

Idea: maybe replace every URL with the token `url` and every NaN with `unknown`.

In [None]:
# Inspect the 'Company Name' column.
df['Company Name']

In [None]:
# Inspect the 'Employer Logo' column.
df['Employer Logo']

Idea: add a logo / no-logo flag to both datasets.

In [None]:
# Inspect the 'Companydescription' column.
df['Companydescription']

In [None]:
# Company names in sorted order, keeping their original index labels.
df['Company Name'].sort_values()

In [None]:
# Look at the full record with index label 29110.
df.loc[29110]

## Data Viz

In [None]:
# Visualise where values are missing in the Indeed dataframe
# (missingno matrix plot, drawn in the given RGB colour).
missingno.matrix(df,color=(0.1, 0.6, 0.7))

## Data Preparation

CompanyLogo / noCompanyLogo.... Companydescription  / Companyprofile


In [None]:
# Replace missing company descriptions with the placeholder token 'unknown'.
df['Companydescription'] = df['Companydescription'].fillna('unknown')

# Turn the logo column into a text token: 'hascompanylogo' when a value is
# present, 'nocompanylogo' when it is missing.
# The mapped result is assigned back to the frame instead of calling
# replace(..., inplace=True) on the column selection; that chained in-place
# form is not guaranteed to update df (it is a no-op under pandas
# Copy-on-Write).
# NOTE: this cell is not idempotent - after one run the column has no nulls,
# so re-running it labels every row 'hascompanylogo'.
df['Employer Logo'] = df['Employer Logo'].notna().map(
    {False: 'nocompanylogo', True: 'hascompanylogo'}
)

In [None]:
# Confirm the non-null counts after filling the description and logo columns.
df.info()

In [None]:
# Build a single free-text column by concatenating the job title, job
# description, logo token and company description, separated (and padded)
# by two spaces.
# NOTE(review): only 'Employer Logo' and 'Companydescription' are filled in
# this notebook; a NaN in 'Job Title' or 'Job Description' would make the
# whole 'text' value NaN for that row - confirm those columns have no nulls.
df['text'] = '  '+ df['Job Title'] +'  '+ df['Job Description'] +'  '+ df['Employer Logo']+'  '+ df['Companydescription'] + '  '
df.head(2)


In [None]:
# Punctuation characters (string.punctuation) for later text cleaning
punctuations = string.punctuation

# Load the small English spaCy model
# (per spaCy docs this is the pipeline with tagger, parser and NER)
nlp = spacy.load("en_core_web_sm")
# spaCy's built-in English stop-word set
stop_words = spacy.lang.en.stop_words.STOP_WORDS

# Plain English language object
# NOTE(review): per spaCy docs English() is a blank pipeline (tokenizer only) -
# confirm that is what later cells expect from `parser`
parser = English()

In [None]:
# Look at one arbitrary 'text' entry (index label 919) as a reference
# for what the cleaning steps below have to deal with.
df.text[919]

In [None]:
# Normalise case: convert the whole 'text' column to lowercase.
df['text'] = df['text'].str.lower()

In [None]:
# Run every entry through remove_number (not defined in this notebook;
# presumably provided by the `utils` star import) to strip digits.
df['text'] = df['text'].apply(remove_number)

In [None]:
# Run every entry through remove_symbols (not defined in this notebook;
# presumably provided by the `utils` star import) to drop unusual symbols.
df['text'] = df['text'].apply(remove_symbols)

In [None]:
# Strip punctuation from every entry, then run the symbol filter once more
# on the result (both helpers presumably come from the `utils` star import).
df['text'] = df['text'].apply(remove_punctuations).apply(remove_symbols)

In [None]:
# Append the state only now, after lowercasing and cleaning, so that it
# keeps its original (capitalised) form in the final text.
# NOTE(review): a NaN in 'State' would turn the whole 'text' value into NaN
# for that row - confirm the column has no nulls.
df['text'] = df['text'] + '  ' + df['State'] + '  '


In [None]:
# Sample of what 'text' looks like after cleaning (index label 9139)
df.text[9139]

In [None]:
# Word cloud over the complete cleaned corpus, ignoring spaCy's English
# stop words.
STOPWORDS = spacy.lang.en.stop_words.STOP_WORDS

# Configure the cloud first ...
wc = WordCloud(
    min_font_size=3,
    max_words=200,
    width=1600,
    height=800,
    background_color='white',
    stopwords=STOPWORDS,
)
# ... then feed it all rows joined into one space-separated string.
wc.generate(" ".join(df.text))

# Draw the result on a large figure without axes; the trailing semicolon
# keeps the notebook from echoing the return value.
plt.figure(figsize=(16, 14))
plt.imshow(wc, interpolation='bilinear')
plt.axis("off");

## Prep Indeed Dataset for testing

In [None]:
# Keep only the cleaned 'text' column, as a one-column DataFrame.
indeed = df.filter(items=['text'], axis='columns')

indeed

In [None]:
# Check what kind of object the filter call returned.
type(indeed)

In [None]:
# One-off export of the prepared frame; uncomment to (re)create the file that is read back below.
# indeed.to_csv('../Data/indeed')

In [None]:
# List the contents of the Data folder (shell command run from the notebook).
!ls ../Data

In [None]:
# Read the prepared data back from ../Data/indeed, using the first column
# of the file as the index.
indeed = pd.read_csv('../Data/indeed', index_col=0)

In [None]:
# Spot-check the text of the row with index label 90 after the reload.
indeed.loc[90].text

### The Indeed dataset is now ready to be tested on my model