In [None]:
import os
# Bootstrap cell: installs Java 11, downloads and unpacks a Spark 3.x release, installs
# findspark, and points JAVA_HOME / SPARK_HOME at the installed locations.
# NOTE(review): SPARK_HOME below hard-codes /content, so this presumably targets Google Colab — confirm.
#
# Find the latest version of Spark 3.0 from http://www-us.apache.org/dist/spark/ and enter it as the Spark version.
# For example:
# spark_version = 'spark-3.0.2'
# The value below is a placeholder: it must be edited before running, because the
# wget/tar commands and SPARK_HOME are all built from it.
spark_version = 'spark-3.<enter version>'
# Exported so the shell commands below can expand $SPARK_VERSION.
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
# Download and unpack the Spark build (Hadoop 2.7 flavour) into the current directory.
!wget -q http://www-us.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
# Assumes the archive above was unpacked under /content.
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"

# Initialise findspark (presumably so the unpacked PySpark becomes importable).
# The SparkSession itself is created in the next cell, not here.
import findspark
findspark.init()

In [13]:
# Launch the Spark session used by the rest of the notebook
import os
import sys

# Point both PySpark interpreter variables at the Python running this kernel.
os.environ.update({
    'PYSPARK_PYTHON': sys.executable,
    'PYSPARK_DRIVER_PYTHON': sys.executable,
})

# Imported only after the environment variables above are in place.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("Hashing")
    .getOrCreate()
)

In [14]:
# Imports plus one-time NLTK resource downloads and helper objects for the cells below.
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, StopWordsRemover
import nltk
# Tagger model; presumably what nltk.pos_tag relies on in the part-of-speech cell.
nltk.download('averaged_perceptron_tagger')
import pandas as pd
from collections import Counter
from  itertools import chain
import numpy as np
import ast as ast
import string
import re
import warnings
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
# The corpora must be downloaded before the stop-word list and lemmatizer below are used.
nltk.download('stopwords')
nltk.download('wordnet')
# NLTK's English stop-word list.
stopword = nltk.corpus.stopwords.words("english")
# Porter stemmer instance; not referenced by the cells visible in this notebook section.
ps = nltk.PorterStemmer()
# WordNet lemmatizer used by the lemmatizing() helper further down.
wn = nltk.WordNetLemmatizer()
# Hide DeprecationWarning messages in cell output.
warnings.filterwarnings("ignore", category=DeprecationWarning)
%matplotlib inline

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ckkoc\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ckkoc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ckkoc\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Pre-Processing


In [15]:
# Load the job listings CSV into a Spark DataFrame (comma-separated, first row is the header)
file_path = "job_list_engineer.csv"
csv_reader = spark.read.option("sep", ",").option("header", True)
csv_df = csv_reader.csv(file_path)

# Preview the loaded rows
csv_df.show()

+--------------------+--------------------+--------------------+--------------------+
|           job_title|        company_name|            location|             summary|
+--------------------+--------------------+--------------------+--------------------+
|       Data Engineer| Synaptein Solutions|   Thousand Oaks. CA|Proven proficienc...|
|   Data Engineer III|               Ursus|Menlo Park. CA 94...|Experience with E...|
|       Data Engineer|             Harnham|San Francisco. CA...|Using programming...|
|Associate. Visual...|                KPMG|Los Angeles. CA 9...|Proficient with d...|
|Data Engineer. Gr...|              Square|   San Francisco. CA|Understand and im...|
|   SQL Data Engineer|Thermo Fisher Sci...|  Carlsbad. CA 92008|Understand variou...|
|       Data Engineer|               Navis|         Oakland. CA|Developing data i...|
|       Data Engineer|      Deckers Brands|    Goleta. CA 93117|Create data pipel...|
|Data Visualizatio...|Unicorn Technolog...|Oakland. CA

In [16]:
# Split each job summary into a list of tokens stored in a new "tokens" column
tokenizer = (
    Tokenizer()
    .setInputCol("summary")
    .setOutputCol("tokens")
)
wordsData2 = tokenizer.transform(csv_df)

# Preview the tokenized rows
wordsData2.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+
|           job_title|        company_name|            location|             summary|              tokens|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|       Data Engineer| Synaptein Solutions|   Thousand Oaks. CA|Proven proficienc...|[proven, proficie...|
|   Data Engineer III|               Ursus|Menlo Park. CA 94...|Experience with E...|[experience, with...|
|       Data Engineer|             Harnham|San Francisco. CA...|Using programming...|[using, programmi...|
|Associate. Visual...|                KPMG|Los Angeles. CA 9...|Proficient with d...|[proficient, with...|
|Data Engineer. Gr...|              Square|   San Francisco. CA|Understand and im...|[understand, and,...|
|   SQL Data Engineer|Thermo Fisher Sci...|  Carlsbad. CA 92008|Understand variou...|[understand, vari...|
|       Data Engineer|               

In [17]:
# Build the stop-word remover: reads the "tokens" column and writes a "filtered" column
remover = (
    StopWordsRemover()
    .setInputCol("tokens")
    .setOutputCol("filtered")
)

In [18]:
# Drop stop words: apply the remover to the tokenized data, which adds the "filtered" column
cleaneddf = remover.transform(wordsData2)
# Preview the result to check the new column next to the original tokens
cleaneddf.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|           job_title|        company_name|            location|             summary|              tokens|            filtered|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|       Data Engineer| Synaptein Solutions|   Thousand Oaks. CA|Proven proficienc...|[proven, proficie...|[proven, proficie...|
|   Data Engineer III|               Ursus|Menlo Park. CA 94...|Experience with E...|[experience, with...|[experience, etl,...|
|       Data Engineer|             Harnham|San Francisco. CA...|Using programming...|[using, programmi...|[using, programmi...|
|Associate. Visual...|                KPMG|Los Angeles. CA 9...|Proficient with d...|[proficient, with...|[proficient, data...|
|Data Engineer. Gr...|              Square|   San Francisco. CA|Understand and im...|[understand, and,..

In [19]:
# Convert the Spark DataFrame to a pandas DataFrame; the remaining cells work in pandas/NLTK
pandasDF = cleaneddf.toPandas()
# Preview the first rows of the converted frame
pandasDF.head()

Unnamed: 0,job_title,company_name,location,summary,tokens,filtered
0,Data Engineer,Synaptein Solutions,Thousand Oaks. CA,Proven proficiency with scripting languages su...,"[proven, proficiency, with, scripting, languag...","[proven, proficiency, scripting, languages, py..."
1,Data Engineer III,Ursus,Menlo Park. CA 94025+1 location,Experience with ETL processes. extracting data...,"[experience, with, etl, processes., extracting...","[experience, etl, processes., extracting, data..."
2,Data Engineer,Harnham,San Francisco. CA+1 location,Using programming skills to create data pipeli...,"[using, programming, skills, to, create, data,...","[using, programming, skills, create, data, pip..."
3,Associate. Visualization Data Engineer,KPMG,Los Angeles. CA 90071 (Downtown area),Proficient with data management and integratio...,"[proficient, with, data, management, and, inte...","[proficient, data, management, integration., d..."
4,Data Engineer. Growth,Square,San Francisco. CA,Understand and implement data logging best pra...,"[understand, and, implement, data, logging, be...","[understand, implement, data, logging, best, p..."


In [20]:
# Keep only the stop-word-filtered token lists
countable = pandasDF.drop(columns=["job_title", "company_name", "location", "summary", "tokens"])

# Characters removed from every token.
PUNCTUATION_TO_STRIP = "()?./&"
_PUNCTUATION_TABLE = str.maketrans("", "", PUNCTUATION_TO_STRIP)


def strip_punctuation(tokens):
    """Return a new list with the unwanted punctuation removed from each token.

    Parameters
    ----------
    tokens : iterable of str, or None
        One row's tokens. None (a missing row) is treated as no tokens.

    Returns
    -------
    list of str
        The cleaned tokens, in the original order. A token made up only of
        stripped characters becomes an empty string; those are filtered out
        by the word-count step that follows.
    """
    if tokens is None:
        return []
    return [str(token).translate(_PUNCTUATION_TABLE) for token in tokens]


# Clean each token list directly. The previous approach turned every list into its
# string repr, ran Series.str.replace with '(', ')', '?' and '.' (regex metacharacters
# whose handling depends on the pandas version's `regex` default), and rebuilt the
# list with eval(), which executes text that ultimately comes from the CSV file.
countable["filtered"] = countable["filtered"].apply(strip_punctuation)
countable

Unnamed: 0,filtered
0,"[proven, proficiency, scripting, languages, py..."
1,"[experience, etl, processes, extracting, data,..."
2,"[using, programming, skills, create, data, pip..."
3,"[proficient, data, management, integration, da..."
4,"[understand, implement, data, logging, best, p..."
...,...
505,"[functionally, lead, team, data, engineers, de..."
506,"[server, network, hardware, troubleshooting, f..."
507,"[support, day-to-day, operations, escalations,..."
508,"[experience, relational, data, modeling, metad..."


## Part-of-Speech and Word Count

In [21]:
# Count how many times each word appears across all rows.
# Flatten every row's token list into a single Series of words first.
a = pd.Series([token for tokens in countable["filtered"] for token in tokens])

# value_counts() does the counting; sort_index() + reset_index() give each word a
# stable alphabetical row label before the frame is ordered by frequency.
counted_df = (
    a.value_counts()
    .sort_index()
    .rename_axis('filtered')
    .reset_index(name='count')
    .sort_values(by=['count'], ascending=False)
)

# Drop tokens that became empty after punctuation stripping. The explicit .copy()
# makes counted_df an independent frame, so the columns added to it in later cells
# do not raise SettingWithCopyWarning.
counted_df = counted_df[counted_df["filtered"] != ""].copy()
counted_df.head()


Unnamed: 0,filtered,count
313,data,1340
499,experience,236
963,pipelines,104
176,build,80
366,design,74


In [22]:
# Tag every distinct word with its part of speech. Each word is tagged on its own
# (as a one-word sentence), exactly as before, but through a single pos_tag_sents
# call so the tagger is set up once instead of once per word.
tagged_words = nltk.pos_tag_sents([[word] for word in counted_df['filtered']])
counted_df['pos'] = [tags[0][1] for tags in tagged_words]

# Save the per-word counts and tags. This notebook processes the engineer job list,
# so the export uses the engineer file name rather than 'pos_count_scientist.csv',
# which belonged to a different dataset. The grouped-totals cell at the end of the
# notebook writes to this same path and replaces this intermediate export.
counted_df.to_csv('pos_count_engineer.csv', index = False)
counted_df.head()

Unnamed: 0,filtered,count,pos
313,data,1340,NNS
499,experience,236,NN
963,pipelines,104,NNS
176,build,80,NN
366,design,74,NN


In [25]:
def lemmatizing(tnp):
    """Return the lemma of a single word.

    tnp is handed to the notebook's WordNet lemmatizer (wn) with that
    lemmatizer's default arguments, and the result is returned unchanged.
    """
    return wn.lemmatize(tnp)

# Add a "lemmatize" column holding the lemma of every counted word
counted_df["lemmatize"] = counted_df["filtered"].apply(lemmatizing)
counted_df.head()

Unnamed: 0,filtered,count,pos,stems
313,data,1340,NNS,data
499,experience,236,NN,experi
963,pipelines,104,NNS,pipelin
176,build,80,NN,build
366,design,74,NN,design


In [26]:
# Sum the word counts for every (lemma, part-of-speech) pair and order the
# pairs from most to least frequent.
total_df = (
    counted_df.groupby(["lemmatize", "pos"])["count"]
    .sum()
    .to_frame("Total_Count")
    .sort_values(by="Total_Count", ascending=False)
)

# Export the totals (the lemma/pos index is written as the leading columns)
total_df.to_csv('pos_count_engineer.csv')
total_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Total_Count
stems,pos,Unnamed: 2_level_1
data,NNS,1340
experi,NN,236
build,NN,123
engin,NN,110
pipelin,NNS,104
