In [None]:
# Environment bootstrap cell: installs Java 11, downloads and unpacks a Spark
# release, installs findspark, and exports the environment variables that
# point at those installs. The /content path used for SPARK_HOME suggests this
# cell targets a hosted environment such as Google Colab -- TODO confirm.
import os
# Find the latest Spark 3.x release at http://www-us.apache.org/dist/spark/ and enter it as the Spark version.
# For example:
# spark_version = 'spark-3.0.2'
# NOTE: the value below is a placeholder and must be edited before running.
# It is interpolated into the download URL, the archive name and SPARK_HOME.
spark_version = 'spark-3.<enter version>'
# Export the version so the `!` shell lines below can read it as $SPARK_VERSION.
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
# Download and unpack the Hadoop 2.7 build of the selected Spark release into the working directory.
!wget -q http://www-us.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
# Assumes the archive above was unpacked under /content -- verify when running elsewhere.
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"

# Initialise findspark (presumably so that pyspark can be located via SPARK_HOME).
# No SparkSession is created in this cell; that happens in a later cell.
import findspark
findspark.init()

In [1]:
# Launch the Spark session used by the rest of the notebook
import os
import sys

# Use the interpreter running this kernel for both PySpark settings,
# so the Python used by Spark matches the notebook's own.
os.environ.update(
    PYSPARK_PYTHON=sys.executable,
    PYSPARK_DRIVER_PYTHON=sys.executable,
)
from pyspark.sql import SparkSession
spark = (
    SparkSession.builder
    .appName("Hashing")
    .getOrCreate()
)

In [2]:
# Spark ML text-feature transformers (the cells visible here use Tokenizer and StopWordsRemover).
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, StopWordsRemover
import nltk
# Side effect: downloads the NLTK tagger data into the local nltk_data directory.
# Presumably this is the model used for the part-of-speech tagging later in the notebook.
nltk.download('averaged_perceptron_tagger')
import pandas as pd
from collections import Counter
from  itertools import chain
import numpy as np
import ast as ast

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ckkoc\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [3]:
# Load the job-listing CSV into a Spark DataFrame (first row is the header)
file_path = "job_list_single.csv"
csv_df = spark.read.csv(
    file_path,
    header=True,
    sep=",",
)
csv_df.show()

+--------------------+--------------------+--------------------+--------------------+
|           job_title|        company_name|            location|             summary|
+--------------------+--------------------+--------------------+--------------------+
|Business Systems ...|          DiversityX|      Los Angeles CA|Ability to elicit...|
|      Data Analyst I|       Computershare|United States•Remote|1 year of experie...|
|Healthcare Qualit...|Clinical Manageme...|      Long Beach. CA|Data abstraction ...|
|Data Analyst need...|            AppleOne|    Folsom. CA 95630|?Bachelor's degre...|
|TikTok LIVE Data ...|              TikTok|     Los Angeles. CA|Assist in creatin...|
|Data Analyst (Ful...|       Cisco Systems|        San Jose. CA|Assess the effect...|
|Analyst. Data & R...|Disney Streaming ...|         Burbank. CA|1-3 years of equi...|
| Junior Data Analyst|        Dental Views|       Riverside. CA|Creates actionabl...|
|Data Analyst -Rec...|Goodman Manufactu...|Irvine. CA 

In [4]:
# Split every "summary" string into a list of tokens stored in a new "tokens" column
tokenizer = Tokenizer(
    outputCol="tokens",
    inputCol="summary",
)
wordsData2 = tokenizer.transform(dataset=csv_df)
wordsData2.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+
|           job_title|        company_name|            location|             summary|              tokens|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|Business Systems ...|          DiversityX|      Los Angeles CA|Ability to elicit...|[ability, to, eli...|
|      Data Analyst I|       Computershare|United States•Remote|1 year of experie...|[1, year, of, exp...|
|Healthcare Qualit...|Clinical Manageme...|      Long Beach. CA|Data abstraction ...|[data, abstractio...|
|Data Analyst need...|            AppleOne|    Folsom. CA 95630|?Bachelor's degre...|[?bachelor's, deg...|
|TikTok LIVE Data ...|              TikTok|     Los Angeles. CA|Assist in creatin...|[assist, in, crea...|
|Data Analyst (Ful...|       Cisco Systems|        San Jose. CA|Assess the effect...|[assess, the, eff...|
|Analyst. Data & R...|Disney Streamin

In [5]:
# Build the transformer that reads "tokens" and writes the stop-word-free list to "filtered"
remover = StopWordsRemover(
    outputCol="filtered",
    inputCol="tokens",
)

In [6]:
# Run the stop-word remover over the tokenized frame and preview the result
cleaneddf = remover.transform(dataset=wordsData2)
cleaneddf.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|           job_title|        company_name|            location|             summary|              tokens|            filtered|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|Business Systems ...|          DiversityX|      Los Angeles CA|Ability to elicit...|[ability, to, eli...|[ability, elicit....|
|      Data Analyst I|       Computershare|United States•Remote|1 year of experie...|[1, year, of, exp...|[1, year, experie...|
|Healthcare Qualit...|Clinical Manageme...|      Long Beach. CA|Data abstraction ...|[data, abstractio...|[data, abstractio...|
|Data Analyst need...|            AppleOne|    Folsom. CA 95630|?Bachelor's degre...|[?bachelor's, deg...|[?bachelor's, deg...|
|TikTok LIVE Data ...|              TikTok|     Los Angeles. CA|Assist in creatin...|[assist, in, crea..

In [7]:
# Collect the Spark DataFrame into a pandas DataFrame and preview the first five rows
pandasDF = cleaneddf.toPandas()
pandasDF.head(n=5)

Unnamed: 0,job_title,company_name,location,summary,tokens,filtered
0,Business Systems Analyst,DiversityX,Los Angeles CA,Ability to elicit. gather and analyze data. We...,"[ability, to, elicit., gather, and, analyze, d...","[ability, elicit., gather, analyze, data., cur..."
1,Data Analyst I,Computershare,United States•Remote,1 year of experience in data analysis or the a...,"[1, year, of, experience, in, data, analysis, ...","[1, year, experience, data, analysis, applicat..."
2,Healthcare Quality Data Informatics Analyst,Clinical Management Consultants,Long Beach. CA,Data abstraction will be utilized to pull conf...,"[data, abstraction, will, be, utilized, to, pu...","[data, abstraction, utilized, pull, confidenti..."
3,Data Analyst needed ASAP!!,AppleOne,Folsom. CA 95630,?Bachelor's degree in mathematics. accounting ...,"[?bachelor's, degree, in, mathematics., accoun...","[?bachelor's, degree, mathematics., accounting..."
4,TikTok LIVE Data Analyst,TikTok,Los Angeles. CA,Assist in creating data best practices and doc...,"[assist, in, creating, data, best, practices, ...","[assist, creating, data, best, practices, docu..."


In [8]:
# Keep only the stop-word-filtered token lists.
# drop() returns a new frame, so pandasDF itself is left untouched.
countable = pandasDF.drop(columns=["job_title", "company_name", "location", "summary", "tokens"])

# Characters removed from every token before counting.
# Add further characters to this string to strip them as well.
PUNCT_TO_STRIP = "(?."
_punct_table = str.maketrans("", "", PUNCT_TO_STRIP)

# Clean each token directly rather than stringifying the whole list, editing
# its repr and rebuilding it with eval(). That round-trip executed text derived
# from the CSV as Python code, depended on the version-specific `regex` default
# of Series.str.replace for '(', '?' and '.', and broke if a cell held a NumPy
# array instead of a list (an array's repr has no commas between elements).
# For list-of-string cells the result is identical to the previous approach.
countable["filtered"] = countable["filtered"].apply(
    lambda tokens: [token.translate(_punct_table) for token in tokens]
)
countable

Unnamed: 0,filtered
0,"[ability, elicit, gather, analyze, data, curre..."
1,"[1, year, experience, data, analysis, applicat..."
2,"[data, abstraction, utilized, pull, confidenti..."
3,"[bachelor's, degree, mathematics, accounting, ..."
4,"[assist, creating, data, best, practices, docu..."
5,"[assess, effectiveness, accuracy, new, data, s..."
6,"[1-3, years, equivalent, experience, analyst, ..."
7,"[creates, actionable, insights, understanding,..."
8,"[handle, data, entry, purchase, orders, emerge..."
9,"[contributed, ongoing, company, efforts, manag..."


In [9]:
# Flatten the per-posting token lists into one Series, then tally each word.
a = pd.Series(list(chain.from_iterable(countable["filtered"])))
# Index the counts alphabetically first (so each word keeps its alphabetical
# rank as the row label), then order the rows from most to least frequent.
counted_df = (
    a.value_counts()
    .sort_index()
    .rename_axis('filtered')
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)
counted_df.head()

Unnamed: 0,filtered,count
51,data,24
19,analyst,4
136,reporting,3
149,sql,2
72,experience,2


In [12]:
# Tag each word on its own (one-word "sentences", so no surrounding context),
# exactly as before, but with a single pos_tag_sents call. Calling
# nltk.pos_tag([word]) once per row made NLTK look up / load the tagger for
# every word; pos_tag_sents loads it once and returns the same tags.
single_word_sentences = [[word] for word in counted_df['filtered']]
tagged_sentences = nltk.pos_tag_sents(single_word_sentences)
# Each result is a list holding one (word, tag) pair; keep just the tag.
# Assigning a list fills the column positionally, matching the row order above.
counted_df['pos'] = [tagged[0][1] for tagged in tagged_sentences]

# Persist the word / count / part-of-speech table without the row index
counted_df.to_csv('pos_single.csv', index = False)
counted_df.head()

Unnamed: 0,filtered,count,pos
51,data,24,NNS
19,analyst,4,NN
136,reporting,3,NN
149,sql,2,NN
72,experience,2,NN
