In [1]:
#Importing Libraries
import boto3
from pyspark.sql.types import BooleanType
from pyspark.sql.functions import udf
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.sql import Window

In [None]:
#Downloading data from the s3 bucket
s3 = boto3.client('s3')
s3.download_file('blossom-data-engs', 'all-us-stocks-tickers-company-info-logos.zip', 'allus.zip')
s3.download_file('blossom-data-engs', 'data-scientist-job-market-in-the-us.zip', 'datascientists.zip')

In [2]:
#Creating a SparkSession
spark = SparkSession.builder.getOrCreate()

In [3]:
#Read the dataset from local directory
datascientists = spark.read.csv('C:/Users/USER/Desktop/Blossom_Academy/datascientists/alldata.csv', header = True, inferSchema =True)
companies = spark.read.csv('C:/Users/USER/Desktop/Blossom_Academy/companies/companies.csv', header = True, inferSchema =True)

In [4]:
#Checking the number of columns for the companies dataset
companies.columns

['ticker',
 'company name',
 'short name',
 'industry',
 'description',
 'website',
 'logo',
 'ceo',
 'exchange',
 'market cap',
 'sector',
 'tag 1',
 'tag 2',
 'tag 3']

In [5]:
#Checking the number of columns for the datascientists dataset
datascientists.columns

['position', 'company', 'description', 'reviews', 'location']

In [6]:
#Splitting the column location in the Datascientists dataset
datascientists.select('location', F.split(datascientists['location'], ',')[0].alias('City')).show()

+--------------------+--------------------+
|            location|                City|
+--------------------+--------------------+
|                null|                null|
|                 GA.|                 GA.|
|                null|                null|
|            database|            database|
|                null|                null|
| has served as on...| has served as on...|
|                null|                null|
|                null|                null|
|                null|                null|
|                null|                null|
|                null|                null|
|                null|                null|
| has an open posi...| has an open posi...|
|                null|                null|
|                null|                null|
|                null|                null|
|                null|                null|
|                null|                null|
|                null|                null|
|                null|          

In [7]:
#Joining the two Datasets (Company & Datascientists) and renaming one of the columns as comdescription
joindata = datascientists.join(companies.withColumnRenamed('description', 'comdescription'), datascientists['company'] == companies['company name'] )

In [8]:
#Printing the columns of the joined datasets
joindata.columns

['position',
 'company',
 'description',
 'reviews',
 'location',
 'ticker',
 'company name',
 'short name',
 'industry',
 'comdescription',
 'website',
 'logo',
 'ceo',
 'exchange',
 'market cap',
 'sector',
 'tag 1',
 'tag 2',
 'tag 3']

In [9]:
#Showing the first 2 rows of the Joined dataset
joindata.show(5)

+--------------------+---------------+--------------------+-------------------+--------------------+--------------------+---------------+--------------------+--------------------+--------------------+--------------------+---------+---------+---------+----------+------+-----+-----+-----+
|            position|        company|         description|            reviews|            location|              ticker|   company name|          short name|            industry|      comdescription|             website|     logo|      ceo| exchange|market cap|sector|tag 1|tag 2|tag 3|
+--------------------+---------------+--------------------+-------------------+--------------------+--------------------+---------------+--------------------+--------------------+--------------------+--------------------+---------+---------+---------+----------+------+-----+-----+-----+
|Experience follow...|         design|                code| and defect reviews|                null| The fund general...|         design

### Tokenizing the data to print N-Grams(2)

In [10]:
#Importing libraries for Tokenization and N-Grams
import nltk
from pyspark.ml.feature import Tokenizer, NGram

In [11]:
#Filtering out null values
companies = companies.filter(companies['description'].isNotNull())


In [12]:
#Creating a variable for the Tokenization and also create a new column
token = Tokenizer(inputCol = 'description', outputCol = 'tokenized')

In [13]:
#Dropping null values and applying tokenization
joindata.drop()
token_df = token.transform(joindata)

In [14]:
#Selecting the description and tokenized columns and displaying 5 rows of data
token_df.select('description', 'tokenized').show(5)

+--------------------+--------------------+
|         description|           tokenized|
+--------------------+--------------------+
|                code|            [, code]|
| and planning of ...|[, and, planning,...|
| and planning of ...|[, and, planning,...|
| and moving expen...|[, and, moving, e...|
| and conduct of s...|[, and, conduct, ...|
+--------------------+--------------------+
only showing top 5 rows



In [16]:
#Creating an NGram column and assigning it to bigram
bigram = NGram(n=2, inputCol = 'tokenized', outputCol = 'ngram')
gram = bigram.transform(token_df)

In [17]:
#Showing the 5 rows of the ngram dataset
gram.show(5)

+--------------------+---------------+--------------------+-------------------+--------------------+--------------------+---------------+--------------------+--------------------+--------------------+--------------------+---------+---------+---------+----------+------+-----+-----+-----+--------------------+--------------------+
|            position|        company|         description|            reviews|            location|              ticker|   company name|          short name|            industry|      comdescription|             website|     logo|      ceo| exchange|market cap|sector|tag 1|tag 2|tag 3|           tokenized|               ngram|
+--------------------+---------------+--------------------+-------------------+--------------------+--------------------+---------------+--------------------+--------------------+--------------------+--------------------+---------+---------+---------+----------+------+-----+-----+-----+--------------------+--------------------+
|Experienc

#### Performing a value count to determine the Frequency 

In [18]:
#Importing Libraries
from pyspark.sql.functions import explode
import pyspark.sql.functions as F

In [19]:
#Printing the Ngram column
gram.select('ngram').limit(1).take(1)

[Row(ngram=[' code'])]

In [21]:
# method for creating the frequency data function...
def create_freq(df, col):
    n=gram.select(col, F.explode('ngram').alias('ngram')).groupBy([col, 'ngram']).count() #exploding the ngrams
    n = n.withColumnRenamed('count','frequency') # changing the column name from count to frequency
    n = n.orderBy(n['frequency'].desc()) # ordering rows by biggest first.
    return n

In [22]:
#creating the freq for industry ngrams
industry_freq = create_freq(gram, 'industry')

In [23]:
industry_freq.show()

Py4JJavaError: An error occurred while calling o216.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 2 in stage 14.0 failed 1 times, most recent failure: Lost task 2.0 in stage 14.0 (TID 19, localhost, executor driver): org.apache.spark.SparkException: Failed to execute user defined function($anonfun$createTransformFunc$1: (string) => array<string>)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage2.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
	at scala.collection.Iterator$JoinIterator.hasNext(Iterator.scala:212)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage3.agg_doAggregateWithKeys_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage3.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:125)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
	at java.lang.Thread.run(Unknown Source)
Caused by: java.lang.NullPointerException
	at org.apache.spark.ml.feature.Tokenizer$$anonfun$createTransformFunc$1.apply(Tokenizer.scala:39)
	at org.apache.spark.ml.feature.Tokenizer$$anonfun$createTransformFunc$1.apply(Tokenizer.scala:39)
	... 21 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1889)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1877)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1876)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1876)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:926)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2110)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2059)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2048)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:737)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2158)
	at org.apache.spark.rdd.RDD$$anonfun$reduce$1.apply(RDD.scala:1035)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.RDD.reduce(RDD.scala:1017)
	at org.apache.spark.rdd.RDD$$anonfun$takeOrdered$1.apply(RDD.scala:1439)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.RDD.takeOrdered(RDD.scala:1426)
	at org.apache.spark.sql.execution.TakeOrderedAndProjectExec.executeCollect(limit.scala:136)
	at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectFromPlan(Dataset.scala:3389)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset$$anonfun$52.apply(Dataset.scala:3370)
	at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:78)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:73)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3369)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:2764)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:254)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:291)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
	at java.lang.reflect.Method.invoke(Unknown Source)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Unknown Source)
Caused by: org.apache.spark.SparkException: Failed to execute user defined function($anonfun$createTransformFunc$1: (string) => array<string>)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage2.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
	at scala.collection.Iterator$JoinIterator.hasNext(Iterator.scala:212)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage3.agg_doAggregateWithKeys_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage3.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:125)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
	... 1 more
Caused by: java.lang.NullPointerException
	at org.apache.spark.ml.feature.Tokenizer$$anonfun$createTransformFunc$1.apply(Tokenizer.scala:39)
	at org.apache.spark.ml.feature.Tokenizer$$anonfun$createTransformFunc$1.apply(Tokenizer.scala:39)
	... 21 more


In [None]:
from pyspark.sql.functions import countDistinct

In [None]:
# Change name of columns with alias
#ngram.groupBy("NGram").agg(F.count('NGram')).show()
#ngram.select(countDistinct("NGram")).show()
newngram.select('count')

In [None]:
newngram.show()

In [None]:
#import ipython
import nbconvert
import import_ipynb

In [None]:
import seaborn as sns
sns.distplot(ngram['NGram'])