# Occupation

### Introduction:

Special thanks to: https://github.com/justmarkham for sharing the dataset and materials.

### Step 1. Import the necessary libraries

In [33]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local").appName("My Application").getOrCreate()

from pyspark.sql.types import *
from pyspark.sql.functions import *

### Step 2. Import the dataset from this [address](https://raw.githubusercontent.com/justmarkham/DAT8/master/data/u.user). 

### Step 3. Assign it to a variable called users.

In [6]:
users = spark.read.format("csv")\
    .option("header", "true")\
        .option("inferSchema", "true")\
        .option("sep", "|")\
            .load("occupation.csv")

users.show()

+-------+---+------+-------------+--------+
|user_id|age|gender|   occupation|zip_code|
+-------+---+------+-------------+--------+
|      1| 24|     M|   technician|   85711|
|      2| 53|     F|        other|   94043|
|      3| 23|     M|       writer|   32067|
|      4| 24|     M|   technician|   43537|
|      5| 33|     F|        other|   15213|
|      6| 42|     M|    executive|   98101|
|      7| 57|     M|administrator|   91344|
|      8| 36|     M|administrator|   05201|
|      9| 29|     M|      student|   01002|
|     10| 53|     M|       lawyer|   90703|
|     11| 39|     F|        other|   30329|
|     12| 28|     F|        other|   06405|
|     13| 47|     M|     educator|   29206|
|     14| 45|     M|    scientist|   55106|
|     15| 49|     F|     educator|   97301|
|     16| 21|     M|entertainment|   10309|
|     17| 30|     M|   programmer|   06355|
|     18| 35|     F|        other|   37212|
|     19| 40|     M|    librarian|   02138|
|     20| 42|     F|    homemake

### Step 4. Discover what is the mean age per occupation

In [7]:
users.groupBy("occupation").agg(mean("age")).show()

[Stage 6:>                                                          (0 + 1) / 1]

+-------------+------------------+
|   occupation|          avg(age)|
+-------------+------------------+
|    librarian|              40.0|
|      retired| 63.07142857142857|
|       lawyer|             36.75|
|         none|26.555555555555557|
|       writer| 36.31111111111111|
|   programmer|33.121212121212125|
|    marketing| 37.61538461538461|
|        other|34.523809523809526|
|    executive|          38.71875|
|    scientist| 35.54838709677419|
|      student|22.081632653061224|
|     salesman|35.666666666666664|
|       artist|31.392857142857142|
|   technician|33.148148148148145|
|administrator| 38.74683544303797|
|     engineer| 36.38805970149254|
|   healthcare|           41.5625|
|     educator| 42.01052631578948|
|entertainment| 29.22222222222222|
|    homemaker| 32.57142857142857|
+-------------+------------------+
only showing top 20 rows



                                                                                

### Step 5. Discover the Male ratio per occupation and sort it from the most to the least

In [11]:
total_counts = users.groupBy("occupation").agg(count("*").alias("total"))

male_counts = users.filter(col("gender") == "M") \
                   .groupBy("occupation").agg(count("*").alias("male_count"))

male_ratio = male_counts.join(total_counts, on="occupation") \
                        .withColumn("male_ratio", round(col("male_count") / col("total"), 2)) \
                        .orderBy(col("male_ratio").desc())

male_ratio.select("occupation", "male_ratio").show()


                                                                                

+-------------+----------+
|   occupation|male_ratio|
+-------------+----------+
|       doctor|       1.0|
|     engineer|      0.97|
|   technician|      0.96|
|      retired|      0.93|
|   programmer|      0.91|
|    executive|      0.91|
|    scientist|       0.9|
|entertainment|      0.89|
|       lawyer|      0.83|
|     salesman|      0.75|
|     educator|      0.73|
|      student|      0.69|
|        other|      0.66|
|    marketing|      0.62|
|       writer|      0.58|
|         none|      0.56|
|       artist|      0.54|
|administrator|      0.54|
|    librarian|      0.43|
|   healthcare|      0.31|
+-------------+----------+
only showing top 20 rows



### Step 6. For each occupation, calculate the minimum and maximum ages

In [15]:
users.groupBy("occupation")\
    .agg(min(col("age").alias("min_age")), max(col("age").alias("max_age")))\
        .orderBy(("occupation"))\
    .show()

+-------------+-------------------+-------------------+
|   occupation|min(age AS min_age)|max(age AS max_age)|
+-------------+-------------------+-------------------+
|administrator|                 21|                 70|
|       artist|                 19|                 48|
|       doctor|                 28|                 64|
|     educator|                 23|                 63|
|     engineer|                 22|                 70|
|entertainment|                 15|                 50|
|    executive|                 22|                 69|
|   healthcare|                 22|                 62|
|    homemaker|                 20|                 50|
|       lawyer|                 21|                 53|
|    librarian|                 23|                 69|
|    marketing|                 24|                 55|
|         none|                 11|                 55|
|        other|                 13|                 64|
|   programmer|                 20|             

### Step 7. For each combination of occupation and gender, calculate the mean age

In [20]:
users.groupBy("occupation", "gender")\
    .agg(mean("age")).orderBy("occupation").show()

+-------------+------+------------------+
|   occupation|gender|          avg(age)|
+-------------+------+------------------+
|administrator|     M| 37.16279069767442|
|administrator|     F|40.638888888888886|
|       artist|     F|30.307692307692307|
|       artist|     M|32.333333333333336|
|       doctor|     M| 43.57142857142857|
|     educator|     F| 39.11538461538461|
|     educator|     M| 43.10144927536232|
|     engineer|     F|              29.5|
|     engineer|     M|              36.6|
|entertainment|     F|              31.0|
|entertainment|     M|              29.0|
|    executive|     M|38.172413793103445|
|    executive|     F|              44.0|
|   healthcare|     F| 39.81818181818182|
|   healthcare|     M|              45.4|
|    homemaker|     F|34.166666666666664|
|    homemaker|     M|              23.0|
|       lawyer|     F|              39.5|
|       lawyer|     M|              36.2|
|    librarian|     M|              40.0|
+-------------+------+------------

### Step 8.  For each occupation present the percentage of women and men

In [None]:
from pyspark.sql.functions import count, col, round  
gender_counts = users.groupBy("occupation", "gender") \
                     .agg(count("*").alias("count"))

total_counts = users.groupBy("occupation") \
                    .agg(count("*").alias("total"))

gender_ratio = gender_counts.join(total_counts, on="occupation")

gender_ratio = gender_ratio.withColumn(
    "percentage", round((col("count") / col("total")) * 100, 2)
)

gender_ratio.orderBy("occupation", "gender").show(truncate=False)


+-------------+------+-----+-----+----------+
|occupation   |gender|count|total|percentage|
+-------------+------+-----+-----+----------+
|administrator|F     |36   |79   |45.57     |
|administrator|M     |43   |79   |54.43     |
|artist       |F     |13   |28   |46.43     |
|artist       |M     |15   |28   |53.57     |
|doctor       |M     |7    |7    |100.0     |
|educator     |F     |26   |95   |27.37     |
|educator     |M     |69   |95   |72.63     |
|engineer     |F     |2    |67   |2.99      |
|engineer     |M     |65   |67   |97.01     |
|entertainment|F     |2    |18   |11.11     |
|entertainment|M     |16   |18   |88.89     |
|executive    |F     |3    |32   |9.38      |
|executive    |M     |29   |32   |90.63     |
|healthcare   |F     |11   |16   |68.75     |
|healthcare   |M     |5    |16   |31.25     |
|homemaker    |F     |6    |7    |85.71     |
|homemaker    |M     |1    |7    |14.29     |
|lawyer       |F     |2    |12   |16.67     |
|lawyer       |M     |10   |12   |