In [1]:
# Display the path of the Python interpreter that is running this kernel.
import sys
sys.executable

'/Users/kevin/anaconda3/envs/tf_env/bin/python'

### Get Python driver version the same as the Pyspark worker
#### Pyspark needs both driver and worker nodes to use the same Python version

In [2]:
# PySpark needs the driver and the workers to run the same Python version.
# Point both at the interpreter running this kernel rather than at a
# hard-coded, machine-specific absolute path, so the notebook also works
# on other machines and in other environments.
import os
import sys

os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

# Data Preprocessing

### 1) Load in relevant datasets pertaining to nuclear energy in 2023
#### deuterium derived by country, tritium derived by country, rare earth metals derived by country, deuterium reserves by country, tritium reserves by country, rare earth reserves by country, nuclear energy capacity by countries that have already embraced nuclear, actual nuclear energy supplied by country, (operating nuclear reactors by country?) 
### 2) Merge all of these metrics into one dataset for training and testing of various machine learning models
### 3) Use the trained models on a dataset of countries that have not yet embraced nuclear but they have deuterium, tritium and rare earths (either derived or reserves)
### Use confidence intervals to gauge an estimate of how much nuclear energy these countries can supply

### Remove empty columns/rows, merge datasets into one dataframe for training and testing 
### match up countries 

In [3]:
pip install pandas beautifulsoup4 requests

Note: you may need to restart the kernel to use updated packages.


In [4]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from pyspark.sql import SparkSession

In [5]:
# Create the Spark session used by the Spark SQL cells below
# (getOrCreate reuses a session that already exists instead of starting a second one).
spark = (
    SparkSession.builder
    .appName("Pyspark SQL in Jupyter")
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/20 13:44:31 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# Scraping

## Makes a request at the PRIS IAEA URL, parses html to find and read table, to finally turn it into a csv

In [6]:
# Download the PRIS page, parse the first <table> found in its HTML and
# save that table to disk as CSV.
from io import StringIO

url = "https://pris.iaea.org/PRIS/WorldStatistics/NuclearShareofElectricityGeneration.aspx"

# A timeout keeps the cell from hanging forever on a stalled connection;
# raise_for_status stops here on an HTTP error instead of parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

table = soup.find('table')
if table is None:
    # soup.find returns None when nothing matches (e.g. the page layout changed)
    raise ValueError(f"No <table> element found at {url}")

# Wrap the markup in a file-like object: passing a literal HTML string to
# read_html is deprecated in recent pandas versions.
df = pd.read_html(StringIO(str(table)))[0]

# NOTE: the output name has no ".csv" extension; it is kept as-is because the
# file is read back under exactly this name.
df.to_csv('nuclear_share_of_electricity_generation_2023', index=False)

# Total Net Electrical Capacity (MW), Number of Operated Reactors, Nuclear Electricity Supplied (GW.h), Nuclear share % by country

## Using 2023 data because it provides net electrical capacity as well as nuclear electricity supplied

### These actual values will help the models make predictions

In [7]:
# Read the scraped file into a Spark DataFrame: the first line holds the
# column names and the column types are inferred from the data.
nuclear_share_of_electricity_generation_2023 = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('nuclear_share_of_electricity_generation_2023')
)
nuclear_share_of_electricity_generation_2023.show()

                                                                                

+--------------------+----------------------------------+---------------------------+------------------------------------+------------------+
|             Country|Total Net Electrical Capacity [MW]|Number of Operated Reactors|Nuclear Electricity Supplied  [GW.h]|Nuclear Share  [%]|
+--------------------+----------------------------------+---------------------------+------------------------------------+------------------+
|              FRANCE|                             61370|                         56|                           323773.23|              64.8|
|            SLOVAKIA|                              2308|                          5|                            17004.98|              61.3|
|             HUNGARY|                              1916|                          4|                            15091.64|              48.8|
|             FINLAND|                              4394|                          5|                            32759.35|              42.0|
|     

## Observe exact column names

In [8]:
# Print the exact column names. Note the double spaces inside
# 'Nuclear Electricity Supplied  [GW.h]' and 'Nuclear Share  [%]': they have to be
# reproduced exactly whenever these columns are referenced by name.
print(nuclear_share_of_electricity_generation_2023.columns)

['Country', 'Total Net Electrical Capacity [MW]', 'Number of Operated Reactors', 'Nuclear Electricity Supplied  [GW.h]', 'Nuclear Share  [%]']


## Reorder table based on Nuclear Electricity Supplied

In [9]:
# Register the DataFrame under the name "nuclear_share" so Spark SQL can query it.
nuclear_share_of_electricity_generation_2023.createOrReplaceTempView("nuclear_share")

# All columns of the nuclear_share view, largest electricity supply first.
# The column name is wrapped in backticks because it contains spaces and brackets.
supplied_desc_query = """
SELECT * 
FROM nuclear_share 
ORDER BY `Nuclear Electricity Supplied  [GW.h]` DESC
"""
ordered_nuclear_electricity = spark.sql(supplied_desc_query)

ordered_nuclear_electricity.show()

+--------------------+----------------------------------+---------------------------+------------------------------------+------------------+
|             Country|Total Net Electrical Capacity [MW]|Number of Operated Reactors|Nuclear Electricity Supplied  [GW.h]|Nuclear Share  [%]|
+--------------------+----------------------------------+---------------------------+------------------------------------+------------------+
|               Total|                            364480|                        403|                          2552067.11|              null|
|UNITED STATES OF ...|                             95835|                         93|                           779186.02|              18.6|
|               CHINA|                             53152|                         55|                           406483.53|               4.9|
|              FRANCE|                             61370|                         56|                           323773.23|              64.8|
|     

## Remove row displaying "total"

In [17]:
# Drop the aggregate row: keep only the rows whose "Country" value does not contain "Total"
is_total_row = ordered_nuclear_electricity["Country"].like("%Total%")
filtered_df = ordered_nuclear_electricity.where(~is_total_row)

# Make the filtered table available to Spark SQL under the same name
filtered_df.createOrReplaceTempView("filtered_df")
filtered_df.show()

+--------------------+----------------------------------+---------------------------+------------------------------------+------------------+
|             Country|Total Net Electrical Capacity [MW]|Number of Operated Reactors|Nuclear Electricity Supplied  [GW.h]|Nuclear Share  [%]|
+--------------------+----------------------------------+---------------------------+------------------------------------+------------------+
|UNITED STATES OF ...|                             95835|                         93|                           779186.02|              18.6|
|               CHINA|                             53152|                         55|                           406483.53|               4.9|
|              FRANCE|                             61370|                         56|                           323773.23|              64.8|
|              RUSSIA|                             27727|                         37|                           203957.32|              18.4|
|  KOR

### Get current count of rows

In [18]:
# Number of rows left after the "Total" row was filtered out
filtered_df.count()

31

## Add info on Taiwan since it was excluded from the original table, adds to our training and testing data

In [11]:
from pyspark.sql import Row

# Taiwan's figures are added by hand as a single extra row.
#
# The field order is spelled out explicitly and mirrors the column order of the
# scraped table, because the row is appended to that table with a positional
# UNION ALL. Building the Row from keyword arguments is avoided on purpose:
# PySpark versions before 3.0 sort keyword fields alphabetically, which would
# silently place the values under the wrong columns after the union.
taiwan_columns = [
    "Country",
    "Total Net Electrical Capacity [MW]",
    "Number of Operated Reactors",
    "Nuclear Electricity Supplied  [GW.h]",
    "Nuclear Share  [%]",
]

# Row(*names) builds a row template with these field names, in this order;
# calling the template with values fills the fields positionally.
TaiwanRow = Row(*taiwan_columns)
taiwan_row = TaiwanRow("TAIWAN, CHINA", 2859, 3, 17153.88, 6.9)

# Convert the row into a one-row Spark DataFrame
taiwan_row_df = spark.createDataFrame([taiwan_row])

# create temp view for new row
taiwan_row_df.createOrReplaceTempView("taiwan_row_view")

### Combine row into table

In [20]:
# Append the single Taiwan row to the filtered table.
# UNION ALL matches columns by position and keeps every row (no de-duplication).
append_taiwan_query = """
SELECT * FROM filtered_df
UNION ALL
SELECT * FROM taiwan_row_view
"""
combined_df = spark.sql(append_taiwan_query)

combined_df.show()

+--------------------+----------------------------------+---------------------------+------------------------------------+------------------+
|             Country|Total Net Electrical Capacity [MW]|Number of Operated Reactors|Nuclear Electricity Supplied  [GW.h]|Nuclear Share  [%]|
+--------------------+----------------------------------+---------------------------+------------------------------------+------------------+
|UNITED STATES OF ...|                             95835|                         93|                           779186.02|              18.6|
|               CHINA|                             53152|                         55|                           406483.53|               4.9|
|              FRANCE|                             61370|                         56|                           323773.23|              64.8|
|              RUSSIA|                             27727|                         37|                           203957.32|              18.4|
|  KOR

In [22]:
# Confirm that Taiwan was successfully added: pass the total row count to show()
# so every row is printed, and disable truncation so full country names are visible
rows = combined_df.count()
combined_df.show(rows, truncate=False)

+---------------------------+----------------------------------+---------------------------+------------------------------------+------------------+
|Country                    |Total Net Electrical Capacity [MW]|Number of Operated Reactors|Nuclear Electricity Supplied  [GW.h]|Nuclear Share  [%]|
+---------------------------+----------------------------------+---------------------------+------------------------------------+------------------+
|UNITED STATES OF AMERICA   |95835                             |93                         |779186.02                           |18.6              |
|CHINA                      |53152                             |55                         |406483.53                           |4.9               |
|FRANCE                     |61370                             |56                         |323773.23                           |64.8              |
|RUSSIA                     |27727                             |37                         |203957.32     

## Order based on Nuclear Energy Supplied [GW.h] again with new Taiwan row

In [26]:
# Register the combined table (including the Taiwan row) for Spark SQL
combined_df.createOrReplaceTempView("combined_filtered_df")

# Re-sort by electricity supplied, largest first, now that the new row is included
final_ordered_nuclear_electricity = spark.sql("""
SELECT *
FROM combined_filtered_df
ORDER BY `Nuclear Electricity Supplied  [GW.h]` DESC
""")

# Show every row: use the actual row count instead of a hard-coded 32, so the
# display does not silently cut rows off if the table grows.
final_ordered_nuclear_electricity.show(final_ordered_nuclear_electricity.count(), truncate=False)

+---------------------------+----------------------------------+---------------------------+------------------------------------+------------------+
|Country                    |Total Net Electrical Capacity [MW]|Number of Operated Reactors|Nuclear Electricity Supplied  [GW.h]|Nuclear Share  [%]|
+---------------------------+----------------------------------+---------------------------+------------------------------------+------------------+
|UNITED STATES OF AMERICA   |95835                             |93                         |779186.02                           |18.6              |
|CHINA                      |53152                             |55                         |406483.53                           |4.9               |
|FRANCE                     |61370                             |56                         |323773.23                           |64.8              |
|RUSSIA                     |27727                             |37                         |203957.32     

# Lithium dataset
## MCS 2024 Provides information on Country, extraction type, true production in 2022, estimated production in 2023, production notes, reserves in tons, and reserves notes
### The Mineral Commodities Summary withholds information on the United States to avoid disclosing company proprietary data 

In [27]:
# Load the MCS 2024 lithium world table into pandas.
# The United States production values appear as the marker "W" rather than a
# number, so the production columns are not purely numeric.
lithium_2024 = pd.read_csv('mcs2024-lithi_world.csv')
lithium_2024

Unnamed: 0,Source,Country,Type,Prod_t_2022,Prod_t_est_2023,Prod_notes,Reserves_t,Reserves_notes,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19
0,MCS2024,United States,"Mine production, lithium content",W,W,,1100000.0,"Reserves for Argentina, Australia, Brazil, Chi...",,,,,,,,,,,,
1,MCS2024,Argentina,"Mine production, lithium content",6590,9600,,3600000.0,"Reserves for Argentina, Australia, Brazil, Chi...",,,,,,,,,,,,
2,MCS2024,Australia,"Mine production, lithium content",74700,86000,,6200000.0,"For Australia, Joint Ore Reserves Committee-co...",,,,,,,,,,,,
3,MCS2024,Brazil,"Mine production, lithium content",2630,4900,Also estimated in 2022,390000.0,"Reserves for Argentina, Australia, Brazil, Chi...",,,,,,,,,,,,
4,MCS2024,Canada,"Mine production, lithium content",520,3400,Also estimated in 2022,930000.0,,,,,,,,,,,,,
5,MCS2024,Chile,"Mine production, lithium content",38000,44000,,9300000.0,"Reserves for Argentina, Australia, Brazil, Chi...",,,,,,,,,,,,
6,MCS2024,China,"Mine production, lithium content",22600,33000,Also estimated in 2022,3000000.0,,,,,,,,,,,,,
7,MCS2024,Portugal,"Mine production, lithium content",380,380,Also estimated in 2022,60000.0,,,,,,,,,,,,,
8,MCS2024,Zimbabwe,"Mine production, lithium content",1030,3400,Also estimated in 2022,310000.0,,,,,,,,,,,,,
9,MCS2024,Other countries6,"Mine production, lithium content",0,0,,2800000.0,"Reserves for Argentina, Australia, Brazil, Chi...",,,,,,,,,,,,


## Remove empty rows and columns

### Remove empty columns

In [28]:
# 'Reserves_notes' is the last unproblematic column; every column positioned
# after it is dropped.
# The position of 'Reserves_notes' is looked up once (rather than once per
# column) and the column Index is sliced from the next position onward.
last_useful_col_pos = lithium_2024.columns.get_loc('Reserves_notes')
remove_cols = list(lithium_2024.columns[last_useful_col_pos + 1:])

# axis=1 -> drop columns
lithium_2024_no_bad_cols = lithium_2024.drop(remove_cols, axis=1)
lithium_2024_no_bad_cols

Unnamed: 0,Source,Country,Type,Prod_t_2022,Prod_t_est_2023,Prod_notes,Reserves_t,Reserves_notes
0,MCS2024,United States,"Mine production, lithium content",W,W,,1100000.0,"Reserves for Argentina, Australia, Brazil, Chi..."
1,MCS2024,Argentina,"Mine production, lithium content",6590,9600,,3600000.0,"Reserves for Argentina, Australia, Brazil, Chi..."
2,MCS2024,Australia,"Mine production, lithium content",74700,86000,,6200000.0,"For Australia, Joint Ore Reserves Committee-co..."
3,MCS2024,Brazil,"Mine production, lithium content",2630,4900,Also estimated in 2022,390000.0,"Reserves for Argentina, Australia, Brazil, Chi..."
4,MCS2024,Canada,"Mine production, lithium content",520,3400,Also estimated in 2022,930000.0,
5,MCS2024,Chile,"Mine production, lithium content",38000,44000,,9300000.0,"Reserves for Argentina, Australia, Brazil, Chi..."
6,MCS2024,China,"Mine production, lithium content",22600,33000,Also estimated in 2022,3000000.0,
7,MCS2024,Portugal,"Mine production, lithium content",380,380,Also estimated in 2022,60000.0,
8,MCS2024,Zimbabwe,"Mine production, lithium content",1030,3400,Also estimated in 2022,310000.0,
9,MCS2024,Other countries6,"Mine production, lithium content",0,0,,2800000.0,"Reserves for Argentina, Australia, Brazil, Chi..."


### Remove empty rows

In [29]:
# Same procedure for rows: problematic rows start after row 10, so every row
# label from 11 up to the last row is removed (rows 0-10 are kept).
# This relies on the default 0..n-1 row labels that read_csv assigns.
#
# .shape gives the dimensions of the data in format (rows, columns)
# example: lithium_2024_no_bad_cols has shape (26,8): 26 rows, 8 columns
#
# A range is used instead of a generator expression: a generator can only be
# consumed once, so re-running the drop below with an exhausted generator
# would silently remove nothing.
remove_rows = range(11, lithium_2024_no_bad_cols.shape[0])

# axis=0 -> drop rows
lithium_2024_no_bad_cols_rows = lithium_2024_no_bad_cols.drop(remove_rows, axis=0)
lithium_2024_no_bad_cols_rows

Unnamed: 0,Source,Country,Type,Prod_t_2022,Prod_t_est_2023,Prod_notes,Reserves_t,Reserves_notes
0,MCS2024,United States,"Mine production, lithium content",W,W,,1100000.0,"Reserves for Argentina, Australia, Brazil, Chi..."
1,MCS2024,Argentina,"Mine production, lithium content",6590,9600,,3600000.0,"Reserves for Argentina, Australia, Brazil, Chi..."
2,MCS2024,Australia,"Mine production, lithium content",74700,86000,,6200000.0,"For Australia, Joint Ore Reserves Committee-co..."
3,MCS2024,Brazil,"Mine production, lithium content",2630,4900,Also estimated in 2022,390000.0,"Reserves for Argentina, Australia, Brazil, Chi..."
4,MCS2024,Canada,"Mine production, lithium content",520,3400,Also estimated in 2022,930000.0,
5,MCS2024,Chile,"Mine production, lithium content",38000,44000,,9300000.0,"Reserves for Argentina, Australia, Brazil, Chi..."
6,MCS2024,China,"Mine production, lithium content",22600,33000,Also estimated in 2022,3000000.0,
7,MCS2024,Portugal,"Mine production, lithium content",380,380,Also estimated in 2022,60000.0,
8,MCS2024,Zimbabwe,"Mine production, lithium content",1030,3400,Also estimated in 2022,310000.0,
9,MCS2024,Other countries6,"Mine production, lithium content",0,0,,2800000.0,"Reserves for Argentina, Australia, Brazil, Chi..."


## Remove columns not useful to the analysis

### For the purpose of our analysis, we don't need columns: Source, Type, Prod Notes, Reserves Notes

In [30]:
# Columns that are not used in the analysis
remove_unnecessary = ['Source', 'Type', 'Prod_notes', 'Reserves_notes']

# Keep only the country, production and reserves columns
lithium_2024_clean = lithium_2024_no_bad_cols_rows.drop(columns=remove_unnecessary)
lithium_2024_clean

Unnamed: 0,Country,Prod_t_2022,Prod_t_est_2023,Reserves_t
0,United States,W,W,1100000.0
1,Argentina,6590,9600,3600000.0
2,Australia,74700,86000,6200000.0
3,Brazil,2630,4900,390000.0
4,Canada,520,3400,930000.0
5,Chile,38000,44000,9300000.0
6,China,22600,33000,3000000.0
7,Portugal,380,380,60000.0
8,Zimbabwe,1030,3400,310000.0
9,Other countries6,0,0,2800000.0


# Rare Earth Ores dataset

# Since it's from the MCS as well, it provides info in the same format as the Lithium dataset
### All preprocessing will be done in one block

In [43]:
# Load the MCS 2024 rare earths world table into pandas and display it
rareearth_2024=pd.read_csv('mcs2024-raree_world.csv')
rareearth_2024

Unnamed: 0,Source,Country,Type,Prod_t_est_2022,Prod_t_est_2023,Prod_notes,Reserves_t,Reserves_notes,Unnamed: 8,Unnamed: 9,...,Unnamed: 34,Unnamed: 35,Unnamed: 36,Unnamed: 37,Unnamed: 38,Unnamed: 39,Unnamed: 40,Unnamed: 41,Unnamed: 42,Unnamed: 43
0,MCS2024,United States,"Rare earths, mine production, rare-earth-oxide...",42000.0,43000.0,,1800000.0,"Reserves for Australia, Russia, Thailand, and ...",,,...,,,,,,,,,,
1,MCS2024,Australia,"Rare earths, mine production, rare-earth-oxide...",18000.0,18000.0,,5700000.0,"For Australia, Joint Ore Reserves Committee-co...",,,...,,,,,,,,,,
2,MCS2024,Brazil,"Rare earths, mine production, rare-earth-oxide...",80.0,80.0,,21000000.0,,,,...,,,,,,,,,,
3,MCS2024,Burma,"Rare earths, mine production, rare-earth-oxide...",12000.0,38000.0,,,,,,...,,,,,,,,,,
4,MCS2024,Canada,"Rare earths, mine production, rare-earth-oxide...",0.0,0.0,,830000.0,,,,...,,,,,,,,,,
5,MCS2024,China,"Rare earths, mine production, rare-earth-oxide...",210000.0,240000.0,Production quota; does not include undocumente...,44000000.0,,,,...,,,,,,,,,,
6,MCS2024,Greenland,"Rare earths, mine production, rare-earth-oxide...",0.0,0.0,,1500000.0,,,,...,,,,,,,,,,
7,MCS2024,India,"Rare earths, mine production, rare-earth-oxide...",2900.0,2900.0,,6900000.0,,,,...,,,,,,,,,,
8,MCS2024,Madagascar,"Rare earths, mine production, rare-earth-oxide...",960.0,960.0,,,,,,...,,,,,,,,,,
9,MCS2024,Malaysia,"Rare earths, mine production, rare-earth-oxide...",80.0,80.0,,,,,,...,,,,,,,,,,


In [42]:
# 1) Drop every column positioned after 'Reserves_notes' (the last useful column).
#    The position is looked up once and the column Index is sliced from there.
last_useful_col_pos = rareearth_2024.columns.get_loc('Reserves_notes')
remove_cols = list(rareearth_2024.columns[last_useful_col_pos + 1:])

rareearth_2024_no_bad_cols = rareearth_2024.drop(remove_cols, axis=1)

# 2) Drop every row labelled 16 or higher (rows 0-15 are kept); relies on the
#    default 0..n-1 row labels assigned by read_csv.
#    A range is reusable, whereas a generator expression would be exhausted after
#    its first use and make a re-run of the drop silently remove nothing.
remove_rows = range(16, rareearth_2024_no_bad_cols.shape[0])

rareearth_2024_no_bad_cols_rows = rareearth_2024_no_bad_cols.drop(remove_rows, axis=0)

# 3) Drop the columns that are not used in the analysis
remove_unnecessary = ['Source',
                      'Type',
                      'Prod_notes',
                      'Reserves_notes'
                     ]

rareearth_2024_clean = rareearth_2024_no_bad_cols_rows.drop(remove_unnecessary, axis=1)
rareearth_2024_clean

Unnamed: 0,Country,Prod_t_est_2022,Prod_t_est_2023,Reserves_t
0,United States,42000.0,43000.0,1800000.0
1,Australia,18000.0,18000.0,5700000.0
2,Brazil,80.0,80.0,21000000.0
3,Burma,12000.0,38000.0,
4,Canada,0.0,0.0,830000.0
5,China,210000.0,240000.0,44000000.0
6,Greenland,0.0,0.0,1500000.0
7,India,2900.0,2900.0,6900000.0
8,Madagascar,960.0,960.0,
9,Malaysia,80.0,80.0,


# Zirconium and Hafnium dataset

In [41]:
# Load the MCS 2024 zirconium and hafnium world table into pandas and display it.
# Unlike the lithium and rare earths tables, production and reserves here are
# reported in columns suffixed "kt" rather than "t".
zirco_2024=pd.read_csv('mcs2024-zirco_world.csv')
zirco_2024

Unnamed: 0,Source,Country,Type,Prod_kt_est_2022,Prod_kt_est_2023,Prod_notes,Reserves_kt_2023,Reserves_notes
0,MCS2024,United States,"Zirconium ores and zircon concentrates, mine p...",100,100,Data are rounded to the nearest hundred thousa...,500.0,
1,MCS2024,Australia,"Zirconium ores and zircon concentrates, mine p...",500,500,,55000.0,"For Australia, Joint Ore Reserves Committee-co..."
2,MCS2024,China,"Zirconium ores and zircon concentrates, mine p...",140,140,,72.0,"Zirconium reserves for Australia, China, Mozam..."
3,MCS2024,Indonesia,"Zirconium ores and zircon concentrates, mine p...",97,90,,,
4,MCS2024,Kenya,"Zirconium ores and zircon concentrates, mine p...",27,30,,18.0,
5,MCS2024,Madagascar,"Zirconium ores and zircon concentrates, mine p...",27,30,,2300.0,
6,MCS2024,Mozambique,"Zirconium ores and zircon concentrates, mine p...",104,90,,1500.0,"Zirconium reserves for Australia, China, Mozam..."
7,MCS2024,Senegal,"Zirconium ores and zircon concentrates, mine p...",57,50,,2600.0,
8,MCS2024,Sierra Leone,"Zirconium ores and zircon concentrates, mine p...",34,30,,290.0,
9,MCS2024,South Africa,"Zirconium ores and zircon concentrates, mine p...",300,400,,5600.0,"Zirconium reserves for Australia, China, Mozam..."


# Since it's from the MCS as well, it provides info in the same format as the Lithium dataset
### All preprocessing will be done in one block

In [44]:
# Columns that are not used in the analysis
remove_unnecessary = ['Source', 'Type', 'Prod_notes', 'Reserves_notes']

# Keep only the country, production and reserves columns
zirco_2024_clean = zirco_2024.drop(columns=remove_unnecessary)
zirco_2024_clean

Unnamed: 0,Country,Prod_kt_est_2022,Prod_kt_est_2023,Reserves_kt_2023
0,United States,100,100,500.0
1,Australia,500,500,55000.0
2,China,140,140,72.0
3,Indonesia,97,90,
4,Kenya,27,30,18.0
5,Madagascar,27,30,2300.0
6,Mozambique,104,90,1500.0
7,Senegal,57,50,2600.0
8,Sierra Leone,34,30,290.0
9,South Africa,300,400,5600.0


----------------------------------------
Exception happened during processing of request from ('127.0.0.1', 59836)
Traceback (most recent call last):
  File "/Users/kevin/anaconda3/envs/tf_env/lib/python3.7/socketserver.py", line 316, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/Users/kevin/anaconda3/envs/tf_env/lib/python3.7/socketserver.py", line 347, in process_request
    self.finish_request(request, client_address)
  File "/Users/kevin/anaconda3/envs/tf_env/lib/python3.7/socketserver.py", line 360, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/Users/kevin/anaconda3/envs/tf_env/lib/python3.7/socketserver.py", line 720, in __init__
    self.handle()
  File "/Users/kevin/anaconda3/envs/tf_env/lib/python3.7/site-packages/pyspark/accumulators.py", line 262, in handle
    poll(accum_updates)
  File "/Users/kevin/anaconda3/envs/tf_env/lib/python3.7/site-packages/pyspark/accumulators.py", line 235, in poll
 

#### Not using the salient files because they pertain to the US exclusively, and some information is withheld

In [None]:
# Load 'cleaned_wod_data.csv' into pandas.
# NOTE(review): this cell has no execution count, i.e. it has not been run yet.
ocean_2024=pd.read_csv('cleaned_wod_data.csv') # TODO: fix file