<a href="https://colab.research.google.com/github/taddbackus/capstone/blob/main/Working%20RNN/Data_Cleanup.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import numpy as np
import pandas as pd

# Geospatial processing packages
import geopandas as gpd

from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


### Loading dataframe on current active/and closed coal mines (provided by app voices)

In [2]:
#join in overall info
# Read the GeoJSON file
mines = gpd.read_file('/content/drive/MyDrive/OSMRE_GEOMINE_GEOJSON.geojson')
mines = pd.DataFrame(mines)
mines = mines[['company',
               'coalmine_op_status',
               'mine_name',
               'permit_id',
               'national_id',
               'coal_bed_names',
               'inspectable_unit_status',
                'post_smcra',
               'reported_area',
               'permit_application_type',
               'permit_application_date',
               'permit_approval_date',
               'edit_date',
               'area_mine',
               'contour',
               'mountaintop',
               'steep_slope',
               'highwall',
               'auger',
               'contact']].drop_duplicates()

#replace empty strings with NAN
mines = mines.replace(r'^\s*$', np.nan, regex=True)


In [3]:
### Appending the number of unique companies per permit_id to dataframe
Company_check = mines[['permit_id', 'company']].groupby(['permit_id']).nunique().sort_values('company', ascending=False)
Company_check = Company_check.rename(columns={"company": "Number_company_to_permit_id"})
Company_check[Company_check['Number_company_to_permit_id']>1]
mines = pd.merge(mines, Company_check,  how='left', left_on=['permit_id'], right_on = ['permit_id'])

### Adding in State column to mines df
mines['State'] = mines['national_id'].str.slice(0, 2)
mines['State'].unique()
mines = mines[mines['State']. isin(['KY', 'TN', 'WV'])]
mines['State'] = mines['State'].str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mines['State'] = mines['State'].str.lower()


In [4]:
mines['edit_month'] = mines['edit_date'].astype(str).str.slice(5, 7)
mines['edit_year'] = mines['edit_date'].astype(str).str.slice(0,4)
mines = mines.drop(columns='edit_date')
mines['permit_weekday'] = mines['permit_approval_date'].dt.day_name()
mines['permit_approval_month'] = mines['permit_approval_date'].astype(str).str.slice(5, 7)
mines['permit_approval_year'] = mines['permit_approval_date'].astype(str).str.slice(0,4)
mines = mines.drop(columns='permit_approval_date')
mines['permit_application_year'] =  mines['permit_application_date'].astype(str).str.slice(0,4)
mines = mines.drop(columns='permit_application_date')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mines['edit_month'] = mines['edit_date'].astype(str).str.slice(5, 7)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mines['edit_year'] = mines['edit_date'].astype(str).str.slice(0,4)


In [5]:
mines.columns

Index(['company', 'coalmine_op_status', 'mine_name', 'permit_id',
       'national_id', 'coal_bed_names', 'inspectable_unit_status',
       'post_smcra', 'reported_area', 'permit_application_type', 'area_mine',
       'contour', 'mountaintop', 'steep_slope', 'highwall', 'auger', 'contact',
       'Number_company_to_permit_id', 'State', 'edit_month', 'edit_year',
       'permit_weekday', 'permit_approval_month', 'permit_approval_year',
       'permit_application_year'],
      dtype='object')

### Loading dataframes from google earth engine queries

In [89]:
directory_path = '/content/drive/MyDrive/Mines_Data/'
directory_files = os.listdir(directory_path)

df = pd.DataFrame()
z=0
for i in directory_files:
    df_file = pd.read_csv(os.path.join(directory_path, i), encoding = 'ISO-8859-1',low_memory=False)
    df_file['year'] = directory_files[z]
    df_file['year'] = df_file['year'].str.slice(3, 7)
    df_file['end_date'] = df_file['year']+'-07-31'
    df_file['start_date'] = df_file['year']+'-07-01'
    df_file['start_date'] = pd.to_datetime(df_file['start_date'])
    df_file['end_date'] = pd.to_datetime(df_file['end_date'])

    df_file['State'] = directory_files[z]
    df_file['State'] = df_file['State'].str.slice(0, 2)

    df = pd.concat([df, df_file])

    z+=1


In [90]:
len(df)

1358807

### aggregating to averages of land class(the probability) classes

In [91]:
#mapping demo in python https://colab.research.google.com/github/QuantEcon/quantecon-notebooks-datascience/blob/master/applications/maps.ipynb#scrollTo=SI5i7rboSGFL&uniqifier=2
# Read the downloaded file
#419bdf	water, #397d49	trees, #88b053	grass, #7a87c6	flooded_vegetation, #e49635	crops, #dfc35a	shrub_and_scrub, #c4281b	built, #a59b8f	bare, #b39fe1	snow_and_ice

label0 = df[['orig_perm_id', 'Company','mine_id','water', 'start_date', 'State']].groupby(['orig_perm_id', 'State','mine_id', 'start_date','Company']).mean().reset_index()
label1 = df[['orig_perm_id', 'Company','mine_id', 'trees', 'start_date', 'State']].groupby(['orig_perm_id', 'State','mine_id','start_date', 'Company']).mean().reset_index()
label2 = df[['orig_perm_id', 'Company','mine_id', 'grass', 'start_date', 'State']].groupby(['orig_perm_id', 'State','mine_id','start_date', 'Company']).mean().reset_index()
label3 = df[['orig_perm_id', 'Company','mine_id','flooded_vegetation', 'start_date', 'State']].groupby(['orig_perm_id', 'State','mine_id', 'start_date', 'Company']).mean().reset_index()
label4 = df[['orig_perm_id', 'Company','mine_id', 'crops', 'start_date', 'State']].groupby(['orig_perm_id', 'State', 'start_date','mine_id', 'Company']).mean().reset_index().drop_duplicates()
label5 = df[['orig_perm_id', 'Company','mine_id', 'shrub_and_scrub', 'start_date', 'State']].groupby(['orig_perm_id', 'State', 'start_date', 'mine_id','Company']).mean().reset_index()
label6 = df[['orig_perm_id', 'Company','mine_id', 'built', 'start_date', 'State']].groupby(['orig_perm_id', 'State', 'start_date','mine_id', 'Company']).mean().reset_index()
label7 = df[['orig_perm_id', 'Company','mine_id', 'bare', 'start_date', 'State']].groupby(['orig_perm_id', 'State', 'start_date','mine_id', 'Company']).mean().reset_index()
label8 = df[['orig_perm_id', 'Company','mine_id', 'snow_and_ice', 'start_date', 'State']].groupby(['orig_perm_id', 'State','start_date','mine_id', 'Company']).mean().reset_index()

label0 =label0.drop_duplicates()
label1 =label1.drop_duplicates()
label2 =label2.drop_duplicates()
label3 =label3.drop_duplicates()
label4 =label4.drop_duplicates()
label5 =label5.drop_duplicates()
label6 =label6.drop_duplicates()
label7 =label7.drop_duplicates()
label8 =label8.drop_duplicates()

group_df = df[['orig_perm_id', 'start_date', 'mine_id', 'Company', 'State']].groupby(['orig_perm_id', 'start_date', 'mine_id', 'State']).count().reset_index()
group_df = group_df.drop_duplicates()
group_df = group_df.rename(columns={"Company": "Observations"})
df = pd.merge(df, group_df, on=['orig_perm_id', 'start_date', 'mine_id', 'State'], how='left').drop_duplicates()

df = df[['Company', 'start_date', 'Observations', 'orig_perm_id', 'mine_id', 'State']].drop_duplicates()
df = df.reset_index()
df = df.drop(['index'], axis=1)

### joining aggregated values back into main dataframe
df = pd.merge(df, label0, on=['orig_perm_id', 'start_date', 'Company', 'State', 'mine_id'], how='left').drop_duplicates()
df = pd.merge(df, label1, on=['orig_perm_id', 'start_date', 'Company', 'State', 'mine_id'], how='left').drop_duplicates()
df = pd.merge(df, label2, on=['orig_perm_id', 'start_date', 'Company', 'State', 'mine_id'], how='left').drop_duplicates()
df = pd.merge(df, label3, on=['orig_perm_id', 'start_date', 'Company', 'State', 'mine_id'], how='left').drop_duplicates()
df = pd.merge(df, label4, on=['orig_perm_id', 'start_date', 'Company', 'State', 'mine_id'], how='left').drop_duplicates()
df = pd.merge(df, label5, on=['orig_perm_id', 'start_date', 'Company', 'State', 'mine_id'], how='left').drop_duplicates()
df = pd.merge(df, label6, on=['orig_perm_id', 'start_date', 'Company', 'State', 'mine_id'], how='left').drop_duplicates()
df = pd.merge(df, label7, on=['orig_perm_id', 'start_date', 'Company', 'State', 'mine_id'], how='left').drop_duplicates()
df = pd.merge(df, label8, on=['orig_perm_id', 'start_date', 'Company', 'State', 'mine_id'], how='left').drop_duplicates()

df.fillna(0, inplace=True)
df

Unnamed: 0,Company,start_date,Observations,orig_perm_id,mine_id,State,water,trees,grass,flooded_vegetation,crops,shrub_and_scrub,built,bare,snow_and_ice
0,B & C ENERGY INC,2023-07-01,45,8640180,00000000000000001112,ky,0.031113,0.623011,0.059563,0.032779,0.065345,0.062902,0.035015,0.032943,0.036427
1,RIDNER COAL CO INC,2023-07-01,4,9180010,00000000000000001d20,ky,0.031869,0.698788,0.039705,0.029274,0.039489,0.036189,0.033532,0.034271,0.043253
2,HERBERT WELLS,2023-07-01,2,0320080,000000000000000020c7,ky,0.032205,0.539393,0.055222,0.030944,0.088415,0.041989,0.070858,0.030897,0.037347
3,MOUNT VICTORY COAL CO INC,2023-07-01,16,404774X,00000000000000003095,ky,0.033029,0.735718,0.038324,0.029872,0.033263,0.031299,0.029638,0.032630,0.035219
4,EVERGREEN MINING INC,2023-07-01,30,8260501,0000000000000000086a,ky,0.030702,0.743186,0.033933,0.027343,0.029957,0.035815,0.029723,0.036228,0.032969
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32655,LO-MING COAL CORP.,2017-07-01,1,U502397,0000000000000000085a,wv,0.031039,0.587889,0.039089,0.025979,0.048085,0.042070,0.107876,0.037235,0.042496
32656,"APPALACHIAN FUELS, LLC",2017-07-01,67,S304188,0000000000000000057b,wv,0.030835,0.739765,0.032717,0.027254,0.030536,0.037661,0.031239,0.037154,0.032554
32657,"CHICOPEE COAL COMPANY, INC.",2017-07-01,10,S007385,000000000000000007d5,wv,0.027965,0.710045,0.042984,0.025502,0.038912,0.048706,0.032311,0.033129,0.031919
32658,BELVA COAL COMPANY,2017-07-01,10,S009482,00000000000000000869,wv,0.036899,0.544176,0.040389,0.030385,0.046551,0.055853,0.064093,0.120093,0.039626


In [92]:
len(df)

32660

## Created variables for mine_id and perm_id duplicates

In [93]:
df[df['orig_perm_id'].isna()]

Unnamed: 0,Company,start_date,Observations,orig_perm_id,mine_id,State,water,trees,grass,flooded_vegetation,crops,shrub_and_scrub,built,bare,snow_and_ice


In [94]:
df[df['orig_perm_id']=='Z007881']

Unnamed: 0,Company,start_date,Observations,orig_perm_id,mine_id,State,water,trees,grass,flooded_vegetation,crops,shrub_and_scrub,built,bare,snow_and_ice
28534,ELK RIVER SEWELL COAL COMPANY,2023-07-01,9,Z007881,0.0,wv,0.023552,0.742361,0.034854,0.024266,0.034883,0.03491,0.032696,0.030865,0.037824
29547,ELK RIVER SEWELL COAL COMPANY,2021-07-01,12,Z007881,0.0,wv,0.030378,0.741249,0.033265,0.028867,0.033076,0.030943,0.027757,0.035316,0.038792
31050,ELK RIVER SEWELL COAL COMPANY,2019-07-01,3,Z007881,0.0,wv,0.020282,0.741894,0.038053,0.025082,0.038567,0.030875,0.032687,0.027879,0.041148


In [95]:

company_df = df[['Company', 'mine_id']].groupby(['mine_id']).nunique().sort_values('mine_id', ascending=False).reset_index()
company_df = company_df.rename(columns={"Company": "Companies_per_mine_id"})
df = pd.merge(df, company_df, on=['mine_id'], how='left').drop_duplicates()
df

Unnamed: 0,Company,start_date,Observations,orig_perm_id,mine_id,State,water,trees,grass,flooded_vegetation,crops,shrub_and_scrub,built,bare,snow_and_ice,Companies_per_mine_id
0,B & C ENERGY INC,2023-07-01,45,8640180,00000000000000001112,ky,0.031113,0.623011,0.059563,0.032779,0.065345,0.062902,0.035015,0.032943,0.036427,1
1,RIDNER COAL CO INC,2023-07-01,4,9180010,00000000000000001d20,ky,0.031869,0.698788,0.039705,0.029274,0.039489,0.036189,0.033532,0.034271,0.043253,1
2,HERBERT WELLS,2023-07-01,2,0320080,000000000000000020c7,ky,0.032205,0.539393,0.055222,0.030944,0.088415,0.041989,0.070858,0.030897,0.037347,1
3,MOUNT VICTORY COAL CO INC,2023-07-01,16,404774X,00000000000000003095,ky,0.033029,0.735718,0.038324,0.029872,0.033263,0.031299,0.029638,0.032630,0.035219,1
4,EVERGREEN MINING INC,2023-07-01,30,8260501,0000000000000000086a,ky,0.030702,0.743186,0.033933,0.027343,0.029957,0.035815,0.029723,0.036228,0.032969,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32655,LO-MING COAL CORP.,2017-07-01,1,U502397,0000000000000000085a,wv,0.031039,0.587889,0.039089,0.025979,0.048085,0.042070,0.107876,0.037235,0.042496,2
32656,"APPALACHIAN FUELS, LLC",2017-07-01,67,S304188,0000000000000000057b,wv,0.030835,0.739765,0.032717,0.027254,0.030536,0.037661,0.031239,0.037154,0.032554,2
32657,"CHICOPEE COAL COMPANY, INC.",2017-07-01,10,S007385,000000000000000007d5,wv,0.027965,0.710045,0.042984,0.025502,0.038912,0.048706,0.032311,0.033129,0.031919,2
32658,BELVA COAL COMPANY,2017-07-01,10,S009482,00000000000000000869,wv,0.036899,0.544176,0.040389,0.030385,0.046551,0.055853,0.064093,0.120093,0.039626,2


In [96]:

multistate_mine_id = df[['orig_perm_id', 'mine_id']].groupby(['mine_id']).nunique().sort_values('mine_id', ascending=False).reset_index()
multistate_mine_id = multistate_mine_id.rename(columns={"orig_perm_id": "multistate_mine_id"})
df = pd.merge(df, multistate_mine_id, on=['mine_id'], how='left').drop_duplicates()

permid_df = df[['orig_perm_id', 'mine_id']].groupby(['orig_perm_id']).nunique().sort_values('orig_perm_id', ascending=False).reset_index()
permid_df

permid_df = permid_df.rename(columns={"mine_id": "mines_per_perm_id"})
df = pd.merge(df, permid_df, on=['orig_perm_id'], how='left').drop_duplicates()
df

Unnamed: 0,Company,start_date,Observations,orig_perm_id,mine_id,State,water,trees,grass,flooded_vegetation,crops,shrub_and_scrub,built,bare,snow_and_ice,Companies_per_mine_id,multistate_mine_id,mines_per_perm_id
0,B & C ENERGY INC,2023-07-01,45,8640180,00000000000000001112,ky,0.031113,0.623011,0.059563,0.032779,0.065345,0.062902,0.035015,0.032943,0.036427,1,1,2
1,RIDNER COAL CO INC,2023-07-01,4,9180010,00000000000000001d20,ky,0.031869,0.698788,0.039705,0.029274,0.039489,0.036189,0.033532,0.034271,0.043253,1,1,1
2,HERBERT WELLS,2023-07-01,2,0320080,000000000000000020c7,ky,0.032205,0.539393,0.055222,0.030944,0.088415,0.041989,0.070858,0.030897,0.037347,1,1,1
3,MOUNT VICTORY COAL CO INC,2023-07-01,16,404774X,00000000000000003095,ky,0.033029,0.735718,0.038324,0.029872,0.033263,0.031299,0.029638,0.032630,0.035219,1,1,1
4,EVERGREEN MINING INC,2023-07-01,30,8260501,0000000000000000086a,ky,0.030702,0.743186,0.033933,0.027343,0.029957,0.035815,0.029723,0.036228,0.032969,2,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32655,LO-MING COAL CORP.,2017-07-01,1,U502397,0000000000000000085a,wv,0.031039,0.587889,0.039089,0.025979,0.048085,0.042070,0.107876,0.037235,0.042496,2,2,1
32656,"APPALACHIAN FUELS, LLC",2017-07-01,67,S304188,0000000000000000057b,wv,0.030835,0.739765,0.032717,0.027254,0.030536,0.037661,0.031239,0.037154,0.032554,2,2,1
32657,"CHICOPEE COAL COMPANY, INC.",2017-07-01,10,S007385,000000000000000007d5,wv,0.027965,0.710045,0.042984,0.025502,0.038912,0.048706,0.032311,0.033129,0.031919,2,2,1
32658,BELVA COAL COMPANY,2017-07-01,10,S009482,00000000000000000869,wv,0.036899,0.544176,0.040389,0.030385,0.046551,0.055853,0.064093,0.120093,0.039626,2,2,1


In [23]:
#50 mines with numerous mine ids per orig_perm_id
9856-9906

-50

In [97]:
df[['orig_perm_id', 'start_date', 'mine_id', 'Company', 'State']].groupby([ 'State']).count().reset_index()


Unnamed: 0,State,orig_perm_id,start_date,mine_id,Company
0,ky,27873,27873,27873,27873
1,tn,407,407,407,407
2,wv,4380,4380,4380,4380


### Checking mines with different company names for the same permit_id

In [98]:
rogue_dupe_mine = mines[(mines['permit_id']=='80-156') & (mines['post_smcra']==1.0)]

In [99]:
rogue_dupe_mine

Unnamed: 0,company,coalmine_op_status,mine_name,permit_id,national_id,coal_bed_names,inspectable_unit_status,post_smcra,reported_area,permit_application_type,...,auger,contact,Number_company_to_permit_id,State,edit_month,edit_year,permit_weekday,permit_approval_month,permit_approval_year,permit_application_year
302,RICHLAND COAL CO,7,AREA 5,80-156,TN80-156,,,1.0,,,...,,4,2.0,tn,,NaT,,,NaT,NaT


In [100]:
mines = mines[(mines['permit_id']!='80-156')]

In [101]:
mines = mines.append(rogue_dupe_mine)

  mines = mines.append(rogue_dupe_mine)


### Creating empty rows for missing years

In [102]:
df['key'] = df['mine_id']+df['State']
df[['key', 'start_date']].groupby(['key']).nunique().sort_values('key', ascending=False)

df[['key', 'start_date']].groupby(['start_date']).nunique().sort_values('start_date', ascending=False)

Unnamed: 0_level_0,key
start_date,Unnamed: 1_level_1
2023-07-01,9365
2021-07-01,9988
2019-07-01,11033
2017-07-01,2274


In [103]:
date = ['2017-07-01', '2019-07-01', '2021-07-01']
df_clean = pd.DataFrame()
#imputation for missing years by permit_id

for x,i in enumerate(date):
  missing = df['key'][df['start_date']==date[x]].unique()
  df_clean_loop = df.loc[:, df.columns != 'start_date'].query('key not in @missing')

  df_clean_fill = df_clean_loop.groupby(['mine_id', 'orig_perm_id', 'Company', 'State', 'key']).mean().reset_index()
  df_clean_fill['start_date']=date[x]
  df_clean_fill['start_date'] = pd.to_datetime(df_clean_fill['start_date'])
  df_clean = df_clean.append(df_clean_fill)




  df_clean = df_clean.append(df_clean_fill)
  df_clean = df_clean.append(df_clean_fill)
  df_clean = df_clean.append(df_clean_fill)


In [104]:
df['imputation']=0
df_clean['imputation']=1
df = df.append(df_clean)
df[['mine_id', 'start_date']].groupby(['start_date']).nunique().sort_values('start_date', ascending=False)

### Removing any rows missing 2023

all_year_mines = df['mine_id'][df['start_date']=='07-01-2023'].unique()
df = df.query('mine_id in @all_year_mines')

  df = df.append(df_clean)


In [105]:
date = ['2017-07-01', '2019-07-01', '2021-07-01', '2023-07-01']
df_clean = pd.DataFrame()
#imputation for missing years by permit_id

for x,i in enumerate(date):
  missing = df['mine_id'][df['start_date']==date[x]].unique()
  print('if zero, update succesful: ',len(df.query('key in @missing')))

if zero, update succesful:  0
if zero, update succesful:  0
if zero, update succesful:  0
if zero, update succesful:  0


In [106]:
df[['mine_id', 'start_date']].groupby(['start_date']).nunique().sort_values('start_date', ascending=False)

Unnamed: 0_level_0,mine_id
start_date,Unnamed: 1_level_1
2023-07-01,8632
2021-07-01,8632
2019-07-01,8632
2017-07-01,8632


In [71]:
len(df)

40028

In [None]:
# 10 duplicates with join --40,038

In [107]:
df_mines_check = pd.merge(df, mines,  how='left', left_on=['orig_perm_id', 'State'], right_on = ['permit_id', 'State'])

In [83]:
df_mines_check.columns

Index(['Company', 'start_date', 'Observations', 'orig_perm_id', 'mine_id',
       'State', 'water', 'trees', 'grass', 'flooded_vegetation', 'crops',
       'shrub_and_scrub', 'built', 'bare', 'snow_and_ice',
       'Companies_per_mine_id', 'multistate_mine_id', 'mines_per_perm_id',
       'key', 'imputation', 'company', 'coalmine_op_status', 'mine_name',
       'permit_id', 'national_id', 'coal_bed_names', 'inspectable_unit_status',
       'post_smcra', 'reported_area', 'permit_application_type', 'area_mine',
       'contour', 'mountaintop', 'steep_slope', 'highwall', 'auger', 'contact',
       'Number_company_to_permit_id', 'edit_month', 'edit_year',
       'permit_weekday', 'permit_approval_month', 'permit_approval_year',
       'permit_application_year'],
      dtype='object')

In [115]:
mines[mines['permit_id']=='S200989']

Unnamed: 0,company,coalmine_op_status,mine_name,permit_id,national_id,coal_bed_names,inspectable_unit_status,post_smcra,reported_area,permit_application_type,...,auger,contact,Number_company_to_permit_id,State,edit_month,edit_year,permit_weekday,permit_approval_month,permit_approval_year,permit_application_year
22126,,5,,S200989,WVS200989,,,1.0,,,...,0.0,2,0.0,wv,,NaT,Wednesday,8,1989,NaT


In [113]:
df_mines_check[df_mines_check['mine_id']== '000000000000000008a4']

Unnamed: 0,Company,start_date,Observations,orig_perm_id,mine_id,State,water,trees,grass,flooded_vegetation,...,highwall,auger,contact,Number_company_to_permit_id,edit_month,edit_year,permit_weekday,permit_approval_month,permit_approval_year,permit_application_year
5,GLOBAL ENERGY GROUP LLC,2023-07-01,73.0,8260599,000000000000000008a4,ky,0.030102,0.673109,0.048175,0.028032,...,,,,,,,,,,
6384,GLOBAL ENERGY GROUP LLC,2021-07-01,52.0,8260599,000000000000000008a4,ky,0.029589,0.721871,0.04103,0.027131,...,,,,,,,,,,
9689,GLOBAL ENERGY GROUP LLC,2019-07-01,73.0,8260599,000000000000000008a4,ky,0.028061,0.709958,0.048706,0.026464,...,,,,,,,,,,
27510,"UPCO COAL, INC.",2023-07-01,17.0,S200989,000000000000000008a4,wv,0.030358,0.545957,0.111893,0.034343,...,,0.0,2.0,0.0,,NaT,Wednesday,8.0,1989.0,NaT
29871,"UPCO COAL, INC.",2019-07-01,28.0,S200989,000000000000000008a4,wv,0.026041,0.62079,0.095134,0.030605,...,,0.0,2.0,0.0,,NaT,Wednesday,8.0,1989.0,NaT
30594,"UPCO COAL, INC.",2017-07-01,10.0,S200989,000000000000000008a4,wv,0.028234,0.57507,0.118903,0.030061,...,,0.0,2.0,0.0,,NaT,Wednesday,8.0,1989.0,NaT
33379,GLOBAL ENERGY GROUP LLC,2017-07-01,66.0,8260599,000000000000000008a4,ky,0.029251,0.701646,0.04597,0.027209,...,,,,,,,,,,
39879,"UPCO COAL, INC.",2021-07-01,18.333333,S200989,000000000000000008a4,wv,0.028211,0.580606,0.108643,0.03167,...,,0.0,2.0,0.0,,NaT,Wednesday,8.0,1989.0,NaT


In [116]:
df_mines_check[df_mines_check['multistate_mine_id']>1]

Unnamed: 0,Company,start_date,Observations,orig_perm_id,mine_id,State,water,trees,grass,flooded_vegetation,...,highwall,auger,contact,Number_company_to_permit_id,edit_month,edit_year,permit_weekday,permit_approval_month,permit_approval_year,permit_application_year
4,EVERGREEN MINING INC,2023-07-01,30.0,8260501,0000000000000000086a,ky,0.030702,0.743186,0.033933,0.027343,...,0.0,0.0,3.0,1.0,05,2004,Monday,03,2001,NaT
5,GLOBAL ENERGY GROUP LLC,2023-07-01,73.0,8260599,000000000000000008a4,ky,0.030102,0.673109,0.048175,0.028032,...,,,,,,,,,,
9,HERMITAGE COAL CO INC,2023-07-01,2.0,643677X,000000000000000003af,ky,0.033055,0.732339,0.035516,0.031776,...,0.0,0.0,3.0,1.0,07,2000,Thursday,11,1977,NaT
16,ANDERSON COAL CO INC,2023-07-01,3.0,0605067,00000000000000000062,ky,0.028308,0.667007,0.056279,0.026231,...,0.0,0.0,3.0,0.0,11,1999,Saturday,12,1899,NaT
56,STUMP COAL CO,2023-07-01,3.0,463875X,00000000000000000079,ky,0.035129,0.371806,0.162875,0.052762,...,0.0,0.0,3.0,1.0,07,2000,Tuesday,06,1976,NaT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39909,"HOBET MINING, INC.",2021-07-01,4.0,U503590,00000000000000000942,wv,0.035016,0.326425,0.162511,0.057876,...,,,2.0,1.0,,NaT,Monday,04,1991,NaT
39910,STOLLINGS MINING COMPANY,2021-07-01,3.0,U503787,00000000000000000947,wv,0.030625,0.709932,0.038367,0.027663,...,,,2.0,1.0,,NaT,Thursday,08,1987,NaT
39911,RATON FUELS CORP,2021-07-01,24.0,S504086,0000000000000000094a,wv,0.028510,0.739806,0.033548,0.026251,...,,0.0,2.0,0.0,,NaT,Wednesday,10,1987,NaT
39912,"LCC WEST VIRGINIA, LLC",2021-07-01,29.0,S302791,00000000000000000955,wv,0.029573,0.518831,0.096937,0.040442,...,,0.0,2.0,0.0,,NaT,Monday,07,1992,NaT


In [117]:
df_mines_exclude_from_join = df_mines_check[df_mines_check['Number_company_to_permit_id']>1]

In [121]:
df.columns

Index(['Company', 'start_date', 'Observations', 'orig_perm_id', 'mine_id',
       'State', 'water', 'trees', 'grass', 'flooded_vegetation', 'crops',
       'shrub_and_scrub', 'built', 'bare', 'snow_and_ice',
       'Companies_per_mine_id', 'multistate_mine_id', 'mines_per_perm_id',
       'key', 'imputation'],
      dtype='object')

In [124]:
df_mines_check[df_mines_check['Companies_per_mine_id']>1]

Unnamed: 0,Company,start_date,Observations,orig_perm_id,mine_id,State,water,trees,grass,flooded_vegetation,...,highwall,auger,contact,Number_company_to_permit_id,edit_month,edit_year,permit_weekday,permit_approval_month,permit_approval_year,permit_application_year
4,EVERGREEN MINING INC,2023-07-01,30.0,8260501,0000000000000000086a,ky,0.030702,0.743186,0.033933,0.027343,...,0.0,0.0,3.0,1.0,05,2004,Monday,03,2001,NaT
5,GLOBAL ENERGY GROUP LLC,2023-07-01,73.0,8260599,000000000000000008a4,ky,0.030102,0.673109,0.048175,0.028032,...,,,,,,,,,,
9,HERMITAGE COAL CO INC,2023-07-01,2.0,643677X,000000000000000003af,ky,0.033055,0.732339,0.035516,0.031776,...,0.0,0.0,3.0,1.0,07,2000,Thursday,11,1977,NaT
16,ANDERSON COAL CO INC,2023-07-01,3.0,0605067,00000000000000000062,ky,0.028308,0.667007,0.056279,0.026231,...,0.0,0.0,3.0,0.0,11,1999,Saturday,12,1899,NaT
56,STUMP COAL CO,2023-07-01,3.0,463875X,00000000000000000079,ky,0.035129,0.371806,0.162875,0.052762,...,0.0,0.0,3.0,1.0,07,2000,Tuesday,06,1976,NaT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39909,"HOBET MINING, INC.",2021-07-01,4.0,U503590,00000000000000000942,wv,0.035016,0.326425,0.162511,0.057876,...,,,2.0,1.0,,NaT,Monday,04,1991,NaT
39910,STOLLINGS MINING COMPANY,2021-07-01,3.0,U503787,00000000000000000947,wv,0.030625,0.709932,0.038367,0.027663,...,,,2.0,1.0,,NaT,Thursday,08,1987,NaT
39911,RATON FUELS CORP,2021-07-01,24.0,S504086,0000000000000000094a,wv,0.028510,0.739806,0.033548,0.026251,...,,0.0,2.0,0.0,,NaT,Wednesday,10,1987,NaT
39912,"LCC WEST VIRGINIA, LLC",2021-07-01,29.0,S302791,00000000000000000955,wv,0.029573,0.518831,0.096937,0.040442,...,,0.0,2.0,0.0,,NaT,Monday,07,1992,NaT


In [122]:
df[df['Companies_per_mine_id']==1]

Unnamed: 0,Company,start_date,Observations,orig_perm_id,mine_id,State,water,trees,grass,flooded_vegetation,crops,shrub_and_scrub,built,bare,snow_and_ice,Companies_per_mine_id,multistate_mine_id,mines_per_perm_id,key,imputation
0,B & C ENERGY INC,2023-07-01,45.0,8640180,00000000000000001112,ky,0.031113,0.623011,0.059563,0.032779,0.065345,0.062902,0.035015,0.032943,0.036427,1.0,1.0,2.0,00000000000000001112ky,0
1,RIDNER COAL CO INC,2023-07-01,4.0,9180010,00000000000000001d20,ky,0.031869,0.698788,0.039705,0.029274,0.039489,0.036189,0.033532,0.034271,0.043253,1.0,1.0,1.0,00000000000000001d20ky,0
2,HERBERT WELLS,2023-07-01,2.0,0320080,000000000000000020c7,ky,0.032205,0.539393,0.055222,0.030944,0.088415,0.041989,0.070858,0.030897,0.037347,1.0,1.0,1.0,000000000000000020c7ky,0
3,MOUNT VICTORY COAL CO INC,2023-07-01,16.0,404774X,00000000000000003095,ky,0.033029,0.735718,0.038324,0.029872,0.033263,0.031299,0.029638,0.032630,0.035219,1.0,1.0,1.0,00000000000000003095ky,0
6,BER COAL LLC,2023-07-01,6.0,8260670,000000000000000008bf,ky,0.032317,0.463962,0.098910,0.029844,0.068912,0.083604,0.054121,0.035295,0.036912,1.0,1.0,2.0,000000000000000008bfky,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1457,STUMP COAL COMPANY INC,2021-07-01,1.5,2985253,00000000000000002fbe,ky,0.027484,0.730087,0.032807,0.026845,0.036434,0.035243,0.031670,0.034360,0.040043,1.0,1.0,1.0,00000000000000002fbeky,1
1463,ELKHORN EAGLE MINING COMPANY INC,2021-07-01,1.0,2985273,00000000000000002fca,ky,0.029705,0.730464,0.035289,0.027582,0.032840,0.038326,0.034288,0.037754,0.033707,1.0,1.0,1.0,00000000000000002fcaky,1
1467,JEROLD AND JEFFREY COAL COMPANY INC,2021-07-01,1.5,2985288,00000000000000002fd2,ky,0.026915,0.738387,0.041519,0.026679,0.033050,0.036183,0.032921,0.030864,0.033014,1.0,1.0,1.0,00000000000000002fd2ky,1
1468,LITTLE HACKNEY CREEK COAL CO,2021-07-01,1.5,2985319,00000000000000002fdc,ky,0.027767,0.748742,0.030551,0.028099,0.036952,0.026954,0.033159,0.029888,0.036835,1.0,1.0,1.0,00000000000000002fdcky,1


In [118]:
df_mines_check[df_mines_check['orig_perm_id']=='0070008']

Unnamed: 0,Company,start_date,Observations,orig_perm_id,mine_id,State,water,trees,grass,flooded_vegetation,...,highwall,auger,contact,Number_company_to_permit_id,edit_month,edit_year,permit_weekday,permit_approval_month,permit_approval_year,permit_application_year
1338,XINERGY CORP,2023-07-01,11.0,70008,00000000000000000611,ky,0.030341,0.47053,0.133311,0.037069,...,,,,,,,,,,
1339,STRAIGHT CREEK COAL MINING INC,2023-07-01,11.0,70008,00000000000000000613,ky,0.030341,0.47053,0.133311,0.037069,...,,,,,,,,,,
1340,REVELATION ENERGY LLC,2023-07-01,11.0,70008,00000000000000000615,ky,0.030341,0.47053,0.133311,0.037069,...,,,,,,,,,,
1351,STRAIGHT CREEK MINING CO,2023-07-01,3.0,70008,00000000000000001e6f,ky,0.023888,0.557904,0.121477,0.031908,...,,,,,,,,,,
1353,STRAIGHT CREEK COAL RESOURCES COMPANY,2023-07-01,7.0,70008,00000000000000003098,ky,0.028015,0.444472,0.155607,0.038106,...,,,,,,,,,,
7718,XINERGY CORP,2021-07-01,19.0,70008,00000000000000000611,ky,0.032005,0.612869,0.050876,0.030512,...,,,,,,,,,,
7719,STRAIGHT CREEK COAL MINING INC,2021-07-01,19.0,70008,00000000000000000613,ky,0.032005,0.612869,0.050876,0.030512,...,,,,,,,,,,
7720,REVELATION ENERGY LLC,2021-07-01,19.0,70008,00000000000000000615,ky,0.032005,0.612869,0.050876,0.030512,...,,,,,,,,,,
7731,STRAIGHT CREEK MINING CO,2021-07-01,6.0,70008,00000000000000001e6f,ky,0.029631,0.645189,0.04472,0.033482,...,,,,,,,,,,
7733,STRAIGHT CREEK COAL RESOURCES COMPANY,2021-07-01,11.0,70008,00000000000000003098,ky,0.030143,0.483082,0.063127,0.033888,...,,,,,,,,,,


In [78]:
df_mines_check[['orig_perm_id', 'mine_id', 'State', 'Number_company_to_permit_id']].groupby(['orig_perm_id',  'State']).nunique().sort_values('orig_perm_id', ascending=False).reset_index()



Unnamed: 0,orig_perm_id,State,mine_id,Number_company_to_permit_id
0,Z008881,wv,1,1
1,Z008781,wv,1,1
2,Z008181,wv,1,1
3,Z007881,wv,1,1
4,Z007781,wv,1,1
...,...,...,...,...
8855,0070019,ky,1,1
8856,0070017,ky,1,1
8857,0070015,ky,1,1
8858,0070010,ky,1,1


In [77]:
df_mines_check[['orig_perm_id', 'mine_id', 'State', 'Number_company_to_permit_id']].groupby(['orig_perm_id', 'Number_company_to_permit_id', 'State']).nunique().sort_values('orig_perm_id', ascending=False).reset_index()



Unnamed: 0,orig_perm_id,Number_company_to_permit_id,State,mine_id
0,Z008881,1.0,wv,1
1,Z008781,1.0,wv,1
2,Z008181,0.0,wv,1
3,Z007881,1.0,wv,1
4,Z007781,0.0,wv,1
...,...,...,...,...
7004,0070020,0.0,ky,1
7005,0070019,0.0,ky,1
7006,0070017,0.0,ky,1
7007,0070015,0.0,ky,1


In [66]:
#testing out skipping this merge
#df_merge = pd.merge(df, mines,  how='left', left_on=['orig_perm_id', 'State'], right_on = ['permit_id', 'State'])

In [125]:
df_merge=df

In [126]:
directory_path = '/content/drive/MyDrive/Mines_Data_Clean/Precipitation/'
directory_files = os.listdir(directory_path)

precip = pd.DataFrame()
z=0
for i in directory_files:
    df_file = pd.read_csv(os.path.join(directory_path, i), encoding = 'ISO-8859-1',low_memory=False)
    df_file['start_date_array'] = df_file['start_date_array'].str.split(" ", 1, expand=True)[1]
    df_file['start_date_array'] = df_file['start_date_array'].str.slice(0, 13)
    df_file['start_date'] = pd.to_datetime(df_file['start_date_array'])

    df_file['orig_perm_id'] = df_file['orig_perm_id'].str.split(" ", 1, expand=True)[1]
    df_file['orig_perm_id'] = df_file['orig_perm_id'].str.slice(0, 13)
    df_file['orig_perm_id'] = df_file['orig_perm_id'].str.replace('\nNa', '')

    precip = pd.concat([precip, df_file])

    z+=1

  df_file['start_date_array'] = df_file['start_date_array'].str.split(" ", 1, expand=True)[1]
  df_file['orig_perm_id'] = df_file['orig_perm_id'].str.split(" ", 1, expand=True)[1]
  df_file['start_date_array'] = df_file['start_date_array'].str.split(" ", 1, expand=True)[1]
  df_file['orig_perm_id'] = df_file['orig_perm_id'].str.split(" ", 1, expand=True)[1]
  df_file['start_date_array'] = df_file['start_date_array'].str.split(" ", 1, expand=True)[1]
  df_file['orig_perm_id'] = df_file['orig_perm_id'].str.split(" ", 1, expand=True)[1]
  df_file['start_date_array'] = df_file['start_date_array'].str.split(" ", 1, expand=True)[1]
  df_file['orig_perm_id'] = df_file['orig_perm_id'].str.split(" ", 1, expand=True)[1]
  df_file['start_date_array'] = df_file['start_date_array'].str.split(" ", 1, expand=True)[1]
  df_file['orig_perm_id'] = df_file['orig_perm_id'].str.split(" ", 1, expand=True)[1]
  df_file['start_date_array'] = df_file['start_date_array'].str.split(" ", 1, expand=True)[1]
  df_f

### Precipitation Data

In [127]:
precip = precip.drop_duplicates()
precip = precip.drop(columns = 'start_date_array')
precip = precip[precip.columns[1:]]
keep_columns =['start_date', 'orig_perm_id', 'WaterQuality', 'Max Temperature', 'Precipitation']
precip = precip[keep_columns]

date = ['2017-07-01', '2019-07-01', '2021-07-01', '2023-07-01']
precip_clean = pd.DataFrame()
#imputation for missing years by permit_id
keep_columns =['orig_perm_id', 'WaterQuality', 'Max Temperature', 'Precipitation']

for x,i in enumerate(date):
  print(date[x])
  missing = precip['orig_perm_id'][precip['start_date']==date[x]]
  #finds permit_id that don't have a row for that year
  precip_clean_loop = precip[keep_columns].query('orig_perm_id not in @missing')

  precip_clean_fill = precip_clean_loop.groupby(['orig_perm_id']).mean().reset_index()
  precip_clean_fill['start_date'] = date[x]
  precip_clean_fill['start_date'] = pd.to_datetime(precip_clean_fill['start_date'])
  precip_clean = precip_clean.append(precip_clean_fill)
  missing = precip_clean['orig_perm_id'][precip_clean['start_date']==date[x]].unique()
  print('if zero, update succesful: ',len(precip_clean.query('orig_perm_id not in @missing')))

  precip_clean = precip_clean.append(precip)

precip_clean['orig_perm_id'] = precip_clean['orig_perm_id'].str.strip()
precip_clean = precip_clean.groupby(['orig_perm_id', 'start_date']).mean().reset_index()

  precip_clean = precip_clean.append(precip_clean_fill)
  precip_clean = precip_clean.append(precip)
  precip_clean = precip_clean.append(precip_clean_fill)
  precip_clean = precip_clean.append(precip)
  precip_clean = precip_clean.append(precip_clean_fill)
  precip_clean = precip_clean.append(precip)
  precip_clean = precip_clean.append(precip_clean_fill)
  precip_clean = precip_clean.append(precip)


2017-07-01
if zero, update succesful:  0
2019-07-01
if zero, update succesful:  0
2021-07-01
if zero, update succesful:  0
2023-07-01
if zero, update succesful:  0


In [128]:
precip_clean[precip_clean['orig_perm_id']=='8640180']

Unnamed: 0,orig_perm_id,start_date,WaterQuality,Max Temperature,Precipitation
23920,8640180,2017-07-01,-0.459511,299.720001,14.72
23921,8640180,2019-07-01,-0.682684,304.200012,6.2
23922,8640180,2021-07-01,-0.74047,295.899994,28.799999
23923,8640180,2023-07-01,0.044621,298.399994,3.6


In [129]:
len(df_merge)

40028

In [130]:
df_merge_all = pd.merge(df_merge, precip_clean[['WaterQuality', 'orig_perm_id', 'start_date', 'Max Temperature', 'Precipitation']],  how='left', left_on=['orig_perm_id', 'start_date'], right_on = ['orig_perm_id', 'start_date'])

In [131]:
for i in df_merge_all.columns:
    if df_merge_all[i].isna().sum() > 0:
        print('{} has {}% NAs'.format(i,(df_merge_all[i].isna().sum() / len(df_merge_all))*100))

WaterQuality has 15.936344558808834% NAs
Max Temperature has 15.928849805136405% NAs
Precipitation has 15.928849805136405% NAs


In [132]:

# Read the GeoJSON file
ky = gpd.read_file('/content/drive/MyDrive/KY_Released_mines.geojson')

# Read data using GeoPandas
print("Data dimensions: {}".format(mines.shape))
columns = ['id', 'QUAD_DESC', 'Type_Flag', 'PER_TYPE', 'Source', 'MINE_STATU', 'FeatCLS', 'REGION_DES']
ky=ky[columns]
ky['State_alt']='ky'

tn = gpd.read_file('/content/drive/MyDrive/TN_Released_mines.geojson')

# Read data using GeoPandas
print("Data dimensions: {}".format(mines.shape))
columns = ['id', 'national_i', 'Shape_Area', 'permit_a_2']
tn=tn[columns]
tn['State_alt']='tn'

wv = gpd.read_file('/content/drive/MyDrive/WV_Released_mines.geojson')

# Read data using GeoPandas
print("Data dimensions: {}".format(mines.shape))
columns = ['id', 'ACRES_CURR', 'ACRES_DIST', 'ACRES_ORIG','ACRES_CURR', 'ACRES_RECL', 'EXPIRE_DAT', 'FACILITY_N', 'MAPTYPE', 'OPERATOR', 'PMLU2']
wv=wv[columns]
wv['State_alt']='wv'


Data dimensions: (21930, 25)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ky['State_alt']='ky'


Data dimensions: (21930, 25)
Data dimensions: (21930, 25)


In [133]:
ky = pd.merge(df_merge_all, ky,  how='inner', left_on=['mine_id', 'State'], right_on = ['id', 'State_alt']).drop_duplicates()
tn = pd.merge(df_merge_all, tn,  how='inner', left_on=['mine_id', 'State'], right_on = ['id', 'State_alt']).drop_duplicates()
wv = pd.merge(df_merge_all, wv,  how='inner', left_on=['mine_id', 'State'], right_on = ['id', 'State_alt']).drop_duplicates()

In [179]:
columns_to_drop = ['State_alt', 'id']
all_mines = ky.sort_values(by=['mine_id', 'orig_perm_id', 'start_date']).reset_index()
all_mines = all_mines.drop(columns = 'index')
#all_mines['edit_year'] = all_mines['edit_year'].astype(str)
#all_mines['edit_year'] = all_mines['edit_year'].str.replace('.0','')
#all_mines['edit_year'] = all_mines['edit_year'].astype(object)

## Imputation with mode


In [180]:
all_mines = all_mines.drop(columns = columns_to_drop)

In [182]:
all_mines.orig_perm_id = all_mines.orig_perm_id.str.replace('[a-zA-Z]' , '')
all_mines.mine_id = all_mines.mine_id.str.replace('[a-zA-Z]' , '')

  all_mines.orig_perm_id = all_mines.orig_perm_id.str.replace('[a-zA-Z]' , '')


In [184]:
#Impute with mode for these columns
columns_to_impute_with_mode = ['Company','State',  'QUAD_DESC', 'Type_Flag', 'PER_TYPE', 'Source', 'MINE_STATU', 'FeatCLS', 'REGION_DES']

all_mines[columns_to_impute_with_mode] = all_mines[columns_to_impute_with_mode].fillna(all_mines[columns_to_impute_with_mode].mode().iloc[0])
nan_count = all_mines.isna().sum()
nan_count_sorted = nan_count.sort_values(ascending=False)
print(nan_count_sorted.head(12))

WaterQuality       12
Precipitation       9
Max Temperature     9
Company             0
start_date          0
FeatCLS             0
MINE_STATU          0
Source              0
PER_TYPE            0
Type_Flag           0
QUAD_DESC           0
imputation          0
dtype: int64


In [190]:
all_mines['WaterQuality'] = all_mines['WaterQuality'].fillna(all_mines['WaterQuality'].mode().iloc[0])
all_mines['Precipitation'] = all_mines['Precipitation'].fillna(all_mines['Precipitation'].mode().iloc[0])
all_mines['Max Temperature'] = all_mines['Max Temperature'].fillna(all_mines['Max Temperature'].mode().iloc[0])

In [None]:
## Scaling and one hot encoding the columns


In [243]:
from sklearn.preprocessing import MinMaxScaler

columns_to_encode = [ 'start_date', 'Company','State',  'QUAD_DESC', 'Type_Flag', 'PER_TYPE', 'Source', 'MINE_STATU', 'FeatCLS', 'REGION_DES']
#,'permit_application_year', 'imputation', 'permit_approval_year', 'permit_approval_month', 'edit_year', 'edit_month', 'permit_weekday',
all_mines_encoded = pd.get_dummies(all_mines[columns_to_encode], columns=columns_to_encode)

column_to_exclude = ['trees', 'water', 'snow_and_ice','grass','bare', 'built', 'shrub_and_scrub', 'crops', 'flooded_vegetation']

#one hot encoding categorical variables for model
cols = all_mines.columns
num_cols = all_mines._get_numeric_data().columns
num_cols =list(set(num_cols) - set(column_to_exclude))
num_cols =list(set(num_cols) - set(columns_to_encode))

#creating dataframe of numeric columns
scaler = MinMaxScaler(feature_range = (0, 1))
num_df = scaler.fit_transform(all_mines[num_cols])
num_df =pd.DataFrame(num_df,columns=all_mines[num_cols].columns, index=all_mines[num_cols].index)

all_mines_encoded = pd.merge(all_mines_encoded, num_df, left_index=True, right_index=True)
all_mines_encoded['trees'] = all_mines['trees']
#very quick imputation
#all_mines_encoded = all_mines_encoded.fillna(0)

In [244]:
all_mines_encoded.columns

Index(['start_date_2017-07-01 00:00:00', 'start_date_2019-07-01 00:00:00',
       'start_date_2021-07-01 00:00:00', 'start_date_2023-07-01 00:00:00',
       'Company_#8 LTD OF VIRGINIA', 'Company_17 WEST MINING INC',
       'Company_1ST CAPITAL ENERGY AND COAL LLC', 'Company_2 MILE COAL LLC',
       'Company_3 M DEVELOPMENT CO', 'Company_4 M COAL & ENERGY',
       ...
       'REGION_DES_PRESTONSBURG', 'imputation', 'mines_per_perm_id',
       'multistate_mine_id', 'Observations', 'WaterQuality', 'Max Temperature',
       'Precipitation', 'Companies_per_mine_id', 'trees'],
      dtype='object', length=2671)

In [None]:
## Prepare RNN Dataset for Random Forest


In [245]:
#rf Test and Train dataset

columns = all_mines_encoded.columns
column_to_exclude = ['mine_name','permit_id','trees', 'water', 'snow_and_ice','grass','bare', 'flooded_vegetation', 'built', 'shrub_and_scrub', 'crops']
columns = list(set(columns) - set(column_to_exclude))
selected_columns = all_mines_encoded[columns].columns

sequence_length=4

features = all_mines_encoded[columns]
target = all_mines_encoded['trees']

from sklearn.model_selection import cross_val_predict
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

features['mine_id']=ky['mine_id']

X_trainrf, X_testrf, y_trainrf, y_testrf = train_test_split(features, target, test_size=0.3,  random_state=0, stratify=features['mine_id'])

In [238]:
X_trainrf = X_trainrf[selected_columns]
X_testrf = X_testrf[selected_columns]

from sklearn.model_selection import StratifiedKFold, GridSearchCV
skf = StratifiedKFold(n_splits=10,random_state=807,shuffle=True)
skf.get_n_splits(features,target)

rf = RandomForestRegressor(random_state=0,criterion='squared_error')
params = {'n_estimators':[100, 200],'max_features':['sqrt','log2',20]}
rf_clf = GridSearchCV(rf,param_grid=params,cv=skf,n_jobs = -1)

rf.fit(X_trainrf, y_trainrf)

rf_pred=rf.predict(X_testrf)

mse = mean_squared_error(y_testrf, rf_pred)
mae = mean_absolute_error(y_testrf, rf_pred)
r2 = r2_score(y_testrf, rf_pred)

print('\nMean Squared Error (MSE):', mse)
print('Mean Absolute Error (MAE):', mae)
print('R-squared (R^2):', r2)


ValueError: ignored

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

feature_importance =rf.feature_importances_

forest_importances = pd.Series(feature_importance, index=selected_columns.columns)
forest_importances = pd.DataFrame(forest_importances)

idx = forest_importances[0].sort_values(ascending = False).head(30).index
forest_importances2 = forest_importances.loc[idx]
data=forest_importances2.reset_index()

plt.style.use('ggplot')
plt.figure(figsize = (15,8))
sns.barplot(x=0, y='index', data=data).set(title='Feature Importance from Random Forest')

In [156]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense
from keras.layers import LSTM
from keras.layers import Dropout

In [207]:
all_mines_encoded

Unnamed: 0,start_date,Company,State,QUAD_DESC,Type_Flag,PER_TYPE,Source,MINE_STATU,FeatCLS,REGION_DES,...,REGION_DES_PRESTONSBURG,imputation,mines_per_perm_id,multistate_mine_id,Observations,WaterQuality,Max Temperature,Precipitation,Companies_per_mine_id,trees
0,2017-07-01,LEVISA ENERGIES INC,ky,LANCER,INACT,PERMANENT,KGS Project,FF,SF,PRESTONSBURG,...,1,0.0,0.2,0.5,0.000000,0.509580,0.378685,0.147984,0.5,0.664179
1,2019-07-01,LEVISA ENERGIES INC,ky,LANCER,INACT,PERMANENT,KGS Project,FF,SF,PRESTONSBURG,...,1,0.0,0.2,0.5,0.000279,0.318621,0.653061,0.000000,0.5,0.718107
2,2021-07-01,LEVISA ENERGIES INC,ky,LANCER,INACT,PERMANENT,KGS Project,FF,SF,PRESTONSBURG,...,1,0.0,0.2,0.5,0.000139,0.063699,0.176871,0.341842,0.5,0.733225
3,2023-07-01,LEVISA ENERGIES INC,ky,LANCER,INACT,PERMANENT,KGS Project,FF,SF,PRESTONSBURG,...,1,0.0,0.2,0.5,0.000139,0.934140,0.306122,0.102109,0.5,0.742873
4,2017-07-01,PREMIUM ELKHORN COAL CORPORATION,ky,MCDOWELL,TRNS,PERMANENT,KGS Project,RC,SF,PRESTONSBURG,...,1,1.0,0.6,1.0,0.000186,0.376519,0.337869,0.189789,1.0,0.691618
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33656,2023-07-01,BRANHAM & BAKER COAL COMPANY INC,ky,DAVID,INACT,PERMANENT,KGS Project,RC,SF,PRESTONSBURG,...,1,0.0,0.2,0.0,0.000557,0.798932,0.265308,0.103219,0.0,0.740254
33657,2017-07-01,BRANHAM & BAKER COAL COMPANY INC,ky,PRESTONSBURG,INACT,PERMANENT,KGS Project,RC,SF,PRESTONSBURG,...,1,1.0,0.2,0.0,0.000232,0.209398,0.346940,0.143544,0.0,0.736873
33658,2019-07-01,BRANHAM & BAKER COAL COMPANY INC,ky,PRESTONSBURG,INACT,PERMANENT,KGS Project,RC,SF,PRESTONSBURG,...,1,0.0,0.2,0.0,0.000418,0.240052,0.632654,0.000000,0.0,0.739368
33659,2021-07-01,BRANHAM & BAKER COAL COMPANY INC,ky,PRESTONSBURG,INACT,PERMANENT,KGS Project,RC,SF,PRESTONSBURG,...,1,0.0,0.2,0.0,0.000139,0.055933,0.129253,0.329634,0.0,0.734218


## Prepare RNN Dataset with sequences





In [246]:
columns = all_mines_encoded.columns
column_to_exclude = ['trees', 'water', 'snow_and_ice','grass','bare', 'flooded_vegetation', 'built', 'shrub_and_scrub', 'crops']
columns = list(set(columns) - set(column_to_exclude))
selected_columns = all_mines_encoded[columns].columns

sequence_length=4

features = all_mines_encoded[selected_columns]

target = all_mines_encoded['trees']
X, y, y_index = [], [], []
for i in range(len(features) - sequence_length):

   X.append(features.iloc[i:i + sequence_length].values)
   y.append(target.iloc[i + sequence_length])
   i=i+sequence_length

X=np.array(X)
y=np.array(y)

features['mine_id']=ky['mine_id']

for i in range(len(features) - sequence_length):
   y_index.append(features.mine_id.iloc[i + sequence_length])
   i=i+sequence_length

y_index=np.array(y_index)

In [247]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2505,  random_state=0, stratify=y_index)

In [248]:
print(len(y_test))
print(len(X_test))
print(len(y_train))
print(len(X_train))

8432
8432
25225
25225


## RNN Model

In [307]:
from tensorflow.keras.callbacks import EarlyStopping
feature_length = X_train.shape[0]
#Li and L2 regularization
#from keras import optimizers

# fix random seed for reproducibility
tf.random.set_seed(42)
regressor = Sequential()
regressor.add(LSTM(units = 50, input_shape=(sequence_length,
                   X_train.shape[2]),
                   return_sequences = True,
                   activation = 'relu'))

regressor.add(LSTM(units = 50, return_sequences = True, recurrent_dropout=0.3))
regressor.add(Dropout(0.4))
regressor.add(LSTM(50))
regressor.add(Dropout(0.4))
regressor.add(Dense(30))
regressor.add(Dense(15))
regressor.add(Dense(1))
#adding in a higher learning rate
Adam = tf.keras.optimizers.Adam(learning_rate=0.002)

regressor.compile(optimizer = Adam, loss =tf.keras.losses.mean_squared_error)
es = EarlyStopping(monitor='val_loss', mode='min', patience=5, restore_best_weights=True)
regressor.fit(X_train, y_train, epochs=50, batch_size=32, validation_data= (X_test, y_test), callbacks=[es])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50


<keras.src.callbacks.History at 0x7e380958cfa0>

In [308]:
y_pred = regressor.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print('\nMean Squared Error (MSE):', mse)
print('Mean Absolute Error (MAE):', mae)
print('R-squared (R^2):', r2)


Mean Squared Error (MSE): 0.0065307550558090606
Mean Absolute Error (MAE): 0.05252459845197405
R-squared (R^2): 0.5817663138977406


In [309]:
from tensorflow.keras.callbacks import EarlyStopping
feature_length = X_train.shape[0]
#Li and L2 regularization
#from keras import optimizers

# fix random seed for reproducibility
tf.random.set_seed(42)
regressor = Sequential()
regressor.add(LSTM(units = 50, input_shape=(sequence_length,
                   X_train.shape[2]),
                   return_sequences = True,
                   activation = 'relu'))

regressor.add(LSTM(units = 50, return_sequences = True, recurrent_dropout=0.3))
regressor.add(Dropout(0.4))
regressor.add(LSTM(50))
regressor.add(Dropout(0.4))
regressor.add(Dense(30))
regressor.add(Dense(15))
regressor.add(Dense(1))
#adding in a higher learning rate
Adam = tf.keras.optimizers.Adam()

regressor.compile(optimizer = Adam, loss =tf.keras.losses.mean_squared_error)
es = EarlyStopping(monitor='val_loss', mode='min', patience=5, restore_best_weights=True)
regressor.fit(X_train, y_train, epochs=150, batch_size=32, validation_data= (X_test, y_test), callbacks=[es])

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150


<keras.src.callbacks.History at 0x7e381a3aa800>

In [310]:
y_pred = regressor.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print('\nMean Squared Error (MSE):', mse)
print('Mean Absolute Error (MAE):', mae)
print('R-squared (R^2):', r2)


Mean Squared Error (MSE): 0.006389482247150281
Mean Absolute Error (MAE): 0.05055985151781519
R-squared (R^2): 0.5908135139544668


In [301]:
from tensorflow.keras.callbacks import EarlyStopping
feature_length = X_train.shape[0]
#Li and L2 regularization
#from keras import optimizers

# fix random seed for reproducibility
tf.random.set_seed(42)
regressor = Sequential()
regressor.add(LSTM(units = 50, input_shape=(sequence_length,
                   X_train.shape[2]),
                   return_sequences = True,
                   activation = 'relu'))

regressor.add(LSTM(units = 50, return_sequences = True, recurrent_dropout=0.3))
regressor.add(Dropout(0.4))
regressor.add(LSTM(50))
regressor.add(Dropout(0.4))
regressor.add(Dense(30))
regressor.add(Dense(15))
regressor.add(Dense(1))
#adding in a higher learning rate
Adam = tf.keras.optimizers.Adam(learning_rate=0.003)

regressor.compile(optimizer = Adam, loss =tf.keras.losses.mean_squared_error)
es = EarlyStopping(monitor='val_loss', mode='min', patience=5, restore_best_weights=True)
regressor.fit(X_train, y_train, epochs=150, batch_size=32, validation_data= (X_test, y_test), callbacks=[es])

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150


<keras.src.callbacks.History at 0x7e381ecd8730>

In [302]:
y_pred = regressor.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print('\nMean Squared Error (MSE):', mse)
print('Mean Absolute Error (MAE):', mae)
print('R-squared (R^2):', r2)


Mean Squared Error (MSE): 0.006766774330693718
Mean Absolute Error (MAE): 0.05510329881486498
R-squared (R^2): 0.5666514901931845


In [303]:
y_pred = regressor.predict(X)
mse = mean_squared_error(y, y_pred)
mae = mean_absolute_error(y, y_pred)
r2 = r2_score(y, y_pred)

print('\nMean Squared Error (MSE):', mse)
print('Mean Absolute Error (MAE):', mae)
print('R-squared (R^2):', r2)


Mean Squared Error (MSE): 0.004555522152455117
Mean Absolute Error (MAE): 0.04538507692724247
R-squared (R^2): 0.7129070664917526


In [288]:
#pip install keras_tuner

Collecting keras_tuner
  Downloading keras_tuner-1.4.5-py3-none-any.whl (129 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.5/129.5 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting keras-core (from keras_tuner)
  Downloading keras_core-0.1.7-py3-none-any.whl (950 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m950.8/950.8 kB[0m [31m23.7 MB/s[0m eta [36m0:00:00[0m
Collecting kt-legacy (from keras_tuner)
  Downloading kt_legacy-1.0.5-py3-none-any.whl (9.6 kB)
Collecting namex (from keras-core->keras_tuner)
  Downloading namex-0.0.7-py3-none-any.whl (5.8 kB)
Installing collected packages: namex, kt-legacy, keras-core, keras_tuner
Successfully installed keras-core-0.1.7 keras_tuner-1.4.5 kt-legacy-1.0.5 namex-0.0.7
