In [5]:

# general data and data viz imports
import pandas as pd
import os

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

from copy import copy
# raise the display limits so wide / long frames are shown in full
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 3000)
# render floats with thousands separators and two decimals (no scientific notation)
pd.set_option('display.float_format', '{:,.2f}'.format)

%matplotlib inline

In [11]:
# Read the suppression table from the project's data folder.
# An alternative source was left commented out by the author:
#   pd.read_excel('../data/Data Checklist.xlsx')
suppression = pd.read_csv('../data/fire_supp_data.csv')

# preview the last five rows
suppression.tail(5)

Unnamed: 0,Year,Fires,Acres,Forest Service,DOI Agencies,Total
30,2015,68151,10125149,1713000000,417543000,2130543000
31,2016,67595,5503538,1603806000,371739000,1975545000
32,2017,71499,10026086,2410165000,508000000,2918165000
33,2018,58083,8767492,2615256000,528000000,3143256000
34,2019,50477,4664364,1150000000,440000000,1590000000


In [263]:
# keep only the six columns used below, in this order
wanted_columns = ['Year', 'Fires', 'Acres', 'Forest Service', 'DOI Agencies', 'Total']
suppression = suppression[wanted_columns]
suppression.dtypes

Year              float64
Fires             float64
Acres              object
Forest Service     object
DOI Agencies       object
Total              object
dtype: object

In [264]:
# cast every column to string so the regex-based cleaning below can treat all cells alike
suppression = suppression.astype(str)
suppression.dtypes

Year              object
Fires             object
Acres             object
Forest Service    object
DOI Agencies      object
Total             object
dtype: object

In [265]:
####################################### preprocessing function ##########################
# meant to be applied element-wise to a Series, or called on a single string
import re


def pre_process(sentences):
    r'''
    Reduce a piece of free text to ASCII letters separated by single spaces.

    inputs:
    sentences = text (any object is accepted; it is converted with str() first)

    returns:
    str made up only of the letters A-Z / a-z and spaces. Trailing whitespace is
    removed; leading whitespace can remain when the input starts with a digit
    or a symbol (e.g. '$100 spent' -> '  spent').

    description:
    Removes the emoticons ':d' and ':p', urls (http / https), special
    characters, digits and new line breaks ('\n'). Spaces between words are
    kept. Because digits are removed, this is NOT suitable for numeric columns.

    use cases:
    df['new_column'] = df['text_column'].apply(pre_process)
    OR individual strings.
    '''
    text = str(sentences)

    # removing emoticons
    # NOTE(review): only the lowercase forms are matched - presumably the text
    # is lowercased upstream; confirm against callers.
    text = re.sub(':d', '', text).strip()
    text = re.sub(':p', '', text).strip()

    # removing urls (raw string: the pattern relies on regex escapes such as \s and \w)
    text = re.sub(r'(https?:\/\/)(\s)*(www\.)?(\s)*((\w|\s)+\.)*([\w\-\s]+\/)*([\w\-]+)((\?)?[\w\s]*=\s*[\w\%&]*)*', '  ', text)

    # collapsing every run of non-alphanumeric characters (punctuation, '$',
    # whitespace, '\n' line breaks) into a single space
    # (https://stackoverflow.com/questions/5843518/remove-all-special-characters-punctuation-and-spaces-from-string)
    text = re.sub('[^A-Za-z0-9]+', ' ', text)

    # dropping the remaining digits, then any trailing whitespace
    text = re.sub(r'[^a-zA-Z\s]', '', text).rstrip()

    return text

In [266]:
# pre_process is intentionally not applied here: it targets free text and removes
# digits, and DataFrame.apply would hand it whole columns rather than single cells.
# Strip the dollar sign and every period from all (string) cells in one pass.
# NOTE(review): removing periods also fuses the ".0" of float-formatted strings
# into the number (e.g. "2019.0" -> "20190", visible in the output below); later
# cells compensate by trimming - confirm before changing this rule.
suppression = suppression.replace(r'[$.]', '', regex=True)

suppression.tail(3)

Unnamed: 0,Year,Fires,Acres,Forest Service,DOI Agencies,Total
34,20190.0,504770.0,4664364,1150000000,440000,1590000000
35,,,5 year average,1956295500,430295200,2337931600
36,,,10 year average,1437740333,378492800,1843759100


In [267]:
# drop the two trailing summary rows (labels 35 and 36: the "5 year average" and
# "10 year average" lines); rebinding with errors='ignore' instead of an inplace
# drop keeps this cell safe to re-run once the rows are already gone
suppression = suppression.drop(index=[35, 36], errors='ignore')
suppression.tail(3)

Unnamed: 0,Year,Fires,Acres,Forest Service,DOI Agencies,Total
32,20170,71499,10026086,2410165000,508000000,2918165000
33,20180,58083,8767492,2615256000,528000000,3143256000
34,20190,504770,4664364,1150000000,440000,1590000000


In [268]:
# year is still stored as text at this point; it is converted to int further down
suppression.dtypes.head(2)

Year     object
Fires    object
dtype: object

In [269]:
# normalise the column labels to lower case
suppression.columns = suppression.columns.map(str.lower)
suppression.columns

Index(['year', 'fires', 'acres', 'forest service', 'doi agencies', 'total'], dtype='object')

In [270]:
# remove any period left in the year strings
suppression['year'] = suppression['year'].replace(r'\.', '', regex=True)

def split_on_period(obs):
    '''
    Return the four-digit year at the start of obs as an int.

    inputs:
    obs = year value such as '1985' or '20190' (a float-formatted year whose
          period was stripped); non-string values are converted with str() first,
          so already-converted ints / floats are accepted too

    description:
    Despite the name, nothing is split: only the first four characters are
    kept, which discards whatever trails the year (e.g. a leftover '0').

    raises:
    ValueError if the first four characters do not form an integer.
    '''
    return int(str(obs)[:4])
# turn every year string into an int, then confirm the resulting dtype
suppression['year'] = suppression['year'].map(split_on_period)
suppression.dtypes.head(2)

year      int64
fires    object
dtype: object

In [271]:
# preview the first three cleaned rows
suppression.head(n=3)

Unnamed: 0,year,fires,acres,forest service,doi agencies,total
0,1985,82591,2896147,161505000,78438000,239943000
1,1986,859070,2719162,111625000,91153000,202778000
2,1987,713,2447296,253657000,81452000,335109000


In [272]:
# total number of missing cells across the whole frame
# NOTE(review): every column was cast with astype('str') earlier, which may have
# turned NaN into the literal text 'nan' - confirm that a zero here is meaningful
suppression.isna().sum().sum()

0

### Dealing with fires column

After inspection, every fire count should be five digits long, so shorter values are padded with trailing zeros (and longer values are cut to their first five digits) to restore the correct order of magnitude.

In [291]:
# Every fire count should be five digits long: right-pad shorter values with
# zeros and cut longer ones down to their first five characters.
FIRE_COUNT_DIGITS = 5
suppression['fires'] = (
    suppression['fires']
    .astype(str)                          # one type throughout; also makes the cell re-runnable
    .str.ljust(FIRE_COUNT_DIGITS, '0')    # e.g. '713'    -> '71300'
    .str[:FIRE_COUNT_DIGITS]              # e.g. '859070' -> '85907'
)
suppression.head()

Unnamed: 0,year,fires,acres,forest service,doi agencies,total
0,1985,82591,2896147,161505000,78438000,239943000
1,1986,85907,2719162,111625000,91153000,202778000
2,1987,71300,2447296,253657000,81452000,335109000
3,1988,72750,500929,429609000,149317000,578926000
4,1989,48949,1827310,331672000,168115000,499787000


In [None]:
# Checking the digit length of each acres observation.
# Unlike the fire counts, acreage has no single valid width: the raw data holds
# both 7- and 8-digit values (e.g. 5503538 and 10125149), so padding / truncating
# to a fixed length would corrupt correct observations. This cell therefore only
# reports how many observations have each digit length and leaves the data as is.
# TODO: decide on a per-row correction for acres values that lost a digit.
acre_digit_counts = (
    suppression['acres']
    .astype(str)
    .str.len()
    .value_counts()
    .sort_index()
)
acre_digit_counts