# Appendix - Python Code and Outputs

### Data Preparation

In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last
# one. The cells below rely on this to show both a preview table and a shape.
InteractiveShell.ast_node_interactivity = "all"

### Import Extracted Training Data

In [2]:
import numpy as np
import pandas as pd
# Load the training data from train.csv (path is relative to the working directory).
disaster_training_data = pd.read_csv('train.csv')

# Preview the first 100 rows of the data. This expression is displayed even
# though it is not the last one, because ast_node_interactivity is set to "all".
disaster_training_data.head(100)

# Show the frame's dimensions as a (rows, columns) tuple.
disaster_training_data.shape

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
95,137,accident,Charlotte,9 Mile backup on I-77 South...accident blockin...,1
96,138,accident,"Baton Rouge, LA",Has an accident changed your life? We will hel...,0
97,139,accident,"Hagerstown, MD",#BREAKING: there was a deadly motorcycle car a...,1
98,141,accident,"Gloucestershire , UK",@flowri were you marinading it or was it an ac...,0


(7613, 5)

In [3]:
# Preview the first ten rows of the training data.
disaster_training_data.head(10)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
5,8,,,#RockyFire Update => California Hwy. 20 closed...,1
6,10,,,#flood #disaster Heavy rain causes flash flood...,1
7,13,,,I'm on top of the hill and I can see a fire in...,1
8,14,,,There's an emergency evacuation happening now ...,1
9,15,,,I'm afraid that the tornado is coming to our a...,1


### Investigation of Missing Data and Outliers in Training Data

In [5]:
# Per-column missing-value statistics for the training data.
null_count = disaster_training_data.isnull().sum()
# Derive the percentage from the counts above rather than scanning the frame again.
null_percentage = null_count * 100 / len(disaster_training_data)
column_type = disaster_training_data.dtypes

# Combine the three Series into one table indexed by column name.
null_summary = pd.concat(
    [null_count, null_percentage, column_type],
    axis=1,
    keys=['Missing Count', 'Percentage Missing', 'Column Type'],
)

# Keep only the columns with at least one missing value, most affected first.
null_summary_only_missing = (
    null_summary
    .loc[null_count != 0]
    .sort_values('Percentage Missing', ascending=False)
)
null_summary_only_missing

Unnamed: 0,Missing Count,Percentage Missing,Column Type
location,2533,33.272035,object
keyword,61,0.801261,object


The analysis above shows that `location` is missing from 2,533 rows (about 33.3%) and `keyword` is missing from 61 rows (about 0.8%) of the 7,613 rows in the training data.

In [None]:
# TODO: deal with the missing `location` and `keyword` values found above (not yet implemented)


### Import Testing Data

In [8]:
# Load the test data from test.csv (path is relative to the working directory).
disaster_testing_data = pd.read_csv('test.csv')

# Preview the first ten rows of the data. This expression is displayed even
# though it is not the last one, because ast_node_interactivity is set to "all".
disaster_testing_data.head(10)

# Show the frame's dimensions as a (rows, columns) tuple.
disaster_testing_data.shape

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan
5,12,,,We're shaking...It's an earthquake
6,21,,,They'd probably still show more life than Arse...
7,22,,,Hey! How are you?
8,27,,,What a nice hat?
9,29,,,Fuck off!


(3263, 4)

### Investigation of Missing Data and Outliers in Testing Data

In [9]:
# Per-column missing-value statistics for the testing data.
# NOTE: this reassigns the same variable names used for the training-data summary.
null_count = disaster_testing_data.isnull().sum()
# Derive the percentage from the counts above rather than scanning the frame again.
null_percentage = null_count * 100 / len(disaster_testing_data)
column_type = disaster_testing_data.dtypes

# Combine the three Series into one table indexed by column name.
null_summary = pd.concat(
    [null_count, null_percentage, column_type],
    axis=1,
    keys=['Missing Count', 'Percentage Missing', 'Column Type'],
)

# Keep only the columns with at least one missing value, most affected first.
null_summary_only_missing = (
    null_summary
    .loc[null_count != 0]
    .sort_values('Percentage Missing', ascending=False)
)
null_summary_only_missing

Unnamed: 0,Missing Count,Percentage Missing,Column Type
location,1105,33.864542,object
keyword,26,0.796813,object


The analysis above shows that `location` is missing from 1,105 rows (about 33.9%) and `keyword` is missing from 26 rows (about 0.8%) of the 3,263 rows in the testing data.

In [None]:
# deal with missing values
