# Load the dataset

In [1]:
import pandas as pd

# Show every column when a DataFrame is displayed, so wide frames are not truncated
pd.set_option('display.max_columns', None)

In [2]:
# Read the raw dataset; the path is relative, so the CSV must sit in the working directory
everest_df = pd.read_csv('mount_everest_deaths.csv')
# Preview the first rows to check that the file was parsed as expected
everest_df.head()

Unnamed: 0,No.,Name,Date,Age,Expedition,Nationality,Cause of death,Location
0,1,Dorje,"June 7, 1922",,1922 British Mount Everest Expedition,Nepal,Avalanche,Below North Col
1,2,Lhakpa,"June 7, 1922",,1922 British Mount Everest Expedition,Nepal,Avalanche,Below North Col
2,3,Norbu,"June 7, 1922",,1922 British Mount Everest Expedition,Nepal,Avalanche,Below North Col
3,4,Pasang,"June 7, 1922",,1922 British Mount Everest Expedition,Nepal,Avalanche,Below North Col
4,5,Pema,"June 7, 1922",,1922 British Mount Everest Expedition,Nepal,Avalanche,Below North Col


In [3]:
# Summarise column dtypes and non-null counts
everest_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 310 entries, 0 to 309
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   No.             310 non-null    int64  
 1   Name            310 non-null    object 
 2   Date            310 non-null    object 
 3   Age             160 non-null    float64
 4   Expedition      271 non-null    object 
 5   Nationality     309 non-null    object 
 6   Cause of death  296 non-null    object 
 7   Location        291 non-null    object 
dtypes: float64(1), int64(1), object(6)
memory usage: 19.5+ KB


### Check for null values

In [4]:
# Names of the columns that hold at least one null value, in column order.
# `isnull().any()` yields one boolean per column, which is used to mask the column index.
vars_with_na = everest_df.columns[everest_df.isnull().any()].tolist()

vars_with_na

['Age', 'Expedition', 'Nationality', 'Cause of death', 'Location']

The columns above contain null values.
Next, show the percentage of null values in each of these columns.

In [5]:
# Percentage of missing entries for each affected column, largest share first
(
    everest_df[vars_with_na]
    .isnull()
    .mean()
    .sort_values(ascending=False)
    .mul(100)
)

Age               48.387097
Expedition        12.580645
Location           6.129032
Cause of death     4.516129
Nationality        0.322581
dtype: float64

48% of the values in the Age column are empty.

### Unique values

In [6]:
# For each column, print the number of distinct values next to the number of rows
for column_name, column in everest_df.items():
    print('unique values of', column_name, ':', len(column.unique()), '/', len(column))

unique values of No. : 310 / 310
unique values of Name : 303 / 310
unique values of Date : 196 / 310
unique values of Age : 48 / 310
unique values of Expedition : 142 / 310
unique values of Nationality : 37 / 310
unique values of Cause of death : 78 / 310
unique values of Location : 120 / 310


Having so many unique values in such a small number of records makes it more difficult to draw meaningful insights from the data.

# Tableau dashboard prep

I will be creating a dashboard to draw some insights into the relationship between the number of deaths, the date and the location.

#### 1. Separate the Date column into:
 - Year
 - Month
 - Day
 
This will help with sorting by date and with exploring the relationship between deaths and dates.

In [7]:
# One independent, initially empty list per date component
year_list, month_list, day_list = [], [], []

In [8]:
# English month names in calendar order; a name's position + 1 is its month number.
_MONTH_NAMES = ('january', 'february', 'march', 'april',
                'may', 'june', 'july', 'august', 'september',
                'october', 'november', 'december')

# Lookup table built once (not on every call). Each month is reachable through
# its full name and through its three-letter abbreviation ("aug", "dec", ...).
_MONTH_NUMBERS = {
    alias: number
    for number, name in enumerate(_MONTH_NAMES, start=1)
    for alias in (name, name[:3])
}


def month_to_numbers(month):
    """Convert an English month name to its month number, returned as a string.

    Matching ignores case and surrounding whitespace, and accepts either the
    full name ("August") or the three-letter abbreviation ("Aug", "Aug.").

    Parameters
    ----------
    month : str
        Month name, e.g. ``'June'``, ``'june'`` or ``'Jun'``.

    Returns
    -------
    str
        The month number without zero padding, e.g. ``'6'``.

    Raises
    ------
    KeyError
        If ``month`` is not a recognised month name.
    """
    key = month.strip().lower().rstrip('.')
    try:
        return str(_MONTH_NUMBERS[key])
    except KeyError:
        # Same exception type as a plain dict lookup, but naming the offending value
        raise KeyError(f'unrecognised month name: {month!r}') from None

In [9]:
# Row 36 is the odd one out: its Date value does not follow the structure of the other rows
everest_df.loc[36, 'Date'].split(' ')

['Aug-75']

In [10]:
# Start from empty lists so that re-running this cell never appends a second copy
# of every row (the later column assignment would then fail on a length mismatch).
year_list, month_list, day_list = [], [], []

date_list = everest_df['Date']
for raw_date in date_list:
    parts = raw_date.split(' ')
    if len(parts) == 3:
        # Regular "Month D, YYYY" entry, e.g. "June 7, 1922"
        month_name, day_token, year_token = parts
        year_list.append(year_token)
        month_list.append(month_to_numbers(month_name))
        # Strip only the trailing comma instead of blindly dropping the last character
        day_list.append(day_token.rstrip(','))
    else:
        # Irregular entry such as "Aug-75": abbreviated month, two-digit year, no day.
        # Parse it instead of assuming every irregular row is August 1975.
        # NOTE(review): a two-digit year is ambiguous for a dataset that starts in
        # 1922; %y maps 69-99 to 19xx and 00-68 to 20xx -- confirm for any new rows.
        month_year = pd.to_datetime(raw_date, format='%b-%y', errors='coerce')
        if pd.isnull(month_year):
            # Unrecognised format: leave the parts empty rather than guessing
            year_list.append(None)
            month_list.append(None)
        else:
            # Stored as strings to match the values produced by the regular branch
            year_list.append(str(month_year.year))
            month_list.append(str(month_year.month))
        day_list.append(None)

In [11]:
# Attach the parsed date components to the frame as new columns, in this order
for column_name, values in (('Year', year_list),
                            ('Month', month_list),
                            ('Day', day_list)):
    everest_df[column_name] = values

everest_df.head()

Unnamed: 0,No.,Name,Date,Age,Expedition,Nationality,Cause of death,Location,Year,Month,Day
0,1,Dorje,"June 7, 1922",,1922 British Mount Everest Expedition,Nepal,Avalanche,Below North Col,1922,6,7
1,2,Lhakpa,"June 7, 1922",,1922 British Mount Everest Expedition,Nepal,Avalanche,Below North Col,1922,6,7
2,3,Norbu,"June 7, 1922",,1922 British Mount Everest Expedition,Nepal,Avalanche,Below North Col,1922,6,7
3,4,Pasang,"June 7, 1922",,1922 British Mount Everest Expedition,Nepal,Avalanche,Below North Col,1922,6,7
4,5,Pema,"June 7, 1922",,1922 British Mount Everest Expedition,Nepal,Avalanche,Below North Col,1922,6,7


#### 2. Group the causes of death, because there are too many unique values

In [12]:
# Lift the row-display limit so the whole frequency table is shown
pd.set_option('display.max_rows', None)

# Number of records per recorded cause of death, most frequent first
(
    everest_df
    .groupby('Cause of death')['No.']
    .count()
    .sort_values(ascending=False)
)

Cause of death
Avalanche                                                                                                                                                    50
Fall                                                                                                                                                         46
Exposure                                                                                                                                                     25
Altitude sickness                                                                                                                                            17
Base Camp avalanche following the April 2015 Nepal earthquake                                                                                                16
2014 Mount Everest Avalanche                                                                                                                                 15
Disappearance (likely acc

In [13]:
# Accumulator list; it is passed to CoD_less_unique, which appends one label per row to it
cause_list = []
# Raw cause-of-death column that is to be condensed into fewer groups
CoD = everest_df['Cause of death']
# Ordered grouping rules: (keyword, first_word_only, grouped label).
# The first rule whose keyword is found wins, so the order is significant.
# When first_word_only is True the keyword is searched only in the first
# space-separated word of the (lower-cased) text, otherwise in the whole text.
# NOTE(review): matching is by plain substring, so e.g. 'fall' also matches
# a text containing "icefall" -- confirm that this is intended.
_COD_RULES = (
    ('avalanche', False, 'avalanche'),
    ('fall', False, 'fall'),
    ('altitude sickness', False, 'altitude sickness'),
    ('mountain sickness', False, 'altitude sickness'),
    ('hape', False, 'altitude sickness'),
    ('hace', False, 'altitude sickness'),
    ('altitude', True, 'altitude sickness'),
    ('exposure', True, 'exposure'),
    ('hypothermia', False, 'exposure'),
    ('exhaustion', True, 'exhaustion'),
    ('disappear', False, 'unknown'),
    ('unknown', False, 'unknown'),
    ('possible', True, 'unknown'),
    ('heart attack', False, 'cardiac event'),
    ('cardiac', False, 'cardiac event'),
    ('stroke', False, 'stroke'),
)


def CoD_less_unique(df_list, new_list=None, df=None):
    """Collapse free-text causes of death into a smaller set of grouped labels.

    Each string value is lower-cased and compared against the ordered keyword
    rules; the label of the first matching rule is used. Strings that match no
    rule are kept unchanged (original casing). Values that are not strings
    (such as NaN / None for missing data) become ``'un-recorded'``.

    Parameters
    ----------
    df_list : iterable
        Raw cause-of-death values, e.g. a pandas Series or a list. Values are
        read by iteration, so the index of a Series does not matter.
    new_list : list, optional
        List the labels are appended to. A new list is created when omitted.
    df : optional
        Not used; kept so that existing three-argument calls keep working.

    Returns
    -------
    list
        ``new_list`` with one label appended per input value, in input order.
    """
    if new_list is None:
        new_list = []

    for value in df_list:
        # Test each value on its own: comparing against the type of the first
        # element would invert the logic whenever the first row is missing.
        if not isinstance(value, str):
            new_list.append('un-recorded')
            continue

        text = value.lower()
        first_word = text.split(' ')[0]
        for keyword, first_word_only, label in _COD_RULES:
            searched = first_word if first_word_only else text
            if keyword in searched:
                new_list.append(label)
                break
        else:
            # No rule matched: keep the original description as its own group
            new_list.append(value)

    return new_list
    

In [14]:
# CoD_less_unique appends to the list it is given. Empty the shared accumulator
# first so that re-running this cell does not return more labels than there are
# rows (which would make the column assignment fail).
cause_list.clear()
everest_df['grouped_CoD'] = CoD_less_unique(CoD, cause_list, everest_df)

In [15]:
# Re-check which columns hold at least one null value now that columns have been added.
# `isnull().any()` yields one boolean per column, which is used to mask the column index.
vars_with_na = everest_df.columns[everest_df.isnull().any()].tolist()

vars_with_na

['Age', 'Expedition', 'Nationality', 'Cause of death', 'Location', 'Day']

In [16]:
# Preview the frame to confirm that the grouped_CoD column was added
everest_df.head()

Unnamed: 0,No.,Name,Date,Age,Expedition,Nationality,Cause of death,Location,Year,Month,Day,grouped_CoD
0,1,Dorje,"June 7, 1922",,1922 British Mount Everest Expedition,Nepal,Avalanche,Below North Col,1922,6,7,avalanche
1,2,Lhakpa,"June 7, 1922",,1922 British Mount Everest Expedition,Nepal,Avalanche,Below North Col,1922,6,7,avalanche
2,3,Norbu,"June 7, 1922",,1922 British Mount Everest Expedition,Nepal,Avalanche,Below North Col,1922,6,7,avalanche
3,4,Pasang,"June 7, 1922",,1922 British Mount Everest Expedition,Nepal,Avalanche,Below North Col,1922,6,7,avalanche
4,5,Pema,"June 7, 1922",,1922 British Mount Everest Expedition,Nepal,Avalanche,Below North Col,1922,6,7,avalanche


#### 3. Group the recorded locations of death, because there are too many unique values

In [17]:
# Number of records per recorded location, most frequent first
(
    everest_df
    .groupby('Location')['No.']
    .count()
    .sort_values(ascending=False)
)

Location
Icefall                                                                 36
Base Camp                                                               23
N.E. Ridge                                                              19
Below North Col                                                         13
8600m N.E. Ridge                                                         8
Near Summit                                                              7
South Col                                                                7
6400m                                                                    7
Balcony                                                                  5
8700m N.E. Ridge                                                         5
8000m S.W. Ridge                                                         4
8400m S.E. Ridge                                                         4
7200m W ridge                                                            4
8500m N.E. Ridge

In [18]:
# Accumulator list; it is passed to loc_less_unique, which appends one label per row to it
loc_list = []
# Raw location column that is to be condensed into fewer groups
location = everest_df['Location']

def loc_less_unique(df_list, new_list=None, df=None):
    """Collapse free-text death locations into a smaller set of grouped labels.

    Each string value is lower-cased and run through an ordered chain of
    checks (named features first, then camps, then leading elevation digits);
    the first check that matches decides the label. Strings that match nothing
    are kept in lower case. Values that are not strings (such as NaN / None for
    missing data) become ``'un-recorded'``.

    Parameters
    ----------
    df_list : iterable
        Raw location values, e.g. a pandas Series or a list. Values are read by
        iteration, so the index of a Series does not matter.
    new_list : list, optional
        List the labels are appended to. A new list is created when omitted.
    df : optional
        Not used; kept so that existing three-argument calls keep working.

    Returns
    -------
    list
        ``new_list`` with one label appended per input value, in input order.
    """
    if new_list is None:
        new_list = []

    for i, value in enumerate(df_list):
        # Test each value on its own: comparing against the type of the first
        # element would invert the logic whenever the first row is missing.
        if not isinstance(value, str):
            new_list.append('un-recorded')
            continue

        text = value.lower()
        words = text.split(' ')
        # First two characters; for entries that start with an elevation these
        # are its thousands and hundreds digits (e.g. '86' for "8600m ...").
        prefix = text[0:2]

        # --- named features and ridges ---
        if 'icefall' in text:
            label = 'icefall'
        elif 'base camp' in text:
            label = 'base camp'
        # NOTE(review): ' ne' also matches words such as " near" -- confirm intended.
        elif 'n.e.' in text or ' ne' in text or 'north-east' in text:
            label = 'n.e. ridge'
        elif 's.e.' in text or 'se ridge' in text:
            label = 's.e. ridge'
        elif 's.w.' in text:
            label = 's.w. ridge'
        elif 'w ridge' in text or 'west ridge' in text:
            label = 'w. ridge'
        elif 'north col' in text:
            label = 'north col'
        elif 'south col' in text:
            label = 'south col'

        # --- camps ---
        # NOTE(review): position 287 is hard-coded as a camp 4 record; this only
        # holds for the current row order of the dataset -- verify if the data changes.
        elif ('camp iv, in the tent' in text
              or 'camp iv (south side) in tent' in text
              or 'near camp iv' in text
              or i == 287):
            label = 'camp 4'
        elif ((len(words) == 3 and words[2] == 'iii')
              or (len(words) == 5 and words[1] == 'iii')):
            label = 'camp 3'
        elif len(words) == 2 and ' ii' in text:
            label = 'camp 2'
        elif len(words) == 2 and 'camp i' in text:
            label = 'camp 1'
        elif 'between camp' in text and ' i ' in text:
            label = 'between camp 1 and 2'
        elif 'between camp' in text and ' iii ' in text:
            label = 'between camp 2 and 3'

        # --- other named places ---
        elif 'hornbein couloir' in text:
            label = 'hornbein couloir'
        elif 'norton couloir' in text or 'great couloir' in text:
            label = 'norton couloir'
        elif ' summit' in text or '100 me' in text:
            label = 'around the summit'
        elif 'balcony' in text:
            label = 'balcony'

        # --- elevation bands, keyed on the leading two digits ---
        # NOTE(review): only the prefixes listed here are banded; any other
        # leading digits (e.g. '64') fall through to the final else branch.
        elif prefix in ('85', '86', '87'):
            label = '8500 - 8750m'
        elif prefix in ('80', '82', '83'):
            label = '8000 - 8500m'
        elif prefix in ('79', '77'):
            label = '7500 - 8000m'
        elif prefix in ('73', '72', '70'):
            label = '7000 - 7500m'
        elif prefix in ('67', '69'):
            label = '6500 - 7000m'
        elif prefix in ('61', '62'):
            label = '6000 - 6500m'
        elif prefix == '55':
            label = '5500 - 6000m'

        else:
            # No rule matched: keep the lower-cased description as its own group
            label = text

        new_list.append(label)

    return new_list


In [19]:
# loc_less_unique appends to the list it is given. Empty the shared accumulator
# first so that re-running this cell does not return more labels than there are
# rows (which would make the column assignment fail).
loc_list.clear()
everest_df['grouped_location'] = loc_less_unique(location, loc_list, everest_df)

In [20]:
# Final check of which columns hold at least one null value.
# `isnull().any()` yields one boolean per column, which is used to mask the column index.
vars_with_na = everest_df.columns[everest_df.isnull().any()].tolist()

vars_with_na

['Age', 'Expedition', 'Nationality', 'Cause of death', 'Location', 'Day']

In [21]:
# Preview the frame to confirm that the grouped_location column was added
everest_df.head()

Unnamed: 0,No.,Name,Date,Age,Expedition,Nationality,Cause of death,Location,Year,Month,Day,grouped_CoD,grouped_location
0,1,Dorje,"June 7, 1922",,1922 British Mount Everest Expedition,Nepal,Avalanche,Below North Col,1922,6,7,avalanche,north col
1,2,Lhakpa,"June 7, 1922",,1922 British Mount Everest Expedition,Nepal,Avalanche,Below North Col,1922,6,7,avalanche,north col
2,3,Norbu,"June 7, 1922",,1922 British Mount Everest Expedition,Nepal,Avalanche,Below North Col,1922,6,7,avalanche,north col
3,4,Pasang,"June 7, 1922",,1922 British Mount Everest Expedition,Nepal,Avalanche,Below North Col,1922,6,7,avalanche,north col
4,5,Pema,"June 7, 1922",,1922 British Mount Everest Expedition,Nepal,Avalanche,Below North Col,1922,6,7,avalanche,north col


# Save the new dataset

In [22]:
# Write the enriched dataset for the Tableau dashboard.
# index=False keeps the pandas row index out of the file: it would otherwise be
# written as an extra, unnamed first column that the source CSV did not have
# (the 'No.' column already numbers the records).
everest_df.to_csv('new_mount_everest_deaths.csv', index=False)