In [1]:
# importing standard Python libraries for data analysis
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime as dt

In [2]:
# Lift pandas' display truncation so every row and every column of a frame is rendered
for option_name in ('display.max_columns', 'display.max_rows'):
    pd.set_option(option_name, None)

In [3]:
# Read the accident records from the Excel workbook and preview the first rows
df = pd.read_excel(io="accidents_massifs_T.xlsx")
df.head()

Unnamed: 0,code accident,date,heure,département,commune,massif,region,massif_unique,site,coordonnées\nzone départ,altitude,inclinaison,orientation,groupe,emportés,ensevelis \ntête,dead,injured,indemnes,activite_regrouped
0,1011-74-02,2010-12-04,13:05,74,la clusaz,aravis,alps,Aravis,combe borderan,"6°28'30"""" 45°53'10''",2350.0,,NO,3.0,3,1,0,3,0,hiking
1,1011-74-08,2010-12-26,10:55,74,le reposoir,aravis,alps,Aravis,col de l'encrenaz,,1950.0,30-34,S,3.0,1,1,0,1,0,hiking
2,1011-74-11,2011-03-20,12:05,74,magland,aravis,alps,Aravis,face nord pointe d'Areu,32t 0312804 5094955,2430.0,35-39,NO,3.0,1,1,1,0,0,hiking
3,1011-63-01,2010-11-28,14:00,63,le mont dore,sancy,other,other,puy redon / couloir de l'envers du redon,"45°31'58.0"" 2°48'23""",1725.0,40-44,NO,2.0,1,0,0,1,0,hiking
4,1213-74-02,2012-12-06,12:45,74,serraval,aravis,alps,Aravis,sulens,32t 0295672 5078087,1710.0,40-44,O,2.0,2,1,1,0,1,hiking


In [4]:
# List the column names so they can be copied verbatim into later cells
# (some contain accented characters and embedded newlines, so retyping them is error-prone)
df.columns

Index(['code accident', 'date', 'heure', 'département', 'commune', 'massif',
       'region', 'massif_unique', 'site', 'coordonnées\nzone départ',
       'altitude', 'inclinaison', 'orientation', 'groupe', 'emportés',
       'ensevelis \ntête', 'dead', 'injured', 'indemnes',
       'activite_regrouped'],
      dtype='object')

In [5]:
# Remove the columns that are not needed for the visualizations
unused_columns = [
    'code accident',
    'coordonnées\nzone départ',
    'département',
    'commune',
    'site',
]
df = df.drop(columns=unused_columns)

In [6]:
# Show the column names that remain after the drop, as the basis for translating them
df.columns

Index(['date', 'heure', 'massif', 'region', 'massif_unique', 'altitude',
       'inclinaison', 'orientation', 'groupe', 'emportés', 'ensevelis \ntête',
       'dead', 'injured', 'indemnes', 'activite_regrouped'],
      dtype='object')

In [7]:
# Translate the French column names into English.
# An explicit old -> new mapping is used instead of assigning a positional list to
# df.columns, so a change in column order can never attach a label to the wrong data.
# Columns that are already in English (date, massif, region, ...) are left untouched.
french_to_english = {
    'heure': 'hour',
    'inclinaison': 'inclination',
    'groupe': 'size_of_group',
    'emportés': 'transported',
    'ensevelis \ntête': 'with head buried in snow',
    'indemnes': 'unharmed',
}
df = df.rename(columns=french_to_english)

In [8]:
# Inspect dtypes and non-null counts per column to see where values are missing
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 15 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   date                      569 non-null    datetime64[ns]
 1   hour                      373 non-null    object        
 2   massif                    569 non-null    object        
 3   region                    569 non-null    object        
 4   massif_unique             569 non-null    object        
 5   altitude                  330 non-null    float64       
 6   inclination               299 non-null    object        
 7   orientation               331 non-null    object        
 8   size_of_group             438 non-null    float64       
 9   transported               569 non-null    int64         
 10  with head buried in snow  569 non-null    int64         
 11  dead                      569 non-null    int64         
 12  injured               

In [9]:
# Store altitude and group size as integers; missing values are replaced by 0 first,
# because a plain int column cannot hold NaN
for numeric_col in ("altitude", "size_of_group"):
    df[numeric_col] = df[numeric_col].fillna(0).astype(int)

In [10]:
# Some records have no group size even though they report transported people etc.
# Where the group size is 0 (i.e. it was missing), fall back to the value of the
# transported column.
group_size_missing = df["size_of_group"] == 0
df["size_of_group"] = np.where(
    group_size_missing,
    df["transported"],
    df["size_of_group"],
)

In [11]:
# Derive time features from the date and hour columns.
# only_hour: first two characters of the hour text (assumes "HH:MM"-style strings —
# TODO confirm); rows without an hour stay missing at this step.
df["only_hour"] = df["hour"].str[:2]
# year / month: read from the datetime column through the .dt accessor rather than
# casting the dates to text and slicing characters; astype(int) keeps the same
# integer dtype the string-based version produced.
df["year"] = df["date"].dt.year.astype(int)
df["month"] = df["date"].dt.month.astype(int)

In [12]:
# Convert the extracted hour to an integer; missing hours become 0.
# NOTE(review): 0 is therefore indistinguishable from a real midnight record.
df["only_hour"] = df["only_hour"].fillna(0).astype(int)

In [13]:
# Re-check dtypes and non-null counts now that the new columns and casts are in place
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 18 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   date                      569 non-null    datetime64[ns]
 1   hour                      373 non-null    object        
 2   massif                    569 non-null    object        
 3   region                    569 non-null    object        
 4   massif_unique             569 non-null    object        
 5   altitude                  569 non-null    int32         
 6   inclination               299 non-null    object        
 7   orientation               331 non-null    object        
 8   size_of_group             569 non-null    int64         
 9   transported               569 non-null    int64         
 10  with head buried in snow  569 non-null    int64         
 11  dead                      569 non-null    int64         
 12  injured               

In [14]:
# Save the cleaned dataset into the current working directory.
# DataFrame.to_csv returns None when it is given a path, so the result is not bound
# to a variable. A plain relative filename replaces the Windows-only r'.\\...' path,
# which contained literal backslashes and is not a directory separator elsewhere.
df.to_csv("aval_accidents_for_t.csv", index=False)