In [1]:
import pandas as pd
import numpy as np

# Data Sourcing

In [2]:
from members import Members

# Load the raw member table, then run it through the project's cleaning step.
member_raw = Members().get_data()
member = Members().clean_data(member_raw)

In [3]:
from weather import Weather

# Load the raw weather table, then run it through the project's cleaning step.
weather_raw = Weather().get_data()
weather = Weather().clean_data(weather_raw)

In [4]:
from peaks import Peaks

# Load the raw peak table, then run it through the project's cleaning step.
peak_raw = Peaks().get_data()
peak = Peaks().clean_data(peak_raw)

In [5]:
from expeds import Expeds

# Load the raw expedition table, then run it through the project's cleaning step.
exped_raw = Expeds().get_data()
exped = Expeds().clean_data(exped_raw)

## Merging

In [6]:
# Print (rows, columns) for each source table before any columns are dropped.
for table_name, table in (('member', member), ('peak', peak), ('exped', exped)):
    print(table_name, table.shape)

member (16383, 47)
peak (468, 18)
exped (3704, 57)


In [7]:
# Member-table columns that are removed before the tables are merged.
mem_to_drop = [
    'memb_id', 'year', 'unique_id', 'peak_id',
    'residence', 'occupation',
    'summit_claimed', 'summit_disputed',
    'highpt', 'high_point',
    'death', 'death_type', 'death_height', 'death_class',
    'summit_bid', 'summit_term', 'summit_date1',
    'citizenship',
]

# Rebind instead of mutating in place; raises KeyError if any listed column is absent.
member = member.drop(columns=mem_to_drop)

In [8]:
# Peak-table columns that are removed from the peak frame.
peak_to_drop = [
    'peak_name', 'pk_name_2',
    'location', 'himal', 'region',
    'open', 'unlisted', 'trekking', 'restrict', 'country_status',
    'year', 'season', 'expid',
    'summiter_country', 'summiters',
]

# Rebind instead of mutating in place; raises KeyError if any listed column is absent.
peak = peak.drop(columns=peak_to_drop)

In [9]:
# Expedition-table columns that are removed before the tables are merged.
exp_to_drop = [
    'year', 'season',
    'route1', 'route2',
    'nation', 'leaders', 'sponsor',
    'success1', 'success2',
    'ascent1', 'ascent2',
    'claimed', 'disputed', 'countries',
    'summit_time',
    'term_date', 'term_reason', 'term_note',
    'high_point', 'traverse', 'ski', 'parapente',
    'o2_climb', 'o2_descent', 'o2_sleep', 'o2_medical',
    'o2_taken', 'o2_unkwn', 'o2_used', 'o2_none',
    'other_smts', 'campsites', 'accidents', 'achievment',
    'agency', 'peak_name', 'primmem',
    'summiter_deaths', 'summit_members', 'summit_hired', 'hired_deaths',
]

# Rebind instead of mutating in place; raises KeyError if any listed column is absent.
exped = exped.drop(columns=exp_to_drop)

# Parse both date columns; values that cannot be parsed become NaT (errors='coerce').
for date_col in ('summit_date', 'bc_date'):
    exped[date_col] = pd.to_datetime(exped[date_col], errors='coerce')

## Full Data

In [34]:
# Attach expedition attributes to every member row; the right join keeps all members.
df = exped.merge(member, how='right', on='exp_id')

# Join weather onto each row by matching summit_date against weather's date_time,
# using both as indexes, then turn the joined index back into an ordinary column.
wet = weather.set_index('date_time')
df = (
    df.set_index('summit_date')
      .merge(wet, how='left', left_index=True, right_index=True)
      .reset_index()
)

# 'index' is the column produced by reset_index() above; it is dropped together
# with the expedition key, the base-camp date and the sun/moon time columns.
df = df.drop(columns=['exp_id', 'index', 'bc_date', 'moonrise', 'moonset', 'sunrise', 'sunset'])

In [74]:
# Summarise the merged table: row count, column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16383 entries, 0 to 16382
Data columns (total 60 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   peak_id            16383 non-null  object 
 1   host               16383 non-null  object 
 2   summit_days        16383 non-null  int64  
 3   tot_days           16383 non-null  int64  
 4   camps              16383 non-null  int64  
 5   rope               16383 non-null  int64  
 6   tot_members        16383 non-null  int64  
 7   tot_hired          16383 non-null  int64  
 8   no_hired           16383 non-null  bool   
 9   comrte             16383 non-null  bool   
 10  stdrte             16383 non-null  bool   
 11  primrte            16383 non-null  bool   
 12  peak_height        16383 non-null  int64  
 13  season             16383 non-null  object 
 14  sex_M              16383 non-null  float64
 15  yob                16383 non-null  float64
 16  status             163

# Data Preprocessing

In [53]:
# Bucket the columns of df by dtype: float64/int64 -> numeric, bool -> boolean,
# object -> categorical. Columns of any other dtype end up in no bucket.
col_num, col_bool, col_object = [], [], []
bucket_for_dtype = {
    "float64": col_num,
    "int64": col_num,
    "bool": col_bool,
    "object": col_object,
}

for column in df:
    column_dtype = df[column].dtype
    for dtype_name, bucket in bucket_for_dtype.items():
        if column_dtype == dtype_name:
            bucket.append(column)

# The target is not a feature; raises ValueError if it was not detected as bool.
col_bool.remove('summit_success')

In [59]:
# Back-fill missing values in boolean columns.
#
# The previous version called `fillna(method='bfill')` and threw the result
# away, so it never changed df. The result is now assigned back, and the
# deprecated `method=` argument is replaced with `.bfill()`.
#
# `is_bool_dtype` also matches pandas' nullable "boolean" dtype, which is the
# only boolean dtype that can hold missing values; plain numpy `bool` columns
# pass through unchanged. A gap at the very end of a column has no later value
# to copy from and stays missing.
for col in df.columns:
    if pd.api.types.is_bool_dtype(df[col]):
        df[col] = df[col].bfill()

In [65]:
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.impute import SimpleImputer, KNNImputer
from scipy.stats import uniform
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Numeric features: KNN imputation (2 neighbours) followed by min-max scaling.
numeric_steps = [
    ('imputer', KNNImputer(n_neighbors=2)),
    ('scaler', MinMaxScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical features: fill gaps with the most frequent value, then one-hot
# encode, dropping the first level of each feature. handle_unknown='error'
# makes the encoder raise on categories it did not see while fitting.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(drop='first', handle_unknown='error')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# A boolean transformer (most-frequent imputation) is currently disabled:
# boolean_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='most_frequent'))])
# NOTE(review): the col_bool columns are not routed through any transformer here,
# so they presumably do not reach X_trans -- confirm this is intended.

column_routes = [
    ('num', numeric_transformer, col_num),
    ('cat', categorical_transformer, col_object),
]
preprocessor = ColumnTransformer(transformers=column_routes)

# This pipeline holds only the preprocessing step; no estimator is attached.
clf = Pipeline(steps=[('preprocessor', preprocessor)])

# Separate the features from the target column.
y = df['summit_success']
X = df.drop(columns=['summit_success'])

# NOTE(review): the preprocessor is fitted on every row of X, i.e. before any
# train/test split, so imputation and scaling statistics include the rows that
# are later held out for evaluation.
X_trans = clf.fit_transform(X)

# Models

In [67]:
# Hold out 30% of the already-transformed rows; the fixed seed keeps the split repeatable.
split = train_test_split(X_trans, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split

# Logistic regression with up to 500 solver iterations.
model = LogisticRegression(max_iter=500)
model.fit(X_train, y_train)

# For a classifier, score() is the mean accuracy on the held-out rows.
test_accuracy = model.score(X_test, y_test)
print("model score: %.3f" % test_accuracy)

model score: 0.801


In [69]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 of the fitted model on the held-out rows.
y_pred = model.predict(X_test)
model_report = classification_report(y_test, y_pred)
print(model_report)

              precision    recall  f1-score   support

       False       0.85      0.79      0.82      2790
        True       0.75      0.81      0.78      2125

    accuracy                           0.80      4915
   macro avg       0.80      0.80      0.80      4915
weighted avg       0.80      0.80      0.80      4915



In [73]:
## Baseline
# Constant baseline that predicts "no summit" (False) for every held-out row.
# The predictions are built as booleans so they have the same dtype as y_test;
# float zeros matched the False label only because 0.0 == False.
y_base = np.zeros(len(y_test), dtype=bool)

# The True class is never predicted, so its precision is undefined;
# zero_division=0 reports it as 0 without emitting a warning.
print(classification_report(y_test, y_base, zero_division=0))

              precision    recall  f1-score   support

       False       0.57      1.00      0.72      2790
        True       0.00      0.00      0.00      2125

    accuracy                           0.57      4915
   macro avg       0.28      0.50      0.36      4915
weighted avg       0.32      0.57      0.41      4915

