In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [4]:
aps = pd.read_csv('apostles_clean.csv')
# Make a dataset of living apostles to make predictions on later
living = aps.sort_values(by='date_ordained').tail(20).query('name not in ["Neal A. Maxwell", "M. Russell Ballard", "Joseph B. Wirthlin", "Richard G. Scott", "Robert D. Hales", "Russell M. Nelson"]')

In [5]:
aps.drop(labels=['date_left', 'age_left', 'birthdate', 'date_ordained'], axis=1, inplace=True) # drop date columns for MLM
aps = aps.query("name not in ['Dallin H. Oaks', 'Jeffrey R. Holland', 'Henry B. Eyring', 'David A. Bednar', 'Dieter F. Uchtdorf', 'Quentin L. Cook', 'D. Todd Christofferson', 'Neil L. Andersen', 'Gary E. Stevenson', 'Dale G. Renlund', 'Ronald A. Rasband', 'Gerrit W. Gong', 'Ulisses Soares', 'Patrick Kearon']") # Remove living aposltes
living.drop(labels=['date_left', 'age_left', 'birthdate', 'date_ordained', 'president'], axis=1, inplace=True)

ml = aps.copy()
ml.drop(labels=['name'], axis=1, inplace=True) # Remove name column

In [6]:
living.reset_index(inplace=True, drop=True) # Fix the index

In [274]:
ml['president'] = np.where(
    ml['president'] == 'Yes',
    1,
    0
) # Turn president column into binary 1 or 0

In [275]:
x = ml.drop(labels=['president'], axis=1) # Set features and target variable
y = ml['president']

In [279]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=777) # Split dataset for training

In [377]:
alg = RandomForestClassifier(n_estimators=10) # Choose algorithm

alg.fit(x_train, y_train) # Fit the model
y_predicted = alg.predict(x_test) # Make predictions
acc = metrics.accuracy_score(y_test, y_predicted) # Find rough accuracy

print(acc) # Note that this accuracy is an approximation of the final model, because 20% of the data was used for testing rather than training. The final model will be trained on all data except living apostles, and then used to predict living apostles.

0.8


In [8]:
real = living.drop(columns=['name']) # Make the dataset prediction ready

Unnamed: 0,age_ordained,age_aps_1,age_aps_2,age_aps_3,age_aps_4,age_aps_5,age_aps_6,age_aps_7,age_aps_8,age_aps_9,age_aps_10,age_aps_11,age_aps_12,age_aps_13,age_aps_14,min_other_aps,num_younger_aps,birth_year
0,52,84,76,56,59,68,68,61.0,77.0,63.0,57.0,59.0,66.181818,66.181818,66.181818,56,0,1932
1,54,69,71,87,73,67,69,61.0,65.0,77.0,65.0,61.0,69.545455,69.545455,69.545455,61,0,1940
2,62,70,72,88,68,70,62,66.0,77.0,66.0,62.0,54.0,68.636364,68.636364,68.636364,54,1,1933
3,52,80,82,80,72,75,87,75.0,72.0,63.0,71.0,63.0,74.545455,74.545455,74.545455,63,0,1952
4,64,80,82,80,72,75,87,75.0,72.0,63.0,71.0,52.0,73.545455,73.545455,73.545455,52,2,1940
5,67,83,85,83,75,78,90,78.0,75.0,66.0,74.0,55.0,66.0,75.666667,75.666667,55,3,1940
6,63,83,85,83,75,79,90,79.0,75.0,67.0,74.0,55.0,67.0,67.0,75.307692,55,1,1945
7,58,84,86,84,76,80,80,76.0,68.0,75.0,56.0,68.0,68.0,64.0,74.230769,56,1,1951
8,60,91,83,86,83,74,82,63.0,74.0,75.0,70.0,64.0,62.0,64.0,74.692308,62,0,1955
9,63,91,83,86,83,74,82,63.0,74.0,75.0,70.0,64.0,60.0,64.0,74.538462,60,1,1952


In [385]:
# Training the model with ALL past prophets
x_train_new, x_test_new, y_train_new, y_test_new = train_test_split(x, y, test_size=None, random_state=777)

alg_new = RandomForestClassifier(n_estimators=10)

alg_new.fit(x_train_new, y_train_new)

names_new = living['name']
real_new = living.drop(columns=['name'])

real_pred_new = alg.predict(real)

Unnamed: 0,name,becomes_president
0,Dallin H. Oaks,1
1,Jeffrey R. Holland,0
2,Henry B. Eyring,1
3,David A. Bednar,0
4,Dieter F. Uchtdorf,0
5,Quentin L. Cook,0
6,D. Todd Christofferson,1
7,Neil L. Andersen,0
8,Gary E. Stevenson,0
9,Dale G. Renlund,0


In [386]:

res_new = pd.DataFrame({
    'name': names_new,
    'becomes_president': real_pred_new
})

res_new['becomes_president'] = np.where(
    res_new['becomes_president'] == 1,
    'Yes',
    'No'
)

res_new # Make new dataset with the predictions.

Unnamed: 0,name,becomes_president
0,Dallin H. Oaks,Yes
1,Jeffrey R. Holland,No
2,Henry B. Eyring,Yes
3,David A. Bednar,No
4,Dieter F. Uchtdorf,No
5,Quentin L. Cook,No
6,D. Todd Christofferson,Yes
7,Neil L. Andersen,No
8,Gary E. Stevenson,No
9,Dale G. Renlund,No
