In [150]:
import pandas as pd
import country_converter as coco
import numpy as np
import os
import matplotlib.pyplot as plt
import copy
import random

from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from mpl_toolkits.mplot3d import Axes3D
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [None]:
class Country_Regressor():
    def __init__(self):
        # This reads in the master csv file
        self.og_df = pd.read_csv("https://raw.githubusercontent.com/MariaStefaniak/SurvivalAtBirth_Global/refs/heads/main/Data_Organization/Compiled_CSVs/Logistic_Practice.csv", index_col = 0)
    
    def Print_Features(self):
        print(self.og_df.columns)

    def Feature_Selection(self, features_list):
        self.features = features_list

    def Make_Weights(self):
        o_len = len(self.og_df)
        # Check that features were selected. If not, default to first five features
        # We check if the features are selected here because we're also making Xad, which we don't want to include every feature from the Master csv
        if not hasattr(self, 'features'):
            print("No features selected. Defaulting to using first four features.")
            print("Feature Selection may be done via Feature_Selection()")
            print("To see options for features, use Print_Columns()")
            self.features = self.og_df.columns[0:4]
            print(self.features)
        # Split the dataset into two, one which will have the weight Alive, and one that will have the weight for Dead
        # Xad = X, the input, split into Alive (a) and Dead(d) parts
        # Really it should be called df_ad since it has X, Y, and Weights
        self.Xad = pd.concat([self.og_df[self.features],self.og_df[self.features]])
        self.Xad.index.values[range(o_len)] = self.Xad.index.values[range(o_len)] + "_Dead"
        self.Xad.index.values[range(o_len,2*o_len)] = self.Xad.index.values[range(o_len,2*o_len)] + "_Alive"
        self.Xad["Alive"] = 1
        self.Xad.iloc[0:o_len, -1] = 0
        self.Xad["Weight"] = 0.0
        self.Xad.iloc[:o_len, -1] = self.og_df["Population_2019"] * self.og_df["Maternal_mortality_ratio _2019"] / 100000
        self.Xad["Weight"] = pd.to_numeric(self.Xad["Weight"], errors = "coerce").round().astype('Int64')
        self.Xad.iloc[o_len:2*o_len, -1] = pd.to_numeric(self.og_df["Population_2019"] * (1 - self.og_df["Maternal_mortality_ratio _2019"] / 100000), errors = "coerce").round().astype('Int64')

    def Train_Test_Split(self, speedup):
        # Example usage: Train_Test_Split("Infinite Speed Up gotta go fast") 
        # Example usage: Train_Test_Split(10) 
        # Does a 80 20 split
        # This is hard coded because sampling with removal on the 8 billion people on earth takes eight hours to run if you go by individuals
        # I have a way of speeding this up by a factor of N if you go by batches of people of size N, although that only approximiates the distirbution 
        # I also two theoretical ways of speeding it up so it runs in a few minutes or less (one good version and one bad version)... but I need to finish working out the math
        countries_names_all = ['CHN','IND','USA','IDN','PAK','NGA','BRA','BGD','RUS','JPN','MEX','ETH','PHL','EGY','VNM','COD','IRN','DEU','TUR','THA','FRA','GBR','ITA','ZAF','TZA','MMR','KOR','KEN','COL','ESP','SDN','ARG','UKR','DZA','UGA','IRQ','POL','AFG','CAN','MAR','YEM','MYS','UZB','PER','AGO','GHA','SAU','MOZ','VEN','NPL','MDG','CIV','PRK','CMR','AUS','NER','LKA','MLI','BFA','SYR','ROU','KAZ','CHL','MWI','ZMB','NLD','ECU','GTM','TCD','KHM','SEN','SOM','ZWE','GIN','RWA','BEN','BDI','TUN','BOL','BEL','CUB','HTI','DOM','GRC','JOR','CZE','SSD','PRT','SWE','AZE','HND','HUN','PNG','TJK','ARE','BLR','ISR','AUT','CHE','TGO','SLE','HKG','LAO','BGR','LBY','SRB','TKM','KGZ','PRY','NIC','SLV','DNK','LBN','SGP','COG','FIN','SVK','NOR','LBR','CRI','NZL','CAF','IRL','PSE','OMN','MRT','KWT','PAN','HRV','GEO','URY','BIH','MNG','ERI','PRI','ARM','ALB','JAM','LTU','MDA','NAM','QAT','GMB','BWA','GAB','LSO','SVN','GNB','LVA','MKD','XKX','GNQ','BHR','TTO','EST','TLS','CYP','MUS','SWZ','DJI','FJI','GUY','COM','BTN','SLB','MAC','MNE','LUX','SUR','CPV','MLT','MDV','BRN','BHS','BLZ','ISL','VUT','NCL','BRB','PYF','STP','WSM','LCA','GUM','CUW','KIR','GRD','FSM','ABW','VIR','TON','VCT','SYC','ATG','IMN','AND','DMA','CYM','BMU','GRL','FRO','ASM','MNP','KNA','MHL','TCA','SXM','LIE','MCO','VGB','GIB','SMR','MAF','PLW','NRU','TUV']
        self.countries_names = self.og_df.index.to_list()

        if type(speedup) == str:
            # ... why did I sort the countries by population size?
            train_all = [1126175209, 1111220306, 262664668, 217983849, 184645202, 167595128, 165963271, 131928636, 116360318, 101308127, 100609844, 92598153, 88643563, 86046476, 77736146, 74357828, 69642781, 66476519, 66061000, 57215605, 53907422, 53466615, 47779749, 47670734, 47339015, 42115302, 41412885, 40961632, 39930976, 37707802, 36435928, 35977827, 35965264, 34631557, 34402935, 32951354, 30370489, 30285945, 30096730, 28971816, 28092689, 26754282, 26369874, 25959039, 25898598, 25007025, 24051523, 23909289, 23151166, 22733268, 22574867, 22555063, 20832014, 20404010, 20268173, 18355908, 17442116, 16857171, 16770964, 16279138, 15495346, 15366372, 15358309, 15221141, 14811981, 13875776, 13874927, 13683230, 13346400, 13184808, 13082845, 12823443, 12214042, 10427854, 10219953, 10182773, 9805017, 9498049, 9336818, 9191406, 8962883, 8882960, 8716377, 8575384, 8538121, 8538740, 8339303, 8228745, 8222291, 8020427, 7953683, 7816224, 7688059, 7623817, 7558905, 7538649, 7242939, 7105899, 6861010, 6768323, 6185228, 6005581, 5790371, 5582907, 5562354, 5556074, 5443176, 5274514, 5212205, 5189136, 4976773, 4650748, 4635792, 4562735, 4493904, 4417008, 4363329, 4277500, 4034791, 3999877, 3985034, 3956010, 3948634, 3747828, 3673944, 3573078, 3553009, 3388954, 3158684, 2976328, 2718758, 2675588, 2613195, 2592506, 2556595, 2368377, 2284398, 2258139, 2235618, 2130784, 2120190, 2111543, 1965421, 1865522, 1814695, 1768230, 1671785, 1574101, 1530675, 1500961, 1431718, 1336025, 1187287, 1090200, 1061737, 1040469, 1029003, 1012748, 944990, 870462, 731604, 646171, 629147, 611959, 582195, 537798, 497704, 496393, 485125, 411628, 403345, 389853, 354786, 315550, 308420, 288624, 233829, 227037, 225286, 223302, 170772, 168134, 142044, 129521, 126280, 98866, 92673, 88193, 87444, 85472, 84707, 83284, 78225, 72925, 67360, 61107, 54333, 53919, 51116, 45028, 41447, 40075, 38578, 37607, 35304, 34602, 32392, 30929, 30428, 29179, 28448, 27567, 27313, 14261, 9262, 8487]
            # test_all = [self.og_df["Population_2019"].loc[countries_names_all[ii]] - train_all[ii] for ii in range(len(train_all))]
            self.train = []
            self.test = []
            for name in self.countries_names:
                if name in countries_names_all:
                    self.train.append(train_all[countries_names_all.index(name)])
                    self.test.append(self.og_df["Population_2019"].loc[name] - self.train[-1])
                else:
                    print("Warning. Someone messed with the master csv and now there's a country that doesn't have a numerical entry in Population_2019")
                    print("This process has failed because of that")
        elif type(speedup) == int and speedup >= 1:
            # This needs to be fixed in a couple of ways
            # 1. Don't use all countries. Only use countries in the og_df
            # 2. It's currently out of order and needs to be sorted in the same way type(speedup) == int needed to be fixed.
            self.test = [1407745000	,1389030312	,328329953	,272489381	,230800899	,209485641	,207455459	,164913055	,145453291	,126633000	,125762982	,115737383	,110804683	,107553158	,97173776	,92947442	,87051648	,83092962	,82579440	,71522271	,67382061	,66836327	,59729081	,59587885	,59174891	,52640713	,51764822	,51202827	,49907985	,47134837	,45548175	,44973465	,44957458	,43294546	,42999637	,41192171	,37965475	,37856121	,37618495	,36210898	,35111408	,33440596	,32964701	,32449303	,32375632	,31258945	,30063799	,29884380	,28938098	,28414064	,28219554	,28193009	,26037632	,25506095	,25334826	,22947757	,21803000	,21068405	,20961952	,20353534	,19371648	,19209555	,19197744	,19025752	,18513839	,17344874	,17340021	,17103803	,16685223	,16481304	,16352922	,16030971	,15271368	,13034346	,12776103	,12726755	,12255336	,11875081	,11671386	,11488980	,11202846	,11105163	,10894043	,10721582	,10671891	,10671870	,10423384	,10286263	,10278887	,10024283	,9943633	,9771141	,9609010	,9529966	,9445785	,9419758	,9054000	,8879920	,8575280	,8463068	,7731991	,7507900	,7237636	,6975761	,6951033	,6945235	,6803944	,6590211	,6515058	,6483657	,6222318	,5814422	,5794594	,5703569	,5616661	,5521606	,5454147	,5347896	,5043721	,4999553	,4979200	,4944703	,4934340	,4685306	,4591241	,4467708	,4442316	,4234700	,3949390	,3720161	,3397206	,3345533	,3267673	,3240194	,3193694	,2962500	,2854191	,2823271	,2794137	,2664224	,2650494	,2638657	,2456844	,2332083	,2267706	,2209405	,2088385	,1967696	,1913822	,1876262	,1788891	,1670177	,1483756	,1363985	,1326898	,1300945	,1286671	,1265985	,1179874	,1088709	,914899	,807665	,786578	,764824	,728147	,672000	,622028	,620001	,605768	,514104	,504062	,487731	,442680	,394675	,385829	,360563	,291985	,283606	,281207	,279034	,213392	,209780	,177661	,161808	,157441	,123921	,115981	,110282	,109203	,106669	,105669	,104167	,97625	,91364	,83957	,76474	,67880	,67327	,64107	,56225	,51795	,50209	,48129	,46952	,44007	,43166	,40385	,38563	,38109	,36395	,35494	,34663	,34267	,17798	,11587	,10581	]
            self.countries = [ii for ii in range(len(self.countries_names))]
            run_length = int(sum(self.test)* .2 / speedup)
            for ii in range(run_length):
                self.test[random.sample(self.countries, counts = self.test,k=1)[0]] -= speedup
            self.test = [self.og_df["Population_2019"].loc[self.countries_names[ii]] - self.test[ii] for ii in range(len(self.test))]
            print("Warning: Weights are currently out of order and need to be sorted like they were in the first if statement")
        else:
            print("This isn't very good code. Sorry -Rocky")

    def KFold(self, number_of_folds, speedup):
        # Legit, KFold seems unnecessary given what our data looks like, lots of balls in not a lot of bins. 
        # Each fold is going to look like every other fold plus .01% of noise

        # This isnt really a KFold I think? This is just resampling. Crap
        # For this to be KFold, I'd need to remove 1/k of the original and run it multiple times keeping track of what's removed
        # This creates the weights for the logistic regression for each train and test set. 
        # Train weights are used for logistic regression fitting
        # Test weights are used for 
        # weights for nth train kfold is weights_df.columns[2n]
        # weights for nth test kfold is weight_df.columns[2n+1]
        self.weights_df = pd.DataFrame(index = self.og_df.index)
        for ii in range(number_of_folds):
            self.Train_Test_Split(speedup)
            self.weights_df[str(2*ii)] = self.train
            self.weights_df[str(2*ii + 1)] = self.test


    def Fit(self, X, Y, W):
        # An example would be
        # X = self.Xad[self.features]
        # Y = self.Xad["Alive"]
        # self.W = self.Xad["Weight"]
        self.log_reg = Pipeline([('scale', StandardScaler()), ('log', LogisticRegression(penalty=None))])
        self.log_reg.fit(X, Y, log__sample_weight=W)



    def Predict(self, X):
        return self.log_reg.predict_proba(X)

    def Cross_Validate(self, y_hh_true, y_hh_predict):
        # I think what we're trying to predict is the mortality ratio as a percent
        accuracy_score(y_hh_true, y_hh_predict)
        print("Not working yet and I'm not sure how to measure it just yet. It's definitely not supposed to be accuracy score since we're not classifying.")
             


In [169]:
test = Country_Regressor()
test.Print_Features()
test.Feature_Selection(["Births_per_woman_2019", "Maternal_mortality_ratio _2019", "GDP_per_capita_2022"])
test.Make_Weights()
test.Fit(test.Xad[test.features], test.Xad["Alive"], test.Xad["Weight"])
print(test.og_df.index.tolist(), test.Predict(test.Xad[test.features]))



Index(['Births_per_woman_2019', 'Population_Density_2010',
       'GDP_per_capita_2022', 'beds_per_1000',
       'Maternal_mortality_ratio _2019', 'Population_2019'],
      dtype='object')
['AFG', 'ALB', 'ARE', 'ARG', 'ARM', 'AUT', 'BEL', 'BEN', 'BFA', 'BGR', 'BOL', 'BRA', 'BRB', 'BWA', 'CAN', 'CHE', 'CHL', 'CHN', 'COL', 'CRI', 'CUB', 'CZE', 'DEU', 'DNK', 'DOM', 'ECU', 'EGY', 'ESP', 'EST', 'FIN', 'FRA', 'GBR', 'GEO', 'GMB', 'GTM', 'HND', 'HRV', 'HTI', 'HUN', 'IDN', 'IND', 'IRL', 'ISL', 'ISR', 'ITA', 'JAM', 'JOR', 'JPN', 'KAZ', 'KGZ', 'KWT', 'LAO', 'LBN', 'LKA', 'LTU', 'LUX', 'LVA', 'MAR', 'MKD', 'MNE', 'MNG', 'MOZ', 'MRT', 'NER', 'NIC', 'NLD', 'NOR', 'NPL', 'NZL', 'OMN', 'PAN', 'PER', 'PHL', 'POL', 'PRT', 'PRY', 'ROU', 'RUS', 'RWA', 'SAU', 'SEN', 'SGP', 'SLV', 'SRB', 'SVK', 'SVN', 'SWE', 'SYC', 'SYR', 'THA', 'TJK', 'TKM', 'TTO', 'TUN', 'TUR', 'TZA', 'UKR', 'URY', 'USA', 'UZB', 'VEN'] [[8.64512550e-03 9.91354875e-01]
 [4.16811391e-04 9.99583189e-01]
 [2.55985918e-04 9.99744014e-01]
 [5.

In [165]:
test.Xad.head(10)

Unnamed: 0,Births_per_woman_2019,Maternal_mortality_ratio _2019,GDP_per_capita_2022,Alive,Weight
AFG_Dead,4.87,644,1357.9878,0,243793
ALB_Dead,1.414,5,12978.101,0,143
ARE_Dead,1.334,9,77203.67,0,850
ARG_Dead,1.994,33,18292.318,0,14841
ARM_Dead,1.575,25,13837.577,0,741
AUT_Dead,1.46,5,43792.855,0,444
BEL_Dead,1.6,5,41872.395,0,574
BEN_Dead,5.13,522,2477.0903,0,66434
BFA_Dead,4.971,283,1704.966,0,59322
BGR_Dead,1.58,7,20592.455,0,488


Desired Workflow (03/31/2025):

1. Initialize the Class

    a. This determines the model to be Logistic Regression

    b. It reads in the master csv to make a DataFrame called og_df
    
2. Determine Features to be Used:

    a. You can see the list of features by using Print_Features()

    b. You can do some feature engineering by manipulating og_df directly 

    c. Select your features using Feature_Selection(list_of_features)

3. Make Weights

    a. Call Make_Weights()

4. Train_Test_Split

    a. Determine how many folds by calling KFold(number_of_folds, speedup)

    b. Determine what method you'll use by specifying speedup. Current recommendation, speedup = "Infinite" or speedup = 1000

5. Fit

6. Predict

7. Cross-Validate

To Do:

*. ~~Implement feature selection in a way that makes sense~~ Done

*. Stratified / Kfold

*. Finish compiling Master csv

*. Cross Validation

*. Write an equivalent code for multi-year information (Which involves compiling another master csv)

*. ~~Implement random sampling treating the weights as the number of entries (Worry that random sampling means that test and train will look very, very similiar to each other in a bad way? I think this isn't a big deal though...)~~

*. Determine if we need to scale (it will take like two minutes to put this in the pipeline)

*. Determine what measure we'll use for predicting. Weighted Brier Score?

Would be Nice:

*. PCA?

*. Dual Plots to examine separation? 

Improvement Notes:

*. Is it necesssary to make X, Y, and W into their own dataframes? It's nice for readability... Although I guess it's just a pointer

*. ~~Find out what happens if "Population_2019" or "Maternal_mortality_ratio _2019" is chosen as a feature~~ It was fine

*. ~~Want to consider making all countries population equal, otherwise maybe we're oversampling India and China?~~ Resolved by implementing Train_Test_Split

*. Change "Maternal_mortality_ratio _2019" to "Maternal_mortality_ratio_2019"

*. Double check weights are correct
