# Script Name: Logistic_Regression
# Author: Joshua (Jay) Wimhurst
# Date created: 8/23/2022
# Date Last Edited: 4/28/2023
############################### DESCRIPTION ###################################
# This script applies a logistic regression model to the predictors and wind
# farm locations across the CONUS (and individual states) in order to
# construct an equation-based relationship between them. The outputs of this
# model are various statistics (e.g., log-likelihood ratio), charts
# (e.g., Receiver Operating Characteristic curve), and a map showing the likelihood
# of wind farm existence and predicted versus actual wind farm occurrence.
# Four different versions of the model can be executed: all predictors,
# all predictors but wind speed, wind speed only, and a refined predictor set.
###############################################################################
def LogisticRegressionModel():
import sys
import os
import arcpy
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import statsmodels.api as sm
from arcpy.da import TableToNumPyArray, UpdateCursor, SearchCursor
from fpdf import FPDF
from pandas import DataFrame, concat
from matplotlib.lines import Line2D
from matplotlib.patches import Rectangle
from matplotlib import transforms
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
from statsmodels.api import OLS
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from math import e
from scipy.stats import chi2, rankdata, mannwhitneyu
from statistics import median
from warnings import filterwarnings
filterwarnings("ignore")
######################### DIRECTORY CONSTRUCTION ##########################
# IMPORTANT - set up a file directory to place the Console Output,
# generated figures, and constructed WiFSS surfaces BEFORE attempting to
# run this model
directory = ""  # IMPORTANT: specify the full path to your working directory here (placeholder left blank)
# IMPORTANT - make sure to also create a folder for the figures
# created from the model's calibration and validation results
directoryPlusFigures = directory + "\Figures"
# IMPORTANT - make sure to also create a folder for the WiFSS surfaces
# created from applying the trained and tested model to all grid cells
directoryPlusSurfaces = directory + "\WiFSS_Surfaces"
# IMPORTANT - make sure to also create a folder for the coefficients
# obtained from fitting the model (and a sub-folder named after the state
# selected or CONUS), which will be needed for users running
# the cellular automaton portion of the model
directoryPlusCoefficients = directory + "\Coefficients"
# IMPORTANT - make sure to also create a folder for the intercepts
# obtained from fitting the model (and a sub-folder named after the state
# selected or CONUS), which will be needed for users running
# the cellular automaton portion of the model
directoryPlusIntercepts = directory + "\Intercepts"
# The script prints the console output to a text file, with a previous
# version deleted prior to the model run
if os.path.exists(directory + "\Logistic_Regression_Console_Output.txt"):
os.remove(directory + "\Logistic_Regression_Console_Output.txt")
###################### DATASET SELECTION AND SETUP ############################
# PDF file containing the console output is initiated and caveat is added
pdf = FPDF()
pdf.add_page()
pdf.set_xy(4, 4)
pdf.set_font(family = 'arial', size = 13.0)
pdf.multi_cell(w=0, h=5.0, align='R', txt="Console output", border = 0)
pdf.multi_cell(w=0, h=5.0, align='L',
txt="------------------ DATASET SELECTION AND SETUP ------------------"
+'\n\n'+ "NOTE: The desired study region must be specified as 'CONUS' if one "
+'\n'+ "wishes to execute the logistic regression model over states that "
+'\n'+ "contain zero commercial wind farms (Louisiana, Mississippi "
+'\n'+ "Alabama, Georgia, South Carolina, Kentucky), states that possess "
+'\n'+ "wind farms in only one grid cell at all but the highest spatial "
+'\n'+ "resolutions (Arkansas, Florida, Virginia, Delaware, Connecticut,"
+'\n'+ "New Jersey, Tennessee), or states at low spatial resolutions at which "
+'\n'+ "too many predictors were removed due to collinearity (Rhode Island "
+'\n'+ "at the 100th or 80th percentile).\n", border=0)
# Select the study region for which the model will run:
# The CONUS or a single state
def studyRegion(values, message):
while True:
x = input(message)
if x in values:
studyRegion.region = x
break
else:
print("Invalid value; options are " + str(values))
studyRegion(["CONUS","Arizona","California","Colorado","Idaho","Illinois",
"Indiana","Iowa","Kansas","Maine","Maryland","Massachusetts",
"Michigan","Minnesota","Missouri","Montana","Nebraska",
"Nevada","New_Hampshire","New_Mexico","New_Jersey","New_York",
"North_Carolina","North_Dakota","Ohio","Oklahoma","Oregon",
"Pennsylvania","Rhode_Island","South_Dakota","Texas","Utah",
"Vermont","Washington","West_Virginia","Wisconsin","Wyoming"],
'''Enter desired study region \n(CONUS, Arizona, California, Colorado, '''
'''Idaho, Illinois, Indiana, Iowa, Kansas, Maine, Maryland, '''
'''Massachusetts, Michigan, Minnesota, Missouri, Montana, '''
'''Nebraska, Nevada, New_Hampshire, New_Mexico, New_York, '''
'''North_Carolina, North_Dakota, Ohio, Oklahoma, Oregon, '''
'''Pennsylvania, Rhode_Island, South_Dakota, Texas, '''
'''Utah, Vermont, Washington, West_Virginia, Wisconsin, '''
'''Wyoming):\n''')
# User input for wind farm density in acres per Megawatt: 25, 45, 65, or 85
def farmDensity(values,message):
while True:
x = input(message)
if x in values:
farmDensity.density = x
break
else:
print("Invalid value; options are " + str(values))
farmDensity(["25","45","65","85"], "\nEnter desired wind farm density (25, 45, 65, or 85 acres/MW):\n")
# User input for wind power capacity as a percentile: 20, 40, 60, 80, or 100
def farmCapacity(values,message):
while True:
x = input(message)
if x in values:
farmCapacity.capacity = x
break
else:
print("Invalid value; options are " + str(values))
farmCapacity(["20","40","60","80","100"], "\nEnter desired wind power capacity (20, 40, 60, 80, or 100 percentile):\n")
# NOTE: 20th percentile = 30 MW, 40th percentile = 90 MW, 60th percentile = 150 MW,
# 80th percentile = 201.5 MW, 100th percentile = 525 MW
# Based on selected percentile, the print-out to the text file changes
if farmCapacity.capacity == "100":
power = "525 MW"
elif farmCapacity.capacity == "80":
power = "202 MW"
elif farmCapacity.capacity == "60":
power = "150 MW"
elif farmCapacity.capacity == "40":
power = "90 MW"
elif farmCapacity.capacity == "20":
power = "30 MW"
# The user is asked for the predictor configurations for which they wish
# to run the model, first the Full configuration
def fullConfiguration(values,message):
while True:
x = input(message)
if x in values:
fullConfiguration.YesNo = x
break
else:
print("Invalid value; options are " + str(values))
fullConfiguration(["Y","N"], "".join(["\nDo you wish to use the Full predictor configuration? Y or N.\n"]))
# Next the No_Wind configuration
def noWindConfiguration(values,message):
while True:
x = input(message)
if x in values:
noWindConfiguration.YesNo = x
break
else:
print("Invalid value; options are " + str(values))
noWindConfiguration(["Y","N"], "".join(["\nDo you wish to use the No_Wind predictor configuration? Y or N.\n"]))
# Next the Wind_Only configuration
def windOnlyConfiguration(values,message):
while True:
x = input(message)
if x in values:
windOnlyConfiguration.YesNo = x
break
else:
print("Invalid value; options are " + str(values))
windOnlyConfiguration(["Y","N"], "".join(["\nDo you wish to use the Wind_Only predictor configuration? Y or N.\n"]))
# Finally the Reduced configuration
def reducedConfiguration(values,message):
while True:
x = input(message)
if x in values:
reducedConfiguration.YesNo = x
break
else:
print("Invalid value; options are " + str(values))
reducedConfiguration(["Y","N"], "".join(["\nDo you wish to use the Reduced predictor configuration? Y or N.\n"]))
configList = []
if fullConfiguration.YesNo == "Y":
configList.append("Full")
if noWindConfiguration.YesNo == "Y":
configList.append("No_Wind")
if windOnlyConfiguration.YesNo == "Y":
configList.append("Wind_Only")
if reducedConfiguration.YesNo == "Y":
configList.append("Reduced")
# User inputs are added to the console output
pdf.multi_cell(w=0, h=5.0, align='L',
txt="\nSpecified study region: " + str(studyRegion.region)
+'\n'+"Specified wind farm density: " + str(farmDensity.density) + " acres/MW"
+'\n'+"Specified wind power capacity: " + str(farmCapacity.capacity) + "th percentile (" + power + ")"
+'\n'+"Predictor configurations specified by the user: " + str(configList), border=0)
# The filepaths to the desired gridded datasets depend on whether the
# user selected the CONUS or an individual state
if studyRegion.region != "CONUS":
# File path to the dataset specified by user input
table = "".join([directory, "/", studyRegion.region, "_Gridded_Surfaces/Hexagon_Grid_", farmDensity.density, "_acres_per_MW_", farmCapacity.capacity, "th_percentile_", studyRegion.region, "_Merged.gdb\Attribute_Table"])
else:
table = "".join([directory, "/", studyRegion.region, "_Gridded_Surfaces/Hexagon_Grid_", farmDensity.density, "_acres_per_MW_", farmCapacity.capacity, "th_percentile_", studyRegion.region, "_Merged.gdb\Attribute_Table"])
# The aggregated dataset is loaded, excluding attribute table rows
# with missing data
array = TableToNumPyArray(table, "*", skip_nulls = True)
df = DataFrame(array)
# The model's ROC is more likely to fail if there are too few grid cells,
# and too many predictors may also be removed due to collinearity. A user
# input is created to allow the user the chance to abort the code if they
# choose an inappropriate resolution.
if len(df) <= 300:
print('''\nThis resolution comprises''', str(len(df)), '''grid cells, '''
'''a resolution at which the model may be reduced to a small number '''
'''of predictors due to multicollinearity, and at which the model's '''
'''Receiver Operating Characteristic may be affected. Do you wish '''
'''to proceed?''')
# Specify using Y or N whether the user would like to choose a different
# resolution
def proceed(values, message):
while True:
x = input(message)
if x in values:
proceed.YesOrNo = x
break
else:
print("Invalid value; options are " + str(values))
proceed(["Y", "N"], "Y or N:\n")
# The code stops if the user specifies N
if proceed.YesOrNo != "Y":
sys.exit()
# The warning about compromised model performance is written to the
# console output if the end-user decides to still go ahead.
else:
pdf.multi_cell(w=0, h=5.0, align='L',
txt="\nAt the user-specified resolution, less than 300 (" + str(len(df)) + ") grid cells exist over"
+'\n'+ str(studyRegion.region) + ", which increases the risk of fewer predictors being "
+'\n'+"retained due to multicollinearity, and of a compromised ROC.", border=0)
# The predictors that the model uses depend on the study region selected
# by the user
if studyRegion.region != "CONUS":
# The predictors from the dataframe are isolated as a separate dataframe.
# NOTE: The model's ability to predict wind farm locations cannot be
# assessed over the following states: Alabama, Arkansas, Delaware, DC,
# Florida, Georgia, Kentucky, Louisiana, Mississippi, New Jersey,
# South Carolina, or Virginia (all are states with 0 or 1 commercial
# wind farm).
df = df[["Wind_Turb","Critical","Historical","Military","Mining",
"Nat_Parks","Dens_15_19","Trib_Land","Wild_Refug",
"Avg_Elevat","Avg_Temp","Avg_Wind","Bat_Count","Bird_Count",
"Prop_Rugg","Undev_Land","Near_Air","Near_Hosp",
"Near_Roads","Near_Trans","Near_Sch","Plant_Year",
"Farm_Year","ISO_YN","Near_Plant","Dem_Wins","Type_15_19",
"Unem_15_19","Fem_15_19","Hisp_15_19","Avg_25",
"Whit_15_19","supp_2018"]]
# Predictors if the model user specifies the CONUS as the study region
else:
df = df[["Wind_Turb","Critical","Historical","Military","Mining",
"Nat_Parks","Dens_15_19","Trib_Land","Wild_Refug",
"Avg_Elevat","Avg_Temp","Avg_Wind","Bat_Count","Bird_Count",
"Prop_Rugg","Undev_Land","Near_Air","Near_Hosp",
"Near_Roads","Near_Trans","Near_Sch","Plant_Year",
"Farm_Year","Cost_15_19","ISO_YN","Near_Plant",
"Farm_15_19","Prop_15_19","In_Tax_Cre","Tax_Prop",
"Tax_Sale","Numb_Incen","Rep_Wins","Dem_Wins","Interconn",
"Net_Meter","Renew_Port","Renew_Targ","Numb_Pols",
"Foss_Lobbs","Gree_Lobbs","Type_15_19","Unem_15_19",
"Fem_15_19","Hisp_15_19","Avg_25","Whit_15_19","supp_2018"]]
# Some rows of the dataframe contain null values. Their indexes are saved,
# since they will need to be removed from the gridded surface (+1 since
# a pandas dataframe indexes from zero)
dropIndex = df[np.invert(df.index.isin(df.dropna().index))].index.tolist()
dropIndex = [x+1 for x in dropIndex]
# Rows containing null values can now be removed, otherwise the model will
# not run. Binary categorical variables are also converted into numbers,
# where N (no) is 0 and Y (yes) is 1.
df = df.dropna().replace(to_replace = ["Other","N","Y"], value = [0,0,1])
# Predictors that take the same value in every single grid cell should be
# dropped, since they have no predictive power. This applies to the
# categorical predictors
constantDropped = []
nunique = df.nunique()
constantDropped = nunique[nunique == 1].index.tolist()
# The names of the dropped predictors are written to the console output
if len(constantDropped) == 0:
pdf.multi_cell(w=0, h=5.0, align='L',
txt="\nPredictors removed from the model based on having a constant"
+'\n'+ "value in all grid cells: None", border=0)
else:
pdf.multi_cell(w=0, h=5.0, align='L',
txt="\nPredictors removed from the model based on having a constant"
+'\n'+ "value in all grid cells: " + str(constantDropped), border=0)
# The respective columns are dropped from the dataset
df = df.drop(columns = constantDropped)
######################### TESTING ASSUMPTIONS ##############################
# First is a test to ensure that the relationship between the predictors
# and the logit of the occurrence of wind farms is indeed linear
pdf.multi_cell(w=0, h=5.0, align='L',
txt="\n------------------ TESTING ASSUMPTIONS ------------------"
+'\n\n'+ "Assumption #1: All continuous predictors have a linear relationship "
+'\n'+ "with the logit of the dependent variable, based on a Box-Tidwell test.", border=0)
# The Wind_Turb column in the attribute table defines the dependent variable,
# i.e., whether or not a grid cell contains a wind turbine.
dfy = DataFrame(df["Wind_Turb"])
dfy = dfy["Wind_Turb"].tolist()
# The Wind_Turb column must also be removed since it is not itself a predictor.
dfx = df.drop(columns = ["Wind_Turb"])
# Logistic regression assumes that each predictor has a linear
# relationship with the logit of the dependent variable. Continuous
# predictors that do not pass a Box-Tidwell test are therefore dropped
columnNames = dfx.columns.tolist()
# A dataframe is created for testing the first assumption
dfAssumptionOne = pd.DataFrame()
# Log-transformed versions of the continuous predictors are calculated, e.g., Wind_Speed * Log(Wind_Speed)
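# (In a Box-Tidwell test, a statistically significant coefficient on the x * ln(x)
# interaction term indicates a non-linear relationship with the logit.)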
for predictor in columnNames:
# Categorical and discrete predictors are skipped
if (predictor != "Critical" and predictor != "Historical"
and predictor != "Military" and predictor != "Mining"
and predictor != "Nat_Parks" and predictor != "Trib_Land"
and predictor != "Wild_Refug" and predictor != "Bat_Count"
and predictor != "Bird_Count" and predictor != "Plant_Year"
and predictor != "Farm_Year" and predictor != "ISO_YN"
and predictor != "In_Tax_Cre" and predictor != "Tax_Prop"
and predictor != "Tax_Sale" and predictor != "Numb_Incen"
and predictor != "Rep_Wins" and predictor != "Dem_Wins"
and predictor != "Interconn" and predictor != "Net_Meter"
and predictor != "Renew_Port" and predictor != "Renew_Targ"
and predictor != "Numb_Pols" and predictor != "Foss_Lobbs"
and predictor != "Gree_Lobbs" and predictor != "supp_2018"):
dfAssumptionOne[predictor] = dfx[predictor].astype(float)
dfAssumptionOne[f'{predictor}:Log_{predictor}'] = dfx[predictor].apply(lambda x: float(x) * np.log(float(x)))
# Some of the continuous predictors (e.g., Land_Slope, Undevelopable_Land)
# sometimes do take a value of 0, and thus become NaN when log-transformed.
# Their log transformations are thus replaced with 0.
dfAssumptionOne = dfAssumptionOne.fillna(0)
# Column names of the non-transformed predictors only
dfNonTransformed = [col for col in dfAssumptionOne.columns if ":" not in col]
# Add a constant term
dfAssumptionOne = sm.add_constant(dfAssumptionOne, prepend=False)
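# NOTE: prepend=False appends the constant as the last column, which is why it can
# be stripped from the end of the p-value list further below.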
# A generalized linear model is constructed, first by adding a constant
# and then fitting to the continuous predictors and their log transformations
try:
logit_results = sm.GLM(dfy, dfAssumptionOne, family=sm.families.Binomial()).fit()
# If (quasi-)complete separation does not occur, then the linearity test
# can be completed
if logit_results:
# The p-values for each predictor
logit_pvalues = logit_results.pvalues
# The p-value of the constant (appended as the last column) is
# removed from the list
logit_pvalues = logit_pvalues[:-1]
# A Bonferroni correction is used to account for false positive
# statistical significance
bonferroni = 0.05/len(logit_pvalues)
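# For example, if 20 p-values remain after removing the constant, the corrected
# threshold is 0.05 / 20 = 0.0025.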
pdf.multi_cell(w=0, h=5.0, align='L',
txt="\nBonferroni-corrected p-value: " + str(bonferroni), border=0)
# Interest is in the p-values of the log-transformed predictors only, which
# are every other p-value in the list
logit_pvalues = logit_pvalues[1::2]
# Log-transformed predictors with a p-value less than the bonferroni
# correction are statistically significantly non-linear in their
# relationship with the logit of wind farm occurrence. These predictors
# must be dropped from the model.
assumptionOneDropped = []
for i in range(len(logit_pvalues)):
if logit_pvalues[i] < bonferroni:
assumptionOneDropped.append(dfNonTransformed[i])
# Results from the Box-Tidwell test are presented as a dataframe
pvalueList = logit_pvalues.tolist()
dfBoxTidwell = pd.DataFrame()
dfBoxTidwell["Predictor"] = dfNonTransformed
dfBoxTidwell["pval"] = pvalueList
dfBoxTidwell = dfBoxTidwell.sort_values("pval", ignore_index = True)
dfBoxTidwell = dfBoxTidwell.to_string(justify = 'center', col_space = 30, index = False)
pdf.multi_cell(w=0, h=5.0, align='L',
txt="\nResults of the Box-Tidwell test: \n", border=0)
pdf.multi_cell(w=0, h=5.0, align='R', txt=dfBoxTidwell, border = 0)
# When no predictors are dropped following the Box-Tidwell test,
# the following is written to the console output
if len(assumptionOneDropped) == 0:
pdf.multi_cell(w=0, h=5.0, align='L',
txt="\nPredictors to be removed based on a non-linear relationship "
+'\n'+ "with the logit of likelihood of wind farm occurrence: None", border=0)
# If predictors are dropped, the user is given the opportunity to
# ask whether they should be dropped
else:
print('''\nPredictors to be removed based on a non-linear relationship with '''
'''the logit of likelihood of wind farm occurrence: \n'''
+ str(assumptionOneDropped) +
'''\nDo you wish to remove these predictors from the model?''')
# Specify using Y or N whether the user would like to remove
# these predictors
def remove(values, message):
while True:
x = input(message)
if x in values:
remove.YesOrNo = x
break
else:
print("Invalid value; options are " + str(values))
remove(["Y", "N"], "Y or N:\n")
# If the user says yes, the predictors are dropped
if remove.YesOrNo == "Y":
# The respective columns are dropped from the dataframe
dfx = dfx.drop(columns = assumptionOneDropped)
pdf.multi_cell(w=0, h=5.0, align='L',
txt="\nPredictors removed from the model based on the results "
+'\n'+ "of the Box-Tidwell test: " + str(assumptionOneDropped), border=0)
else:
pdf.multi_cell(w=0, h=5.0, align='L',
txt="\nThe Box-Tidwell test would have dropped " + str(assumptionOneDropped)
+'\n'+ "from the model, though the user chose to retain them.", border=0)
# The variable holding the dropped predictors is emptied
assumptionOneDropped = []
# The linearity test can fail due to complete separation of the binary
# dependent variable resulting from the value of one or more predictors
except:
print('''\nComplete or quasi-complete separation has occurred due to the coarse '''
'''resolution and/or a lack of grid cells containing wind farms. The test for '''
'''linearity cannot be completed; do you still wish to continue the model run?''')
# Specify using Y or N whether the user would like to proceed anyway
def proceed(values, message):
while True:
x = input(message)
if x in values:
proceed.YesOrNo = x
break
else:
print("Invalid value; options are " + str(values))
proceed(["Y", "N"], "Y or N:\n")
# The code stops if the user specifies N
if proceed.YesOrNo != "Y":
sys.exit()
else:
pdf.multi_cell(w=0, h=5.0, align='L',
txt="\nThe Box-Tidwell test cannot complete due to (quasi-)complete separation "
+'\n'+ "of the logit of the depenent variable. Although the linearity of the model's "
+'\n'+ "predictors cannot be ascertained, the user has chosen to continue the model "
+'\n'+ "run. No predictors have been removed based on Assumption #1."
+'\n\n'+"The user has specified the following predictor configurations: "
+'\n'+ str(configList), border=0)
# No predictors were dropped
assumptionOneDropped = []
# The second assumption is the multicollinearity test, to ensure that
# all predictors have independent effects on WiFSS
pdf.multi_cell(w=0, h=5.0, align='L',
txt="\nAssumption #2: There is no multicollinearity, or pairwise collinearity, "
+'\n'+ "between the model's predictors, based on Variance Inflaction Factors (VIF).", border=0)
# First step is to normalize the predictors and grid cells that have been
# retained from applying the previous assumption
columnNames = dfx.columns.tolist()
dfNormalized = pd.DataFrame()
dfCategorical = pd.DataFrame()
# Categorical predictors must be separated from those that are to be
# normalized
for predictor in columnNames:
if (predictor == "Critical" or predictor == "Historical"
or predictor == "Military" or predictor == "Mining"
or predictor == "Nat_Parks" or predictor == "Trib_Land"
or predictor == "Wild_Refug" or predictor == "ISO_YN"
or predictor == "In_Tax_Cre" or predictor == "Tax_Prop"
or predictor == "Tax_Sale" or predictor == "Interconn"
or predictor == "Net_Meter" or predictor == "Renew_Port"):
dfCategorical[predictor] = dfx[predictor]
else:
dfNormalized[predictor] = dfx[predictor].astype(float)
# Quantitative data that take the same value at all grid cells should not
# be normalized, and are thus separated before normalization
nunique = dfNormalized.nunique()
doNotNormalize = nunique[nunique == 1].index.tolist()
dfNotNormalized = dfx[doNotNormalize]
# The normalization is executed using standard scores for each grid cell
dfNormalized = dfNormalized.drop(doNotNormalize, axis = 1).astype(float)
dfNormalized = (dfNormalized - dfNormalized.mean())/dfNormalized.std()
# The normalized and categorical columns can now be recombined
dfx = concat([dfNormalized,dfNotNormalized,dfCategorical], axis = 1)
# Multicollinearity is assessed by first creating a dataframe to hold
# VIF values, which are calculated for each predictor in the above
# multiple linear regression model.
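# For each predictor i, VIF_i = 1 / (1 - R_i^2), where R_i^2 is obtained by
# regressing predictor i on all of the other predictors.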
vif = DataFrame()
vif['Predictor'] = dfx.columns
vif['VIF'] = [variance_inflation_factor(dfx.values, i) for i in range(dfx.shape[1])]
# The VIF data are sorted, with each row showing the collinearity that each
# variable has with the others. The bigger the value, the bigger the
# multicollinearity and thus the less unique the effect the variable can have
# in the logistic regression model.
vif = vif.sort_values(["VIF"]).reset_index(drop = True)
vifGrouped = vif.to_string(justify = 'center', col_space = 25, index = False)
pdf.multi_cell(w=0, h=5.0, align='L',
txt="\nGrouped Multicollinearity Test Results:\n", border=0)
pdf.multi_cell(w=0, h=5.0, align='R', txt=vifGrouped, border = 0)
# A predictor with a VIF above 10 is considered to be too strongly correlated
# with the other predictors. These predictors are isolated.
vifCorrelate = vif[vif["VIF"] >= 10]
vifCorrelateList = vifCorrelate["Predictor"].tolist()
# Predictors that possess a value of NaN based on the multicollinearity test
# have the same value across all grid cells, and thus have no unique spatial
# influence on WiFSS. These predictors must also be isolated.
vifNaN = vif[vif["VIF"].isna()]
vifNaNList = vifNaN["Predictor"].tolist()
vifRemoveList = vifCorrelateList + vifNaNList
# Predictors are now tested for collinearity in pairwise combinations, rather
# than altogether as done above. VIF is calculated for each pair.
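# For a pair of predictors with Pearson correlation r, the pairwise VIF is 1 / (1 - r^2).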
vif = 1/(1-dfx.corr()**2)
# The upper triangle of the pairwise matrix is deleted, and VIF values
# are sorted by magnitude.
vif = vif.where(np.triu(np.ones(vif.shape),k=1).astype(bool))
vif = vif.stack().sort_values(axis = 0).reset_index()
vif.columns = ["Predictor1","Predictor2","VIF"]
vifPairs = vif.to_string(justify = 'center', col_space = 25, index = False, max_rows = 10)
pdf.multi_cell(w=0, h=5.0, align='L',
txt="\nPairwise Multicollinearity Test Results:\n", border=0)
pdf.multi_cell(w=0, h=5.0, align='R', txt=vifPairs, border = 0)
# Pairs of predictors with a VIF above 10 are isolated and added to a list.
vifPairwiseRemove = vif[vif["VIF"] >= 10]
var1List = vifPairwiseRemove["Predictor1"].tolist()
var2List = vifPairwiseRemove["Predictor2"].tolist()
vifPairwiseRemoveList = var1List + var2List
# The predictors from the multicollinearity and pairwise collinearity tests
# are combined and duplicates are removed
assumptionTwoDropped = [*set(vifRemoveList + vifPairwiseRemoveList)]
if len(assumptionTwoDropped) == 0:
pdf.multi_cell(w=0, h=5.0, align='L',
txt="\nPredictors to be removed based on multicollinearity: None", border=0)
# The user is given the opportunity to remove the multicollinear predictors
else:
print('''\nPredictors to be removed based on multicollinearity: \n'''
+ str(assumptionTwoDropped) +
'''\nDo you wish to remove these predictors from the model?''')
# Specify using Y or N whether the user would like to remove them
def remove(values, message):
while True:
x = input(message)
if x in values:
remove.YesOrNo = x
break
else:
print("Invalid value; options are " + str(values))
remove(["Y", "N"], "Y or N:\n")
# If the user says yes, the predictors are dropped
if remove.YesOrNo == "Y":
# The respective columns are dropped from the dataframe
dfx = dfx.drop(columns = assumptionTwoDropped)
pdf.multi_cell(w=0, h=5.0, align='L',
txt="\nPredictors to be removed from the model based on multicollinearity: \n" + str(assumptionTwoDropped), border=0)
else:
pdf.multi_cell(w=0, h=5.0, align='L',
txt="\nThe multicollinearity test would have dropped \n" + str(assumptionTwoDropped)
+'\n'+ "from the model, though the user chose to retain them.", border=0)
# The variable holding the dropped predictors is emptied
assumptionTwoDropped = []
# The final assumption is the test for outliers, to ensure that none of the
# data aggregated to the grid cells will bias the training and testing stages
# of this model
pdf.multi_cell(w=0, h=5.0, align='L',
txt="\nAssumption #3: None of the grid cells contain data that represent "
+'\n'+ "extreme outliers, based on a Cook's distance test.", border=0)
# An ordinary least squares regression fit between the wind turbine
# locations (dfy) and the grid cell data (dfx) is conducted,
# and identified outlying grid cells are recorded to be later dropped.
outlierTest = OLS(dfy,dfx).fit()
influence = outlierTest.get_influence()
cooks = influence.cooks_distance
cookIndex = []
count = 0
for i in range(len(cooks[1])):
if cooks[1][i] < 0.05:
cookIndex.append(i)
count = count + 1
# Final results of the three assumptions are written to the console output
pdf.multi_cell(w=0, h=5.0, align='L',
txt="\nNumber of grid cells removed due to outlying observations according to a "
+'\n'+ "Cook's distance test: " + str(count)
+'\n\n'+ "Final list of predictors that did not pass the model's three assumptions: "
+'\n'+ str(constantDropped + assumptionOneDropped + assumptionTwoDropped), border=0)
############### MAKING THE TRAINING AND TESTING DATASETS ##################
# The predictors used in the training and testing datasets depend on
# the configuration(s) selected by the end-user
for g in range(len(configList)):
# The No_Wind configuration requires that wind speed be dropped as a
# predictor before the model run starts
if configList[g] == "No_Wind":
dfxConfig = dfx.loc[:, dfx.columns != "Avg_Wind"]
# The Wind_Only configuration requires wind speed to be the only
# predictor that is retained
elif configList[g] == "Wind_Only":
try:
dfxConfig = dfx[["Avg_Wind"]]
except:
print('''\nAvg_Wind was removed as a predictor by the user when assumptions were tested, '''
'''\nmeaning the Wind_Only predictor configuration will not be used.''')
pdf.multi_cell(w=0, h=5.0, align='L',
txt="\nAvg_Wind was removed as a predictor by the user when assumptions were tested, "
+"\n"+"meaning the Wind_Only predictor configuration will not be used.", border=0)
configList.remove(configList[g])
break
# For the Full and Reduced configurations, no predictors need to be
# pre-emptively removed
elif configList[g] == "Full" or configList[g] == "Reduced":
dfxConfig = dfx
# Numpy arrays are constructed for each of the predictors that will inform
# the logistic regression model.
predictorArrays = [DataFrame(dfxConfig.iloc[:,i]).to_numpy().reshape(-1,1) for i in range(len(dfxConfig.columns))]
# The predictors are concatenated into the same array
dfxArray = np.concatenate((predictorArrays[0:len(predictorArrays)]), axis = 1)
# Console output to signify the beginning of outputs from a predictor configuration
pdf.multi_cell(w=0, h=5.0, align='L',
txt="\n ################# " + configList[g] + " Configuration Output Begins ################# \n", border=0)
# The names of the predictors retained by the model are added to a
# separate list, which will be used for holding the coefficients should
# the user wish to employ a cellular automaton in the model's second
# script
predictorCodeNames = dfxConfig.columns.tolist()
######################## MODEL CALIBRATION #########################
pdf.multi_cell(w=0, h=5.0, align='L',
txt="\n--------------- MODEL CALIBRATION (Training Data): " + configList[g] + " Configuration ---------------", border=0)
# These two lists will hold the log-likelihood scores obtained from each
# combination of grid cells (30) used to separately train the model
trainedModelScores = []
trainedModelAppend = trainedModelScores.append
nullModelScores = []
nullModelAppend = nullModelScores.append
# This list holds the intercept of the logistic regression model, which
# will be called upon when computing each grid cell's probability
interceptList = []
interceptAppend = interceptList.append
# This list holds the McFadden's Adjusted Pseudo R-squared scores for each
# of the 30 calibration runs of the model
rSquaredList = []
rSquaredAppend = rSquaredList.append
# An empty list to hold the coefficient for each predictor produced
# over the 30 calibration model runs
coefList = []
coefAppend = coefList.append
# This list holds the Area Under Curve statistic computed from plotting
# a Receiver Operating Characteristic curve for each sample of testing
# (validation) grid cells
aucList = []
aucAppend = aucList.append
# This list holds the classification threshold from the ROC curve at which
# true positive classification of grid cells as containing wind farms
# (in the testing dataset) maximizes, and false positive classification
# minimizes
thresholdList = []
thresholdAppend = thresholdList.append
# These lists hold the false positive rate and true positive rates
# obtained from classifying the training data at the various thresholds
# in the list above
fprList = []
fprAppend = fprList.append
tprList = []
tprAppend = tprList.append
# This list holds a confusion matrix of the true positive, false positive,
# true negative, and false negative predictions of the state of grid cells
# in the testing dataset
cmList = []
cmAppend = cmList.append
# The number of degrees of freedom that the model possesses
degrees = []
degreesAppend = degrees.append
# The count and pvalues must also be saved from running the function
countList = []
countAppend = countList.append
pValCountList = []
pValCountAppend = pValCountList.append
print("\n" + configList[g] + " model training and testing in progress...")
# The logistic regression model is run 30 times, in order to account for
# different combinations of training and testing grid cells and to diagnose
# an average model performance
def trainingTestingModel(allData):
# This count variable will keep track of how frequently the model
# outperforms the null model (intercept-only) for different training
# grid cell combinations
count = 0
# This count variable will keep track of how many times the model's
# outperformance of the null model is statistically significant (p < 0.05)
pValCount = 0
# 75% of the grid cells train the model, and 25% test it. The datasets
# are stratified such that equal numbers of grid cells containing wind
# turbines exist in the training and testing datasets
X_train, X_test, y_train, y_test = train_test_split(allData, dfy, train_size = 0.75, stratify = dfy)
# The logistic regression model is constructed. The regularization parameter C
# controls overfitting. A balanced class weight weights the two classes inversely
# to their frequencies, rather than applying a unit weight of 1 to each class.
model = LogisticRegression(solver = "liblinear", C = 1, class_weight = "balanced")
# The model is fitted to the training grid cells. The training grid
# cells define the model's coefficients, i.e., the associations that
# will predict whether a grid cell contains a wind turbine or not.
model.fit(X_train,y_train)
# The coefficients from the model run are saved to the empty list
coefAppend(model.coef_)
# The intercept from this trained model defines the null model. The
# intercept is extended to the same length as the training dataset
intercept = model.intercept_.tolist()
intercept = [i for i in intercept for r in range(len(y_train))]
interceptAppend(intercept)
# A likelihood-ratio test for performance against the null model is
# needed to determine how much better this trained model can correctly
# predict the existence of wind farms in the training grid cells.
# The log-likelihood of the trained model is calculated first
X_train = sm.add_constant(X_train)
trained_model = OLS(y_train,X_train).fit()
trained_ll = trained_model.llf
# Next the log-likelihood of the null model is computed
null_model = OLS(y_train,intercept).fit()
null_ll = null_model.llf
# These two log-likelihood scores are saved to empty lists
trainedModelAppend(trained_ll)
nullModelAppend(null_ll)
# The goodness-of-fit of the trained model is computed in terms of
# McFadden's Adjusted Pseudo R-squared, and then added to its
# respective list. The statistic's numerator is penalized for its large
# number of predictors.
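# McFadden's adjusted pseudo R-squared: 1 - (ll_trained - K) / ll_null, where K is
# the number of fitted parameters (columns of X_train, including the constant).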
trained_rsquared = 1 - (trained_ll - X_train.shape[1])/null_ll
rSquaredAppend(trained_rsquared)
# The count keeps track of how many times the trained model has better
# goodness-of-fit than the null model to the training grid cells.
if trained_ll > null_ll:
count = count + 1
# The likelihood ratio of the null and trained models' likelihood scores
# is computed, and the associated p-value based on a chi-square test.
# The number of predictors determines the number of degrees of freedom.
LR_trained_vs_null = -2*(null_ll-trained_ll)
p_value = chi2.sf(LR_trained_vs_null, len(X_train[0]))
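# chi2.sf is the chi-square survival function (1 - CDF), evaluated here with degrees
# of freedom equal to the number of columns in X_train.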
# The p-value count variable is now updated
if p_value < 0.05:
pValCount = pValCount + 1
# A Receiver Operating Characteristic is calculated to illustrate how
# effective the trained model is at correctly classifying the testing grid
# cells as containing wind farms (true positive rate versus false
# positive rate).
y_pred_proba = model.predict_proba(X_test)[::,1]
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)
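# The optimal classification threshold maximizes Youden's J statistic (TPR - FPR).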
thresh = thresholds[np.argmax(tpr-fpr)]
# The true/false positive rates, the area under curve and the optimal
# classification thresholds are saved to their respective lists
fprAppend(fpr)
tprAppend(tpr)
aucAppend(auc)
thresholdAppend(thresh)
# The model is used to predict whether testing grid cells should contain
# wind turbines, based on the predictor values in the testing grid cells and
# the optimal classification threshold from the ROC curve.
y_pred = (model.predict_proba(X_test)[:, 1] > thresh).astype('float')
# The actual and predicted model performance for each model run, as well
# as a confusion matrix, are saved.
cm = confusion_matrix(y_test, y_pred)
cmAppend(cm)
# The number of degrees of freedom will be needed to assess performance
degreesAppend(len(X_train[0]))
countAppend(count)
pValCountAppend(pValCount)
# The trained and tested model is iterated 30 times
[trainingTestingModel(dfxArray) for i in tqdm(range(30))]
# The Reduced predictor configuration first requires determining
# which combination of predictors maximizes the model's predictive power
if configList[g] == "Reduced":
print("\nPlease wait while the model determines the importance of each predictor...\n")
# These empty lists hold the Reduced model configuration's performance
# compared to the Full configuration (across 30 runs), and the
# statistical significance of any reduction in performance
likelihoodRatioList = []
likelihoodRatioAppend = likelihoodRatioList.append
pValueList = []
pValueAppend = pValueList.append
# Removal of predictors is performed 30 times in order to obtain a median
# reduction of model performance that accounts for randomness
def iteration():
# This list will hold the log likelihood scores of the model as predictors
# are removed with replacement
reducedLogLikelihoods = []
reducedLogAppend = reducedLogLikelihoods.append
# The effect of removing each predictor in turn on the model's output
# needs to be determined
def predictorRemoval(reducedData):
# A version of the training and testing datasets is built with
# one predictor excluded
X_train_red, X_test_red, y_train_red, y_test_red = train_test_split(reducedData, dfy, train_size = 0.75, stratify = dfy)
# The logistic regression model is refitted using these new training
# and testing datasets
model = LogisticRegression(solver = "liblinear", C = 1, class_weight = "balanced")
model.fit(X_train_red,y_train_red)
# The log-likelihood of the Reduced model's goodness-of-fit
# is computed in the same way that it was for the other
# predictor configurations
red_model = OLS(y_train_red,X_train_red).fit()
red_ll = red_model.llf
reducedLogAppend(red_ll)
# The log likelihood scores and likelihood ratios obtained by removing
# each predictor with replacement are generated
[predictorRemoval(np.delete(dfxArray,h,1)) for h in range(len(dfxArray[0]))]
# Likelihood ratios between the Reduced log likelihoods and the median
# log likelihood score from the Full predictor configuration are computed.
# Quantization of log likelihood scores in this manner is common
# in signal processing (Liu et al., 2010)
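# Likelihood ratio statistic: LR = -2 * (ll_reduced - ll_full_median); positive values
# indicate that removing the predictor worsened the model's fit.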
fullVersusReducedLRs = -2*(reducedLogLikelihoods - np.median(trainedModelScores))
# The statistical significance of the change in the model's likelihood ratio
# after removing each predictor with replacement is computed using a
# chi-square test
pValues = chi2.sf(fullVersusReducedLRs, len(fullVersusReducedLRs)-1)
# The computed likelihood ratios and p-values are saved
likelihoodRatioAppend(fullVersusReducedLRs)
pValueAppend(pValues)
# The removal of predictors is iterated 30 times
[iteration() for i in tqdm(range(30))]
# The number of times the model's performance was worsened by removing each
# predictor with replacement is computed.
likelihoodRatioList = np.asarray(likelihoodRatioList)
likelihoodRatioCounts = np.count_nonzero(likelihoodRatioList >= 0, axis = 0)
# Same as above, but this time for the number of times the worsened model
# performance exceeded the stopping criterion (p < 0.5). Used to inform
# predictor removal, with value derived from Mahmood et al. (2016).
pValueList = np.asarray(pValueList)
pValueCounts = np.count_nonzero(pValueList < 0.5, axis = 0)
# A dataframe is constructed that summarizes the lower goodness-of-fit
# of each version of the Reduced predictor configuration when compared
# to the Full predictor configuration.
dfReducedModel = DataFrame()
dfReducedModel["Predictors"] = dfxConfig.columns.tolist()
dfReducedModel["Reduced_Fit"] = likelihoodRatioCounts
dfReducedModel["Stop_Criterion"] = pValueCounts
dfReducedModel = dfReducedModel.sort_values(by = ["Reduced_Fit","Stop_Criterion"], ascending = False, ignore_index = True)
# The predictors ordered from most to least impactful upon their
# removal are assigned to a list
finalPredictors = dfReducedModel["Predictors"].tolist()
# This dataframe is written to the console output.
dfReducedModel = dfReducedModel.to_string(justify = 'center', col_space = 25, index = False)
pdf.multi_cell(w=0, h=5.0, align='L',
txt="\nDataframe showing the lowered goodness-of-fit caused by removing each predictor"
+"\n"+"with replacement over 30 model runs. The columns show the number of times removal of "
+"\n"+"each predictor reduced the model's goodness-of-fit, and the number of times this "
+"\n"+"reduction exceeded a p < 0.5 stopping criterion:\n\n",border=0)
pdf.multi_cell(w=0, h=5.0, align='R',txt="\n"+ dfReducedModel, border=0)
# Lists that will be used to create a dataframe summarizing the predictive
# power of each set of model predictors
numPredictors = list(range(1,len(finalPredictors)+1))
medAccuracyList = []
medAccuracyAppend = medAccuracyList.append
trueVersusFalseList = []
trueVersusFalseAppend = trueVersusFalseList.append
# Predictors are inserted into the Reduced model from the most to least
# impactful on the full model's goodness of fit, in order to determine the
# number of predictors that maximizes (minimizes) the model's true positive
# (false positive) rate.
print("\nPlease wait while the model's accuracy with different predictor combinations is assessed...\n")
def predictorCombos():
for h in tqdm(range(len(numPredictors))):
# Dataframe is sliced to contain only the predictors of interest
dfReduced = dfxConfig[finalPredictors[0:h+1]]