In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from scipy.stats import levene, ttest_ind
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, plot_confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text
import math
import numpy as np
import os
import pandas as pd
import requests
import seaborn as sns
import statsmodels.api as sm
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation notices raised by the libraries imported above.
warnings.filterwarnings("ignore")

In [2]:
# Make 16x9 the default figure size for the plots drawn in this notebook.
sns.set(rc={'figure.figsize':(16,9)})

In [3]:
# Load the raw training table; the path is relative to the working directory.
df = pd.read_csv('train_data.csv')

In [18]:
# Preview the first 25 rows of the raw frame.
df.head(25)

Unnamed: 0,index,lat,lon,startdate,contest-pevpr-sfc-gauss-14d__pevpr,nmme0-tmp2m-34w__cancm30,nmme0-tmp2m-34w__cancm40,nmme0-tmp2m-34w__ccsm30,nmme0-tmp2m-34w__ccsm40,nmme0-tmp2m-34w__cfsv20,...,wind-vwnd-925-2010-11,wind-vwnd-925-2010-12,wind-vwnd-925-2010-13,wind-vwnd-925-2010-14,wind-vwnd-925-2010-15,wind-vwnd-925-2010-16,wind-vwnd-925-2010-17,wind-vwnd-925-2010-18,wind-vwnd-925-2010-19,wind-vwnd-925-2010-20
0,0,0.0,0.833333,9/1/14,237.0,29.02,31.64,29.57,30.73,29.71,...,-27.68,-37.21,8.32,9.56,-2.03,48.13,28.09,-13.5,11.9,4.58
1,1,0.0,0.833333,9/2/14,228.9,29.02,31.64,29.57,30.73,29.71,...,-21.13,-36.57,8.77,21.17,4.44,48.6,27.41,-23.77,15.44,3.42
2,2,0.0,0.833333,9/3/14,220.69,29.02,31.64,29.57,30.73,29.71,...,-10.72,-34.16,6.99,32.16,5.01,48.53,19.21,-33.16,15.11,4.82
3,3,0.0,0.833333,9/4/14,225.28,29.02,31.64,29.57,30.73,29.71,...,0.33,-31.04,6.17,39.66,-1.41,50.59,8.29,-37.22,18.24,9.74
4,4,0.0,0.833333,9/5/14,237.24,29.02,31.64,29.57,30.73,29.71,...,9.83,-31.8,7.47,38.62,-5.21,54.73,-2.58,-42.3,21.91,10.95
5,5,0.0,0.833333,9/6/14,237.87,29.02,31.64,29.57,30.73,29.71,...,18.5,-31.46,9.53,38.36,-7.64,59.5,-11.27,-52.07,23.96,12.09
6,6,0.0,0.833333,9/7/14,236.36,29.02,31.64,29.57,30.73,29.71,...,21.72,-22.72,17.99,38.1,-7.09,63.29,-16.52,-52.61,31.4,12.76
7,7,0.0,0.833333,9/8/14,233.36,29.02,31.64,29.57,30.73,29.71,...,21.4,-12.81,30.81,36.88,-5.39,64.69,-18.54,-48.94,39.8,11.65
8,8,0.0,0.833333,9/9/14,233.82,29.02,31.64,29.57,30.73,29.71,...,18.79,-3.38,45.85,34.57,-2.46,62.43,-19.59,-45.81,46.65,8.39
9,9,0.0,0.833333,9/10/14,229.74,29.02,31.64,29.57,30.73,29.71,...,20.08,-5.34,62.8,26.96,-3.72,63.91,-27.55,-40.83,46.77,4.96


In [19]:
# Columns kept for the analysis: site descriptors, position, date and the
# "contest-*" measurements. Listed one per line so additions diff cleanly.
contest_columns = [
    'climateregions__climateregion',
    'elevation__elevation',
    'lat',
    'lon',
    'startdate',
    'contest-pevpr-sfc-gauss-14d__pevpr',
    'contest-precip-14d__precip',
    'contest-pres-sfc-gauss-14d__pres',
    'contest-prwtr-eatm-14d__prwtr',
    'contest-rhum-sig995-14d__rhum',
    'contest-slp-14d__slp',
    'contest-tmp2m-14d__tmp2m',
    'contest-wind-h10-14d__wind-hgt-10',
    'contest-wind-h100-14d__wind-hgt-100',
    'contest-wind-h500-14d__wind-hgt-500',
    'contest-wind-h850-14d__wind-hgt-850',
    'contest-wind-uwnd-250-14d__wind-uwnd-250',
    'contest-wind-uwnd-925-14d__wind-uwnd-925',
    'contest-wind-vwnd-250-14d__wind-vwnd-250',
    'contest-wind-vwnd-925-14d__wind-vwnd-925',
]
# .copy() detaches the subset from `df` so later edits do not touch the raw frame.
contest_df = df.loc[:, contest_columns].copy()
contest_df

Unnamed: 0,climateregions__climateregion,elevation__elevation,lat,lon,startdate,contest-pevpr-sfc-gauss-14d__pevpr,contest-precip-14d__precip,contest-pres-sfc-gauss-14d__pres,contest-prwtr-eatm-14d__prwtr,contest-rhum-sig995-14d__rhum,contest-slp-14d__slp,contest-tmp2m-14d__tmp2m,contest-wind-h10-14d__wind-hgt-10,contest-wind-h100-14d__wind-hgt-100,contest-wind-h500-14d__wind-hgt-500,contest-wind-h850-14d__wind-hgt-850,contest-wind-uwnd-250-14d__wind-uwnd-250,contest-wind-uwnd-925-14d__wind-uwnd-925,contest-wind-vwnd-250-14d__wind-vwnd-250,contest-wind-vwnd-925-14d__wind-vwnd-925
0,BSh,200,0.0,0.833333,9/1/14,237.00,94.31,98644.97,42.45,81.72,101352.08,28.744480,31246.63,16666.81,5899.66,1535.52,-2.56,-5.22,-3.52,4.41
1,BSh,200,0.0,0.833333,9/2/14,228.90,100.85,98686.80,42.66,82.56,101396.02,28.370585,31244.78,16667.31,5901.03,1538.00,-2.39,-5.20,-4.49,3.74
2,BSh,200,0.0,0.833333,9/3/14,220.69,101.25,98712.85,43.23,83.29,101429.25,28.133059,31239.27,16668.39,5902.18,1540.32,-2.76,-5.00,-5.44,3.40
3,BSh,200,0.0,0.833333,9/4/14,225.28,101.90,98711.70,43.11,83.26,101440.85,28.256798,31232.86,16667.39,5903.07,1541.10,-3.00,-4.61,-5.76,3.29
4,BSh,200,0.0,0.833333,9/5/14,237.24,82.95,98686.46,42.98,82.50,101419.53,28.372353,31226.16,16665.65,5903.36,1539.73,-3.40,-4.25,-6.09,3.27
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375729,Dfb,100,1.0,0.866667,8/27/16,312.05,57.45,97613.96,24.32,70.25,101373.91,17.150954,31484.23,16540.22,5763.23,1501.92,23.20,1.32,9.59,2.96
375730,Dfb,100,1.0,0.866667,8/28/16,305.82,53.53,97631.29,23.92,71.08,101397.77,16.962051,31476.67,16534.71,5760.19,1502.70,23.18,1.40,7.14,2.30
375731,Dfb,100,1.0,0.866667,8/29/16,311.62,52.12,97588.69,23.94,69.74,101368.67,16.915474,31464.71,16526.98,5754.76,1499.57,24.49,1.75,7.05,2.19
375732,Dfb,100,1.0,0.866667,8/30/16,304.54,51.73,97538.62,23.61,69.71,101321.24,16.536761,31448.83,16512.39,5742.21,1493.14,25.80,1.84,7.74,1.88


In [20]:
# Map the raw dataset column names to short, readable ones that are easier
# to type and to use in later selections.
readable_names = {
    'climateregions__climateregion': 'region',
    'elevation__elevation': 'elevation',
    'contest-pevpr-sfc-gauss-14d__pevpr': 'potential_evap',
    'contest-precip-14d__precip': 'precip',
    'contest-pres-sfc-gauss-14d__pres': 'barometric_pressure',
    'contest-prwtr-eatm-14d__prwtr': 'all_atmos_precip',
    'contest-rhum-sig995-14d__rhum': 'relative_humidity',
    'contest-slp-14d__slp': 'sea_level_press',
    'contest-tmp2m-14d__tmp2m': 'mean_temp',
    'contest-wind-h10-14d__wind-hgt-10': 'height_10_mb',
    'contest-wind-h100-14d__wind-hgt-100': 'height_100_mb',
    'contest-wind-h500-14d__wind-hgt-500': 'height_500_mb',
    'contest-wind-h850-14d__wind-hgt-850': 'height_850_mb',
    'contest-wind-uwnd-250-14d__wind-uwnd-250': 'zonal_wind_250mb',
    'contest-wind-uwnd-925-14d__wind-uwnd-925': 'zonal_wind_925mb',
    'contest-wind-vwnd-250-14d__wind-vwnd-250': 'long_wind_250mb',
    'contest-wind-vwnd-925-14d__wind-vwnd-925': 'long_wind_925mb',
}
# rename() returns a new frame; columns not listed (lat, lon, startdate) keep their names.
contest_df = contest_df.rename(columns=readable_names)

In [21]:
# Confirm the renamed column labels.
contest_df.columns

Index(['region', 'elevation', 'lat', 'lon', 'startdate', 'potential_evap',
       'precip', 'barometric_pressure', 'all_atmos_precip',
       'relative_humidity', 'sea_level_press', 'mean_temp', 'height_10_mb',
       'height_100_mb', 'height_500_mb', 'height_850_mb', 'zonal_wind_250mb',
       'zonal_wind_925mb', 'long_wind_250mb', 'long_wind_925mb'],
      dtype='object')

In [22]:
# Per-column non-null counts and dtypes.
contest_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 375734 entries, 0 to 375733
Data columns (total 20 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   region               375734 non-null  object 
 1   elevation            375734 non-null  int64  
 2   lat                  375734 non-null  float64
 3   lon                  375734 non-null  float64
 4   startdate            375734 non-null  object 
 5   potential_evap       375734 non-null  float64
 6   precip               375734 non-null  float64
 7   barometric_pressure  375734 non-null  float64
 8   all_atmos_precip     375734 non-null  float64
 9   relative_humidity    375734 non-null  float64
 10  sea_level_press      375734 non-null  float64
 11  mean_temp            375734 non-null  float64
 12  height_10_mb         375734 non-null  float64
 13  height_100_mb        375734 non-null  float64
 14  height_500_mb        375734 non-null  float64
 15  height_850_mb    

In [23]:
# Inspect the row index of the subset.
contest_df.index

RangeIndex(start=0, stop=375734, step=1)

In [24]:
# Column dtypes only (the same information as the Dtype column of .info()).
contest_df.dtypes

region                  object
elevation                int64
lat                    float64
lon                    float64
startdate               object
potential_evap         float64
precip                 float64
barometric_pressure    float64
all_atmos_precip       float64
relative_humidity      float64
sea_level_press        float64
mean_temp              float64
height_10_mb           float64
height_100_mb          float64
height_500_mb          float64
height_850_mb          float64
zonal_wind_250mb       float64
zonal_wind_925mb       float64
long_wind_250mb        float64
long_wind_925mb        float64
dtype: object

In [25]:
# (rows, columns) of the subset.
contest_df.shape

(375734, 20)

In [26]:
# Summary statistics for the numeric columns.
contest_df.describe()

Unnamed: 0,elevation,lat,lon,potential_evap,precip,barometric_pressure,all_atmos_precip,relative_humidity,sea_level_press,mean_temp,height_10_mb,height_100_mb,height_500_mb,height_850_mb,zonal_wind_250mb,zonal_wind_925mb,long_wind_250mb,long_wind_925mb
count,375734.0,375734.0,375734.0,375734.0,375734.0,375734.0,375734.0,375734.0,375734.0,375734.0,375734.0,375734.0,375734.0,375734.0,375734.0,375734.0,375734.0,375734.0
mean,1078.657588,0.592766,0.517964,276.744064,22.859842,89235.103399,14.298038,62.766512,101605.888627,11.858568,31071.557568,16404.561991,5728.370672,1500.148857,19.46755,1.035913,-0.800131,1.006341
std,669.219573,0.251744,0.272059,198.085182,32.109036,6509.231609,8.142212,17.399704,406.331364,9.867901,343.019882,196.007401,123.805703,28.77481,8.845528,1.696025,8.723436,2.086804
min,0.0,0.0,0.0,-2.31,0.0,74839.94,2.7,13.25,100379.21,-20.358963,29943.15,15750.12,5289.25,1353.54,-10.06,-6.84,-32.88,-8.0
25%,500.0,0.409091,0.3,104.0625,3.79,83598.7175,8.35,50.09,101306.62,4.818231,30811.99,16252.3225,5644.07,1484.78,13.7,-0.08,-6.26,-0.2
50%,900.0,0.590909,0.533333,250.32,12.18,89444.93,12.05,64.91,101511.245,12.28577,31016.82,16381.73,5732.22,1503.56,19.64,0.95,-0.28,0.77
75%,1700.0,0.818182,0.766667,420.01,28.84,94683.83,17.82,77.41,101844.87,19.44117,31374.78,16585.4275,5827.68,1519.67,25.4,2.08,5.12,2.01
max,3100.0,1.0,1.0,1185.14,502.63,102845.57,52.66,98.22,103275.84,37.238782,31803.03,16762.8,5958.2,1588.91,52.97,8.11,29.22,11.88


In [27]:
# Report, for every column of the frame, how many entries are exactly zero.
for name, values in contest_df.items():
    # Comparing to 0 gives a boolean Series; summing it counts the True flags.
    zero_total = values.eq(0).sum()
    print('Count of zeros in column ', name, ' is : ', zero_total)

Count of zeros in column  region  is :  0
Count of zeros in column  elevation  is :  2193
Count of zeros in column  lat  is :  1462
Count of zeros in column  lon  is :  5848
Count of zeros in column  startdate  is :  0
Count of zeros in column  potential_evap  is :  0
Count of zeros in column  precip  is :  11683
Count of zeros in column  barometric_pressure  is :  0
Count of zeros in column  all_atmos_precip  is :  0
Count of zeros in column  relative_humidity  is :  0
Count of zeros in column  sea_level_press  is :  0
Count of zeros in column  mean_temp  is :  0
Count of zeros in column  height_10_mb  is :  0
Count of zeros in column  height_100_mb  is :  0
Count of zeros in column  height_500_mb  is :  0
Count of zeros in column  height_850_mb  is :  0
Count of zeros in column  zonal_wind_250mb  is :  23
Count of zeros in column  zonal_wind_925mb  is :  872
Count of zeros in column  long_wind_250mb  is :  188
Count of zeros in column  long_wind_925mb  is :  863


In [28]:
# Share of missing values per column (the mean of the null flags is the fraction missing).
missing_share = contest_df.isna().mean()
# One print call; sep="\n" reproduces the header / table / blank-line layout.
print("Missing values distribution: ", missing_share, "", sep="\n")

Missing values distribution: 
region                 0.0
elevation              0.0
lat                    0.0
lon                    0.0
startdate              0.0
potential_evap         0.0
precip                 0.0
barometric_pressure    0.0
all_atmos_precip       0.0
relative_humidity      0.0
sea_level_press        0.0
mean_temp              0.0
height_10_mb           0.0
height_100_mb          0.0
height_500_mb          0.0
height_850_mb          0.0
zonal_wind_250mb       0.0
zonal_wind_925mb       0.0
long_wind_250mb        0.0
long_wind_925mb        0.0
dtype: float64



In [16]:
# use pandas hist()
#contest_df.hist()

In [29]:
# Lazy grouping of the rows by climate region; nothing is aggregated yet.
region_df = contest_df.groupby(["region"])

In [30]:
# Count the zeros in every column, broken down by climate region.
# A DataFrameGroupBy has no `.columns`, so the original per-column loop could
# not run. Instead, flag the zero entries on the underlying frame first and
# then sum those boolean flags within each region.
zeros_by_region = (
    contest_df.drop(columns="region")   # the grouping key itself is not counted
    .eq(0)                              # True where the entry is exactly zero
    .groupby(contest_df["region"])      # aligned on the shared row index
    .sum()                              # number of True flags per region/column
)
zeros_by_region

In [33]:
# Number of rows per region whose 250mb zonal wind is exactly zero.
# GroupBy.count() accepts no mask argument, so build the boolean flag first
# and sum it within each region (regions without zeros show up as 0).
z_by_region = (
    contest_df['zonal_wind_250mb']
    .eq(0)
    .groupby(contest_df['region'])
    .sum()
)

In [35]:
# Group handle keyed by climate region.
grp = contest_df.groupby(by=['region'])

In [38]:
# get_group() expects a single group key (a region label), not a whole column,
# which is why passing the Series raised "unhashable type".
# The aim here is the rows with a zero 250mb zonal wind, so select them with a
# boolean mask; the `region` column is kept for any later per-region breakdown.
selected_group = contest_df.loc[contest_df['zonal_wind_250mb'] == 0]
selected_group

In [None]:
# Rows of the selection whose 250mb zonal wind is exactly zero.
# The column is float64, so compare with the number 0, not the string '0'
# (the original line also had an unbalanced closing bracket).
selected_group[selected_group['zonal_wind_250mb'] == 0]

In [39]:
# query() is a DataFrame method and needs access to the wind column, so filter
# the frame first and pick `region` afterwards. The comparison is numeric
# because zonal_wind_250mb is a float column.
contest_df.query('zonal_wind_250mb == 0')['region']

In [40]:
# Group rows by every (region, zonal_wind_250mb) value pair.
regions = contest_df.groupby(by=['region', 'zonal_wind_250mb'])

In [41]:
# A DataFrameGroupBy has no .query(). `regions` is keyed by
# (region, zonal_wind_250mb), so count the rows in every group and keep only
# the entries whose wind key is 0, giving the zero count per region.
# Raises KeyError if no row has a zero wind value.
regions.size().xs(0, level='zonal_wind_250mb')

In [43]:
# `groupby` is a method (called, not subscripted) and a string column cannot be
# combined with a boolean mask using `&`. Filter the zero-wind rows first and
# then group what is left by region.
region_df = contest_df[contest_df['zonal_wind_250mb'] == 0].groupby('region')

In [None]:
# Boolean flag per row: True where the 250mb zonal wind is exactly zero.
mask = contest_df['zonal_wind_250mb'].eq(0)
# Show only the flagged rows.
contest_df.loc[mask]

In [58]:
# Region label of every row whose 250mb zonal wind is exactly zero
# (row filter and column pick in a single .loc call).
df1 = contest_df.loc[contest_df['zonal_wind_250mb'] == 0, 'region']

In [59]:
# Number of zero-valued zonal_wind_250mb rows per region, most frequent first.
df1.value_counts()


Cfa    11
BSk     4
Dfb     3
BWk     1
Csa     1
Dfa     1
Csb     1
Dfc     1
Name: region, dtype: int64

In [60]:
# Region label of every row whose 925mb zonal wind is exactly zero.
df2 = contest_df.loc[contest_df['zonal_wind_925mb'] == 0, 'region']

In [61]:
# Number of zero-valued zonal_wind_925mb rows per region, most frequent first.
df2.value_counts()

BSk    352
Cfa    128
Csb    120
Dfb     79
BWk     44
Dfa     44
BWh     29
Csa     20
Dfc     20
Dsb     11
Cfb     10
BSh      7
Dsc      5
Dwa      2
Dwb      1
Name: region, dtype: int64

In [62]:
# Region label of every row whose long_wind_250mb value is exactly zero.
df3 = contest_df.loc[contest_df['long_wind_250mb'] == 0, 'region']

In [63]:
# Number of zero-valued long_wind_250mb rows per region, most frequent first.
df3.value_counts()

BSk    70
Dfb    29
Cfa    27
Csb    21
Dfa    16
BWh     5
Dfc     5
Csa     4
Dsb     4
BWk     3
Cfb     2
BSh     1
Dsc     1
Name: region, dtype: int64

In [64]:
# Region label of every row whose long_wind_925mb value is exactly zero.
df4 = contest_df.loc[contest_df['long_wind_925mb'] == 0, 'region']

In [65]:
# Number of zero-valued long_wind_925mb rows per region, most frequent first.
df4.value_counts()

BSk    377
Dfb    145
Csb     61
BWk     46
Dfa     44
Dfc     43
Cfa     41
BWh     30
Csa     25
Dsb     24
Cfb      8
Dsc      8
BSh      6
Dwa      4
Dwb      1
Name: region, dtype: int64

In [66]:
# Stack the four per-column region Series into one; ignore_index renumbers the
# rows 0..n-1. A row that is zero in several wind columns appears once per column.
zero_wind_regions = [df1, df2, df3, df4]
df5 = pd.concat(zero_wind_regions, ignore_index=True)

In [67]:
# Combined count of zero wind readings per region across the four wind columns.
df5.value_counts()

BSk    803
Dfb    256
Cfa    207
Csb    203
Dfa    105
BWk     94
Dfc     69
BWh     64
Csa     50
Dsb     39
Cfb     20
BSh     14
Dsc     14
Dwa      6
Dwb      2
Name: region, dtype: int64