In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import Ridge, SGDRegressor, LinearRegression
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, ExtraTreesRegressor,AdaBoostRegressor, VotingRegressor, StackingRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, make_pipeline

from sklearn.metrics import mean_squared_error, mean_absolute_error,r2_score

In [2]:
train_set = pd.read_csv("/kaggle/input/playground-series-s4e5/train.csv")
test_set = pd.read_csv("/kaggle/input/playground-series-s4e5/test.csv")

In [3]:
train_set.nunique()

id                                 1117957
MonsoonIntensity                        17
TopographyDrainage                      19
RiverManagement                         17
Deforestation                           18
Urbanization                            18
ClimateChange                           18
DamsQuality                             17
Siltation                               17
AgriculturalPractices                   17
Encroachments                           19
IneffectiveDisasterPreparedness         17
DrainageSystems                         18
CoastalVulnerability                    18
Landslides                              17
Watersheds                              17
DeterioratingInfrastructure             18
PopulationScore                         19
WetlandLoss                             20
InadequatePlanning                      17
PoliticalFactors                        17
FloodProbability                        83
dtype: int64

In [4]:
test_set.nunique()

id                                 745305
MonsoonIntensity                       17
TopographyDrainage                     18
RiverManagement                        17
Deforestation                          18
Urbanization                           18
ClimateChange                          18
DamsQuality                            17
Siltation                              17
AgriculturalPractices                  17
Encroachments                          18
IneffectiveDisasterPreparedness        17
DrainageSystems                        18
CoastalVulnerability                   18
Landslides                             17
Watersheds                             17
DeterioratingInfrastructure            18
PopulationScore                        20
WetlandLoss                            20
InadequatePlanning                     17
PoliticalFactors                       17
dtype: int64

In [5]:
train_set

Unnamed: 0,id,MonsoonIntensity,TopographyDrainage,RiverManagement,Deforestation,Urbanization,ClimateChange,DamsQuality,Siltation,AgriculturalPractices,...,DrainageSystems,CoastalVulnerability,Landslides,Watersheds,DeterioratingInfrastructure,PopulationScore,WetlandLoss,InadequatePlanning,PoliticalFactors,FloodProbability
0,0,5,8,5,8,6,4,4,3,3,...,5,3,3,5,4,7,5,7,3,0.445
1,1,6,7,4,4,8,8,3,5,4,...,7,2,0,3,5,3,3,4,3,0.450
2,2,6,5,6,7,3,7,1,5,4,...,7,3,7,5,6,8,2,3,3,0.530
3,3,3,4,6,5,4,8,4,7,6,...,2,4,7,4,4,6,5,7,5,0.535
4,4,5,3,2,6,4,4,3,3,3,...,2,2,6,6,4,1,2,3,5,0.415
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1117952,1117952,3,3,4,10,4,5,5,7,10,...,7,8,7,2,2,1,4,6,4,0.495
1117953,1117953,2,2,4,3,9,5,8,1,3,...,9,4,4,3,7,4,9,4,5,0.480
1117954,1117954,7,3,9,4,6,5,9,1,3,...,5,5,5,5,6,5,5,2,4,0.485
1117955,1117955,7,3,3,7,5,2,3,4,6,...,6,8,5,3,4,6,7,6,4,0.495


In [6]:
test_set

Unnamed: 0,id,MonsoonIntensity,TopographyDrainage,RiverManagement,Deforestation,Urbanization,ClimateChange,DamsQuality,Siltation,AgriculturalPractices,...,IneffectiveDisasterPreparedness,DrainageSystems,CoastalVulnerability,Landslides,Watersheds,DeterioratingInfrastructure,PopulationScore,WetlandLoss,InadequatePlanning,PoliticalFactors
0,1117957,4,6,3,5,6,7,8,7,8,...,8,5,7,5,6,3,6,4,4,5
1,1117958,4,4,2,9,5,5,4,7,5,...,2,4,7,4,5,1,7,4,4,3
2,1117959,1,3,6,5,7,2,4,6,4,...,7,9,2,5,5,2,3,6,8,3
3,1117960,2,4,4,6,4,5,4,3,4,...,7,8,4,6,7,6,4,2,4,4
4,1117961,6,3,2,4,6,4,5,5,3,...,4,3,2,6,4,6,8,4,5,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
745300,1863257,5,4,8,3,5,4,4,5,5,...,5,6,1,3,5,6,4,4,6,6
745301,1863258,4,4,2,12,4,3,4,3,5,...,3,7,4,4,3,5,5,3,5,4
745302,1863259,5,7,9,5,5,6,7,5,5,...,6,11,3,11,4,5,9,5,5,4
745303,1863260,4,7,6,3,5,2,3,8,6,...,6,6,8,6,2,3,8,7,5,5


In [7]:
len(train_set)

1117957

In [8]:
test_set_id = test_set[["id"]]

In [9]:
train_set = train_set.set_index("id")

In [10]:
test_set = test_set.set_index("id")

In [11]:
X = pd.DataFrame(train_set.drop(columns=["FloodProbability"]))
y = pd.DataFrame(train_set["FloodProbability"])

In [12]:
full_df = pd.concat([X, test_set])

In [13]:
full_df.shape

(1863262, 20)

In [14]:
columns = list(full_df.columns)
columns

['MonsoonIntensity',
 'TopographyDrainage',
 'RiverManagement',
 'Deforestation',
 'Urbanization',
 'ClimateChange',
 'DamsQuality',
 'Siltation',
 'AgriculturalPractices',
 'Encroachments',
 'IneffectiveDisasterPreparedness',
 'DrainageSystems',
 'CoastalVulnerability',
 'Landslides',
 'Watersheds',
 'DeterioratingInfrastructure',
 'PopulationScore',
 'WetlandLoss',
 'InadequatePlanning',
 'PoliticalFactors']

In [15]:
# OH_encoder  = OneHotEncoder()

# OH_encoder.fit_transform(full_df)


In [16]:
full_df_dum = pd.get_dummies(full_df, columns = columns)
full_df = full_df.join(full_df_dum)

In [17]:
full_df_columns = full_df.columns
full_df_columns

Index(['MonsoonIntensity', 'TopographyDrainage', 'RiverManagement',
       'Deforestation', 'Urbanization', 'ClimateChange', 'DamsQuality',
       'Siltation', 'AgriculturalPractices', 'Encroachments',
       ...
       'PoliticalFactors_7', 'PoliticalFactors_8', 'PoliticalFactors_9',
       'PoliticalFactors_10', 'PoliticalFactors_11', 'PoliticalFactors_12',
       'PoliticalFactors_13', 'PoliticalFactors_14', 'PoliticalFactors_15',
       'PoliticalFactors_16'],
      dtype='object', length=377)

In [18]:
full_df

Unnamed: 0_level_0,MonsoonIntensity,TopographyDrainage,RiverManagement,Deforestation,Urbanization,ClimateChange,DamsQuality,Siltation,AgriculturalPractices,Encroachments,...,PoliticalFactors_7,PoliticalFactors_8,PoliticalFactors_9,PoliticalFactors_10,PoliticalFactors_11,PoliticalFactors_12,PoliticalFactors_13,PoliticalFactors_14,PoliticalFactors_15,PoliticalFactors_16
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,5,8,5,8,6,4,4,3,3,4,...,False,False,False,False,False,False,False,False,False,False
1,6,7,4,4,8,8,3,5,4,6,...,False,False,False,False,False,False,False,False,False,False
2,6,5,6,7,3,7,1,5,4,5,...,False,False,False,False,False,False,False,False,False,False
3,3,4,6,5,4,8,4,7,6,8,...,False,False,False,False,False,False,False,False,False,False
4,5,3,2,6,4,4,3,3,3,3,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1863257,5,4,8,3,5,4,4,5,5,5,...,False,False,False,False,False,False,False,False,False,False
1863258,4,4,2,12,4,3,4,3,5,5,...,False,False,False,False,False,False,False,False,False,False
1863259,5,7,9,5,5,6,7,5,5,3,...,False,False,False,False,False,False,False,False,False,False
1863260,4,7,6,3,5,2,3,8,6,7,...,False,False,False,False,False,False,False,False,False,False


In [19]:
full_df[['MonsoonIntensity', 'TopographyDrainage', 'RiverManagement', 
          'Deforestation', 'Urbanization', 'ClimateChange', 'DamsQuality',
          'Siltation', 'AgriculturalPractices', 'Encroachments',
          'IneffectiveDisasterPreparedness', 'DrainageSystems',
          'CoastalVulnerability', 'Landslides', 'Watersheds', 
          'DeterioratingInfrastructure', 'PopulationScore', 'WetlandLoss',
           'InadequatePlanning', 'PoliticalFactors']]

Unnamed: 0_level_0,MonsoonIntensity,TopographyDrainage,RiverManagement,Deforestation,Urbanization,ClimateChange,DamsQuality,Siltation,AgriculturalPractices,Encroachments,IneffectiveDisasterPreparedness,DrainageSystems,CoastalVulnerability,Landslides,Watersheds,DeterioratingInfrastructure,PopulationScore,WetlandLoss,InadequatePlanning,PoliticalFactors
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0,5,8,5,8,6,4,4,3,3,4,2,5,3,3,5,4,7,5,7,3
1,6,7,4,4,8,8,3,5,4,6,9,7,2,0,3,5,3,3,4,3
2,6,5,6,7,3,7,1,5,4,5,6,7,3,7,5,6,8,2,3,3
3,3,4,6,5,4,8,4,7,6,8,5,2,4,7,4,4,6,5,7,5
4,5,3,2,6,4,4,3,3,3,3,5,2,2,6,6,4,1,2,3,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1863257,5,4,8,3,5,4,4,5,5,5,5,6,1,3,5,6,4,4,6,6
1863258,4,4,2,12,4,3,4,3,5,5,3,7,4,4,3,5,5,3,5,4
1863259,5,7,9,5,5,6,7,5,5,3,6,11,3,11,4,5,9,5,5,4
1863260,4,7,6,3,5,2,3,8,6,7,6,6,8,6,2,3,8,7,5,5


In [20]:
scaler = StandardScaler()
full_df[['MonsoonIntensity', 'TopographyDrainage', 'RiverManagement', 
                              'Deforestation', 'Urbanization', 'ClimateChange', 'DamsQuality',
                              'Siltation', 'AgriculturalPractices', 'Encroachments',
                              'IneffectiveDisasterPreparedness', 'DrainageSystems',
                              'CoastalVulnerability', 'Landslides', 'Watersheds', 
                              'DeterioratingInfrastructure', 'PopulationScore', 'WetlandLoss',
                              'InadequatePlanning', 'PoliticalFactors']] = scaler.fit_transform(full_df[['MonsoonIntensity', 'TopographyDrainage', 'RiverManagement', 
                                                                                                         'Deforestation', 'Urbanization', 'ClimateChange', 'DamsQuality',
                                                                                                         'Siltation', 'AgriculturalPractices', 'Encroachments',
                                                                                                         'IneffectiveDisasterPreparedness', 'DrainageSystems',
                                                                                                         'CoastalVulnerability', 'Landslides', 'Watersheds', 
                                                                                                         'DeterioratingInfrastructure', 'PopulationScore', 'WetlandLoss',
                                                                                                         'InadequatePlanning', 'PoliticalFactors']])

In [21]:
full_df = pd.DataFrame(full_df, columns = full_df_columns)

In [22]:
full_df

Unnamed: 0_level_0,MonsoonIntensity,TopographyDrainage,RiverManagement,Deforestation,Urbanization,ClimateChange,DamsQuality,Siltation,AgriculturalPractices,Encroachments,...,PoliticalFactors_7,PoliticalFactors_8,PoliticalFactors_9,PoliticalFactors_10,PoliticalFactors_11,PoliticalFactors_12,PoliticalFactors_13,PoliticalFactors_14,PoliticalFactors_15,PoliticalFactors_16
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.039335,1.467011,0.020654,1.489348,0.508518,-0.453698,-0.458827,-0.932697,-0.938761,-0.456037,...,False,False,False,False,False,False,False,False,False,False
1,0.525633,0.989450,-0.461971,-0.459918,1.468781,1.489614,-0.938314,0.034964,-0.455784,0.504258,...,False,False,False,False,False,False,False,False,False,False
2,0.525633,0.034328,0.503280,1.002031,-0.931878,1.003786,-1.897287,0.034964,-0.455784,0.024110,...,False,False,False,False,False,False,False,False,False,False
3,-0.933262,-0.443233,0.503280,0.027398,-0.451746,1.489614,-0.458827,1.002626,0.510172,1.464552,...,False,False,False,False,False,False,False,False,False,False
4,0.039335,-0.920793,-1.427222,0.514715,-0.451746,-0.453698,-0.938314,-0.932697,-0.938761,-0.936185,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1863257,0.039335,-0.443233,1.468530,-0.947235,0.028386,-0.453698,-0.458827,0.034964,0.027194,0.024110,...,False,False,False,False,False,False,False,False,False,False
1863258,-0.446964,-0.443233,-1.427222,3.438614,-0.451746,-0.939526,-0.458827,-0.932697,0.027194,0.024110,...,False,False,False,False,False,False,False,False,False,False
1863259,0.039335,0.989450,1.951156,0.027398,0.028386,0.517958,0.979632,0.034964,0.027194,-0.936185,...,False,False,False,False,False,False,False,False,False,False
1863260,-0.446964,0.989450,0.503280,-0.947235,0.028386,-1.425354,-0.938314,1.486457,0.510172,0.984405,...,False,False,False,False,False,False,False,False,False,False


In [23]:
X = full_df[0:1117957]

In [24]:
test_set = full_df[1117957:]

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)

### VAL

In [None]:
linear = LinearRegression()

linear.fit(X_train,y_train)
print(f"Training score {linear.score(X_train,y_train)}")
y_pred = linear.predict(X_test)
print(f"R2: {r2_score(y_test,y_pred)}")

In [None]:
cat_model = CatBoostRegressor()
cat_model.fit(X_train, y_train)

y_pred = cat_model.predict(X_test)
print(f"Training score {cat_model.score(X_train,y_train)}")
print(f"R2: {r2_score(y_test,y_pred)}")
# Training score 0.849059370869179
# R2: 0.8458341178802313
# before scaling
# after scaling
# Training score 0.849059370869179
# R2: 0.8458341178802313

In [None]:
xg_model = XGBRegressor()
xg_model.fit(X_train, y_train)

y_pred = xg_model.predict(X_test)
print(f"Training score {xg_model.score(X_train,y_train)}")
print(f"R2: {r2_score(y_test,y_pred)}")

In [None]:
svc_model = SVR()
svc_model.fit(X_train, y_train)

y_pred = svc_model.predict(X_test)

print(f"Training score {svc_model.score(X_train,y_train)}")
print(f"R2: {r2_score(y_test,y_pred)}")

In [None]:
knn_model = KNeighborsRegressor()
knn_model.fit(X_train, y_train)

y_pred = knn_model.predict(X_test)

print(f"Training score {knn_model.score(X_train,y_train)}")
print(f"R2: {r2_score(y_test,y_pred)}")

In [None]:
dt_model = DecisionTreeRegressor()
dt_model.fit(X_train, y_train)

y_pred = dt_model.predict(X_test)

print(f"Training score {dt_model.score(X_train,y_train)}")
print(f"R2: {r2_score(y_test,y_pred)}")

In [None]:
rf_model = RandomForestRegressor()
rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_test)
print(f"Training score {rf_model.score(X_train,y_train)}")
print(f"R2: {r2_score(y_test,y_pred)}")

In [None]:
bc_model = BaggingRegressor(estimator=rf_model, n_estimators=100)
bc_model.fit(X_train, y_train)

y_pred = bc_model.predict(X_test)
print(f"Training score {bc_model.score(X_train,y_train)}")
print(f"R2: {r2_score(y_test,y_pred)}")

In [None]:
ET_model = ExtraTreesRegressor()
ET_model.fit(X_train, y_train)

y_pred = ET_model.predict(X_test)
print(f"Training score {ET_model.score(X_train,y_train)}")
print(f"R2: {r2_score(y_test,y_pred)}")

In [None]:
ABC_model = AdaBoostRegressor(estimator=bc_model)
ABC_model.fit(X_train, y_train)

y_pred = ABC_model.predict(X_test)
print(f"Training score {ABC_model.score(X_train,y_train)}")
print(f"R2: {r2_score(y_test,y_pred)}")

In [None]:
vc_model = VotingRegressor()
vc_model.fit(X_train, y_train)

y_pred = vc_model.predict(X_test)
print(f"Training score {vc_model.score(X_train,y_train)}")
print(f"R2: {r2_score(y_test,y_pred)}")

In [None]:
SC_model = StackingRegressor(estimators=)
SC_model.fit(X_train, y_train)

y_pred = SC_model.predict(X_test)
print(f"Training score {SC_model.score(X_train,y_train)}")
print(f"R2: {r2_score(y_test,y_pred)}")

In [None]:
LG_model = LGBMRegressor()
LG_model.fit(X_train, y_train)

y_pred = LG_model.predict(X_test)
print(f"Training score {LG_model.score(X_train,y_train)}")
print(f"R2: {r2_score(y_test,y_pred)}")


### OG Test

In [26]:
cat_model = CatBoostRegressor()
cat_model.fit(X, y)

y_pred = cat_model.predict(test_set)
print(f"Training score {cat_model.score(X,y)}")

Learning rate set to 0.124117
0:	learn: 0.0501348	total: 289ms	remaining: 4m 48s
1:	learn: 0.0492474	total: 514ms	remaining: 4m 16s
2:	learn: 0.0483581	total: 732ms	remaining: 4m 3s
3:	learn: 0.0475541	total: 948ms	remaining: 3m 56s
4:	learn: 0.0468042	total: 1.16s	remaining: 3m 51s
5:	learn: 0.0460497	total: 1.36s	remaining: 3m 45s
6:	learn: 0.0453285	total: 1.57s	remaining: 3m 42s
7:	learn: 0.0446567	total: 1.77s	remaining: 3m 39s
8:	learn: 0.0439907	total: 2s	remaining: 3m 39s
9:	learn: 0.0433295	total: 2.21s	remaining: 3m 38s
10:	learn: 0.0427508	total: 2.38s	remaining: 3m 34s
11:	learn: 0.0421778	total: 2.57s	remaining: 3m 31s
12:	learn: 0.0416091	total: 2.74s	remaining: 3m 28s
13:	learn: 0.0410653	total: 2.93s	remaining: 3m 26s
14:	learn: 0.0405244	total: 3.1s	remaining: 3m 23s
15:	learn: 0.0399932	total: 3.27s	remaining: 3m 20s
16:	learn: 0.0394783	total: 3.43s	remaining: 3m 18s
17:	learn: 0.0390010	total: 3.62s	remaining: 3m 17s
18:	learn: 0.0385237	total: 3.78s	remaining: 3m 1

In [28]:
y_pred = pd.DataFrame(y_pred)

In [36]:
output = test_set_id

In [37]:
output["FloodProbability"] = pd.DataFrame(y_pred)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  output["FloodProbability"] = pd.DataFrame(y_pred)


In [38]:
output

Unnamed: 0,id,FloodProbability
0,1117957,0.574155
1,1117958,0.455757
2,1117959,0.455137
3,1117960,0.465465
4,1117961,0.464750
...,...,...
745300,1863257,0.476583
745301,1863258,0.451191
745302,1863259,0.626610
745303,1863260,0.551336


In [40]:
output.to_csv("FloodProbability_OHE.csv")