In [None]:

#importing relevant libraries
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
import geopandas as gpd

In [None]:
# Seed NumPy's legacy global RNG so that later draws from it are repeatable across runs
# (e.g. the scipy `.rvs(15)` call in the hyperparameter cell, which is given no random_state of its own).
np.random.seed(5)

In [None]:

# Directory holding the input .gpkg files. File names are appended to it with plain string
# concatenation in the loading cell, so a non-empty value must end with a path separator;
# the empty string means the files are read relative to the current working directory.
data_path = ''

In [None]:

# Load the four GeoPackage layers used by this notebook.

# Labelled training geometries and the test geometries to predict on
train = gpd.read_file(data_path + 'Train.gpkg')
test = gpd.read_file(data_path + 'Test.gpkg')

# Auxiliary layers that are spatially joined onto train/test further down
gfaults = gpd.read_file(data_path + 'geological_faults.gpkg')
lulc = gpd.read_file(data_path + 'land_use_land_cover.gpkg')

In [None]:

# Store each training geometry's planar area as a feature.
# NOTE(review): `.area` is expressed in squared CRS units, so this presumes a projected CRS — confirm.
train['area'] = train.geometry.area

In [None]:

# Area of a 116-unit buffer drawn around each test geometry (units are those of the layer's CRS).
# The rows displayed further down all show the same value, as expected when every input is a point.
# NOTE(review): 116 is a magic number; presumably chosen so the buffer is comparable in size
# to the training geometries — confirm and consider naming it as a constant.
test['area'] = test.geometry.buffer(116).area

In [None]:

# Replace the test geometries with their 116-unit buffers so they are polygons for the spatial joins below.
# Caution: this overwrites `test.geometry`, so re-running the cell buffers the already-buffered
# shapes again; re-run from the loading cell instead.
test.geometry = test.geometry.buffer(116)

In [None]:

# Inspect the test frame after buffering: `geometry` now holds polygons and `area` the buffer area.
test

Unnamed: 0,ID,geometry,area
0,ID_000001,"POLYGON ((541978.336 5103652.266, 541977.777 5...",42205.396489
1,ID_000002,"POLYGON ((566572.496 5131798.978, 566571.937 5...",42205.396489
2,ID_000003,"POLYGON ((584714.972 5109016.391, 584714.413 5...",42205.396489
3,ID_000004,"POLYGON ((542530.162 5125941.301, 542529.604 5...",42205.396489
4,ID_000005,"POLYGON ((532215.144 5133370.588, 532214.585 5...",42205.396489
...,...,...,...
39995,ID_039996,"POLYGON ((605625.423 5146469.307, 605624.864 5...",42205.396489
39996,ID_039997,"POLYGON ((526931.199 5147832.968, 526930.640 5...",42205.396489
39997,ID_039998,"POLYGON ((526989.190 5147470.852, 526988.631 5...",42205.396489
39998,ID_039999,"POLYGON ((569623.854 5101121.584, 569623.295 5...",42205.396489


In [None]:

# Left spatial join: attach the attributes of every geological fault matched to a buffered test area.
# A test area that matches several faults yields several rows; areas without a match are kept
# with missing fault attributes.
# NOTE(review): no predicate is passed, so GeoPandas' default (intersects) applies — confirm for the installed version.
test_gfaults = gpd.sjoin(test,gfaults,how='left')

In [None]:


# Left spatial join: attach the attributes of every geological fault matched to a training geometry.
# A geometry that matches several faults yields several rows; geometries without a match are kept
# with missing fault attributes.
# NOTE(review): no predicate is passed, so GeoPandas' default (intersects) applies — confirm for the installed version.
train_gfaults = gpd.sjoin(train,gfaults,how='left')

In [None]:

# Planar area of every land-use/land-cover polygon.
# This is the area of the whole polygon, not of its overlap with any train/test geometry.
lulc['lulc_area'] = lulc.geometry.area

In [None]:


# Left spatial join: attach the attributes (including `lulc_area`) of every land-use/land-cover
# polygon matched to a training geometry. One output row per (geometry, matched polygon) pair;
# geometries without a match are kept with missing LULC attributes.
train_lulc = gpd.sjoin(train,lulc,how='left')

In [None]:

# Left spatial join: attach the attributes (including `lulc_area`) of every land-use/land-cover
# polygon matched to a buffered test area. One output row per (area, matched polygon) pair;
# areas without a match are kept with missing LULC attributes.
test_lulc = gpd.sjoin(test,lulc,how='left')

In [None]:

# One-hot encode the four categorical fault attributes of the train/fault join.
# Every resulting frame shares the row index of `train_gfaults`; a row whose category is
# missing gets zeros in all of that attribute's columns.
(
    traingfaults_tipoel,
    traingfaults_dtipoel,
    traingfaults_tipofag,
    traingfaults_dtipofag,
) = (
    pd.get_dummies(train_gfaults[source_col], prefix=dummy_prefix)
    for source_col, dummy_prefix in (
        ('TIPO_EL', 'tipoel'),
        ('DTIPO_EL', 'dtipoel'),
        ('TIPO_FAGLI', 'tipofag'),
        ('DTIPO_FAGL', 'dtipofag'),
    )
)

In [None]:


# One-hot encode the four categorical fault attributes of the test/fault join.
# Every resulting frame shares the row index of `test_gfaults`; a row whose category is
# missing gets zeros in all of that attribute's columns.
(
    testgfaults_tipoel,
    testgfaults_dtipoel,
    testgfaults_tipofag,
    testgfaults_dtipofag,
) = (
    pd.get_dummies(test_gfaults[source_col], prefix=dummy_prefix)
    for source_col, dummy_prefix in (
        ('TIPO_EL', 'tipoel'),
        ('DTIPO_EL', 'dtipoel'),
        ('TIPO_FAGLI', 'tipofag'),
        ('DTIPO_FAGL', 'dtipofag'),
    )
)

In [None]:


# One-hot encode the '2-CODICE' column of the test/LULC join; the new columns are named cod_<value>.
# NOTE(review): train and test are encoded independently, so a value that is absent from one
# of them produces no column there — later cells assume both have the same cod_* columns.
test_lulc_codice = pd.get_dummies(test_lulc['2-CODICE'],prefix='cod')

In [None]:

# One-hot encode the '2-CODICE' column of the train/LULC join; the new columns are named cod_<value>.
# NOTE(review): train and test are encoded independently, so a value that is absent from one
# of them produces no column there — later cells assume both have the same cod_* columns.
train_lulc_codice = pd.get_dummies(train_lulc['2-CODICE'],prefix='cod')

In [None]:

# Append the one-hot fault columns to the test/fault join, side by side.
# All pieces were derived from `test_gfaults`, so they share its row index.
full_testgfault = pd.concat(
    [
        test_gfaults,
        testgfaults_tipoel,
        testgfaults_dtipoel,
        testgfaults_tipofag,
        testgfaults_dtipofag,
    ],
    axis='columns',
)

In [None]:

# Append the one-hot fault columns to the train/fault join, side by side.
# All pieces were derived from `train_gfaults`, so they share its row index.
full_traingfault = pd.concat(
    [
        train_gfaults,
        traingfaults_tipoel,
        traingfaults_dtipoel,
        traingfaults_tipofag,
        traingfaults_dtipofag,
    ],
    axis='columns',
)

In [None]:

# Append the cod_* one-hot columns to the test/LULC join (column-wise; both frames share the same row index).
full_testlulc = pd.concat([test_lulc,test_lulc_codice],axis=1)

In [None]:

# Append the cod_* one-hot columns to the train/LULC join (column-wise; both frames share the same row index).
full_trainlulc = pd.concat([train_lulc,train_lulc_codice],axis=1)

In [None]:

# For every LULC code create area_<code>: the joined polygon's `lulc_area` on rows whose
# one-hot flag cod_<code> is set, and 0 on every other row.
# Note that `lulc_area` is the area of the whole LULC polygon, not of its overlap with the train geometry.
lulc_codes = ['31', '23', '32', '33', '22', '11', '21', '41', '51', '12', '14', '13']
for code in lulc_codes:
    is_code = full_trainlulc[f'cod_{code}'] == 1
    full_trainlulc[f'area_{code}'] = np.where(is_code, full_trainlulc['lulc_area'], 0)

In [None]:

# For every LULC code create area_<code>: the joined polygon's `lulc_area` on rows whose
# one-hot flag cod_<code> is set, and 0 on every other row.
# Note that `lulc_area` is the area of the whole LULC polygon, not of its overlap with the test area.
lulc_codes = ['31', '23', '32', '33', '22', '11', '21', '41', '51', '12', '14', '13']
for code in lulc_codes:
    is_code = full_testlulc[f'cod_{code}'] == 1
    full_testlulc[f'area_{code}'] = np.where(is_code, full_testlulc['lulc_area'], 0)

In [None]:

# Collapse the one-row-per-matched-fault frames to a single row per 'ID' by summing the numeric
# columns, so the one-hot flags become per-ID counts.
# `numeric_only=True` is passed explicitly: the geometry and text columns are not meant to be summed,
# and relying on pandas to drop them implicitly is deprecated (newer pandas versions try to sum
# every column instead).
# Things to be aware of downstream:
#   - 'area' (and 'Target' for train) are summed as well and are restored in later cells;
#   - the result is ordered by sorted 'ID', not by the original row order of train/test.
set_testgfault = full_testgfault.groupby('ID').sum(numeric_only=True).reset_index()
set_traingfault = full_traingfault.groupby('ID').sum(numeric_only=True).reset_index()

  set_testgfault = full_testgfault.groupby('ID').sum().reset_index()
  set_traingfault = full_traingfault.groupby('ID').sum().reset_index()


In [None]:

# Collapse the one-row-per-matched-LULC-polygon frames to a single row per 'ID' by summing the
# numeric columns: cod_<code> becomes the number of matched polygons of that code and
# area_<code> the total `lulc_area` of those polygons.
# `numeric_only=True` is passed explicitly: the geometry and text columns are not meant to be summed,
# and relying on pandas to drop them implicitly is deprecated (newer pandas versions try to sum
# every column instead).
# Things to be aware of downstream:
#   - 'area' (and 'Target' for train) are summed as well and are dropped/restored in later cells;
#   - the result is ordered by sorted 'ID', not by the original row order of train/test.
set_testlulc = full_testlulc.groupby('ID').sum(numeric_only=True).reset_index()
set_trainlulc = full_trainlulc.groupby('ID').sum(numeric_only=True).reset_index()

  set_testlulc = full_testlulc.groupby('ID').sum().reset_index()
  set_trainlulc = full_trainlulc.groupby('ID').sum().reset_index()


In [None]:

# Build one feature row per 'ID' by placing the per-ID LULC aggregates next to the per-ID fault aggregates.
# From the LULC side the identifier, the summed 'area' (and 'Target' for train) and the join
# bookkeeping column 'index_right' are dropped; the fault side keeps 'ID', 'area' (and 'Target'),
# so each of those appears exactly once in the result.
# The concat is column-wise on the 0..n-1 index produced by reset_index(), i.e. row i of one frame
# is paired with row i of the other; this relies on both group-bys yielding the same IDs in the same order.
final_test = pd.concat([set_testlulc.drop(['index_right','ID','area'],axis=1),set_testgfault.drop('index_right',axis=1)],axis=1)
final_train = pd.concat([set_trainlulc.drop(['ID','Target','index_right','area'],axis=1),set_traingfault.drop('index_right',axis=1)],axis=1)

In [None]:

# The group-by summed 'area' once per joined row, so restore the true per-geometry value.
# The aggregated frames are ordered by sorted 'ID' (group-by order), which need not match the row
# order of `train`/`test`; the value is therefore looked up by 'ID' instead of relying on the two
# frames happening to share the same positional index.
# drop_duplicates keeps the lookup index unique, which Series.map requires.
final_train['area'] = final_train['ID'].map(train.drop_duplicates('ID').set_index('ID')['area'])
final_test['area'] = final_test['ID'].map(test.drop_duplicates('ID').set_index('ID')['area'])

In [None]:
# The group-by summed 'Target' once per joined row, so restore the original label.
# The aggregated frame is ordered by sorted 'ID' (group-by order), which need not match the row
# order of `train`; the label is therefore looked up by 'ID' rather than by positional index,
# so that no feature row can be paired with another geometry's label.
# drop_duplicates keeps the lookup index unique, which Series.map requires.
final_train['Target'] = final_train['ID'].map(train.drop_duplicates('ID').set_index('ID')['Target'])

In [None]:
# Split the modelling frame into the label vector and the feature matrix.
# 'ID' is an identifier rather than a predictor, so it is removed together with the label.
y = final_train['Target']
X = final_train.drop(columns=['ID', 'Target'])

In [None]:

# Hold out 20% of the rows as a validation set, stratified on the label so both parts keep the
# same class proportions; random_state fixes the split.
# Note: despite their names, X_test / y_test are this validation split, not the competition test data.
X_train, X_test, y_train,y_test = train_test_split(X,y,test_size=0.2,stratify=y,random_state=5)

In [None]:

# SMOTE oversampler with a fixed seed so the generated synthetic samples are reproducible.
smt = SMOTE(random_state=5)

In [None]:

# Oversample the training split only (the validation split is left untouched) to balance the target classes.
# NOTE(review): the balanced data is later passed to a cross-validated search, so synthetic samples
# can end up in the CV validation folds and make the CV score optimistic; running SMOTE inside an
# imblearn Pipeline would resample within each training fold only.
bX,by = smt.fit_resample(X_train,y_train)

In [None]:

# Search space sampled by RandomizedSearchCV. Keys follow the '<pipeline step>__<parameter>' convention.
# scipy's randint(low, high) draws integers from low up to high - 1, and uniform(loc, scale)
# draws floats from [loc, loc + scale].
from scipy.stats import randint, uniform

param_dist = {
    # number of trees: integers 30..500
    'classifier__n_estimators': randint(30, 501),
    # None (no depth limit) plus 15 depths from 1..50, drawn once when this cell runs
    # (the draw uses NumPy's global random state, as no random_state is given)
    'classifier__max_depth': [None, *randint(1, 51).rvs(15)],
    # minimum samples needed to split a node: integers 2..30
    'classifier__min_samples_split': randint(2, 31),
    # minimum samples per leaf: integers 1..10
    'classifier__min_samples_leaf': randint(1, 11),
    # fraction of features considered at each split: floats in [0.1, 1.0]
    'classifier__max_features': uniform(0.1, 0.9),
    # whether each tree is grown on a bootstrap sample
    'classifier__bootstrap': [True, False],
    # number of features kept by SelectKBest: integers 5..37
    'feature_selection__k': randint(5, 38),
}

In [None]:
# Modelling pipeline: standardise the features -> keep the k best features (SelectKBest with its
# default score function) -> random forest classifier.
# k=33 is only the starting value; the search below samples 'feature_selection__k' and overrides it.
pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('feature_selection', SelectKBest(k=33)),
        ('classifier', RandomForestClassifier(
          n_jobs=-1,

          random_state=2,
        ))
    ])

# Randomised hyperparameter search: 30 configurations sampled from `param_dist`, each scored with
# 10-fold cross-validation. error_score='raise' makes a failing fit abort the search instead of
# being recorded as a missing score.
search = RandomizedSearchCV(pipe,param_dist,cv=10,error_score='raise',random_state=3,n_iter=30)

In [None]:

# Run the hyperparameter search on the SMOTE-balanced training split.
# This is the expensive cell: 30 sampled configurations x 10 folds, plus a final refit of the best one.
search.fit(bX,by)

In [None]:

# Mean cross-validated score of the best configuration found by the search.
# It is computed on the SMOTE-balanced data, so it is not a plain training score and is not
# directly comparable with the accuracy on the untouched validation split below.
search.best_score_

0.9346013832237743

In [None]:

# Accuracy of the best pipeline found by the search on the untouched validation split.
# scikit-learn's signature is accuracy_score(y_true, y_pred), so the true labels go first;
# accuracy itself is symmetric, so the reported value is unchanged.
acc = accuracy_score(y_test, search.predict(X_test))
acc

0.886326194398682

In [None]:

# Predict on the competition test features and assemble the submission frame.
#  - reindex(...) selects the training feature columns in training order; a one-hot column whose
#    category never occurred in the test data is created as all zeros instead of raising a KeyError.
#  - 'ID' is taken from `final_test` itself so that every prediction is paired with the row it was
#    computed from (final_test follows the group-by order, which need not be the row order of `test`).
#  - assign(...) builds a new frame, which avoids the SettingWithCopyWarning raised when a column
#    is added to a slice of `test`.
submission = search.predict(final_test.reindex(columns=X.columns, fill_value=0))
subdf = final_test[['ID']].assign(Target=submission)
subdf

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subdf['Target'] = submission


Unnamed: 0,ID,Target
0,ID_000001,1
1,ID_000002,1
2,ID_000003,1
3,ID_000004,1
4,ID_000005,1
...,...,...
39995,ID_039996,0
39996,ID_039997,0
39997,ID_039998,0
39998,ID_039999,1


In [None]:

# Write the submission (columns 'ID' and 'Target') to Submission.csv in the current working
# directory, without the DataFrame index column.
subdf.to_csv('Submission.csv',index=False)