In [None]:
import pandas as pd
import numpy as np
import geopandas as gpd

import matplotlib.pyplot as plt
import rasterio
from rasterio.plot import show
from shapely.ops import nearest_points


In [None]:
# Load the tabular train / test files and report their (rows, columns) shapes.
tr_df = pd.read_csv('/kaggle/input/landslide/Train.csv')
te_df = pd.read_csv('/kaggle/input/landslide/Test.csv')
for frame in (tr_df, te_df):
    print(frame.shape)

(12140, 2)
(40000, 1)


In [None]:
# Read the GeoPackage vector layers into GeoDataFrames.

# Auxiliary layers (rivers, roads, geological faults, land use / land cover, study area).
river_net_gdf = gpd.read_file('/kaggle/input/landslide/datasets/river_network.gpkg')
road_net_gdf = gpd.read_file('/kaggle/input/landslide/datasets/road_network.gpkg')
geo_faults_gdf = gpd.read_file('/kaggle/input/landslide/datasets/geological_faults.gpkg')
lulc_gdf = gpd.read_file('/kaggle/input/landslide/datasets/land_use_land_cover.gpkg')
valtellina_gdf = gpd.read_file('/kaggle/input/landslide/datasets/valtellina.gpkg')

# Train / test sample geometries.
tr_gdf = gpd.read_file('/kaggle/input/landslide/datasets/Train.gpkg')
te_gdf = gpd.read_file('/kaggle/input/landslide/datasets/Test.gpkg')

In [None]:
# Open the raster layers (average precipitation, 90th-percentile precipitation, DTM).
# The dataset handles are intentionally left open: later cells sample values from them.
# The first path previously started with '//kaggle'; it is normalised to a single
# leading slash so all dataset paths are spelled consistently.
avg_precip = rasterio.open('/kaggle/input/landslide/datasets/average_precipitation_2020.tif')
perc_precip = rasterio.open('/kaggle/input/landslide/datasets/90_perc_precipitation_2020.tif')
dtm = rasterio.open('/kaggle/input/landslide/datasets/dtm.tif')

In [None]:
import rasterio
from rasterio.mask import mask
from shapely.geometry import shape, Point
import random

# Seed Python's `random` module so the random point sampling done with
# `random.uniform` further down produces the same points on every fresh run.
random.seed(2023)

In [None]:
# Function to generate random points within a training polygon - extract one point for each polygon
def generate_points_within_polygon(polygon, num_points, polygon_id, max_attempts=10000):
    """Draw random points that lie inside ``polygon`` by rejection sampling.

    Candidates are drawn uniformly from the polygon's bounding box with the
    module-level ``random`` generator and kept only when they fall within the
    polygon.

    Parameters
    ----------
    polygon : shapely geometry
        Geometry to sample from.
    num_points : int
        Number of points to return.
    polygon_id :
        Identifier copied into the ``'ID'`` field of every returned record.
    max_attempts : int, default 10000
        Maximum number of *consecutive* rejected candidates tolerated for one
        point. Once exceeded, ``polygon.representative_point()`` is used for
        that point instead, so degenerate (zero-area or extremely thin)
        geometries can no longer make the loop spin forever.

    Returns
    -------
    list of dict
        ``num_points`` records of the form ``{'ID': polygon_id, 'geometry': Point}``.

    Raises
    ------
    ValueError
        If ``polygon`` is empty (it has no bounding box to sample from).
    """
    if polygon.is_empty:
        raise ValueError(f'Cannot sample points from empty geometry (ID={polygon_id!r})')

    minx, miny, maxx, maxy = polygon.bounds
    points = []
    consecutive_misses = 0
    while len(points) < num_points:
        # x is drawn before y, matching the original draw order so seeded runs
        # reproduce the same points for well-formed polygons.
        candidate = Point(random.uniform(minx, maxx), random.uniform(miny, maxy))
        if candidate.within(polygon):
            points.append({'ID': polygon_id, 'geometry': candidate})
            consecutive_misses = 0
            continue

        consecutive_misses += 1
        if consecutive_misses >= max_attempts:
            # Deterministic fallback: give up on random sampling for this point.
            points.append({'ID': polygon_id, 'geometry': polygon.representative_point()})
            consecutive_misses = 0
    return points


# Sample random interior point(s) for every training polygon and collect the
# resulting {'ID', 'geometry'} records into a single GeoDataFrame.
num_points = 1  # Adjust the number of points as needed
all_points = [
    sampled_point
    for polygon_id, polygon_geometry in zip(tr_gdf['ID'], tr_gdf['geometry'])
    for sampled_point in generate_points_within_polygon(
        shape(polygon_geometry), num_points, polygon_id
    )
]

tr_points_gdf = gpd.GeoDataFrame(all_points, geometry='geometry')


In [None]:
# Sample the first band of each raster (average_precipitation_2020,
# 90_perc_precipitation_2020, DTM) at every training point and every testing
# point, storing the values in one new column per raster.
raster_by_column = {
    'avg_precip': avg_precip,
    'perc_precip': perc_precip,
    'dtm': dtm,
}

with rasterio.Env():
    for points_frame in (tr_points_gdf, te_gdf):
        # Materialise the (x, y) pairs once so they can be reused for every raster.
        xy_pairs = list(zip(points_frame['geometry'].x, points_frame['geometry'].y))
        for column_name, raster in raster_by_column.items():
            points_frame[column_name] = [
                band_values[0] for band_values in raster.sample(xy_pairs)
            ]

In [None]:
# Merge the extracted training points with tr_df on 'ID' so that all training
# data lives in one dataframe. The join is an inner join, so only IDs present
# in both frames are kept.
tr_points_gdf = pd.merge(tr_points_gdf, tr_df, on=['ID'], how='inner')
# The points GeoDataFrame was constructed without a CRS, so one is declared here.
# NOTE(review): EPSG:32632 is hard-coded and assumed to match the CRS of the
# source layers - confirm against tr_gdf.crs.
tr_points_gdf.crs = 'EPSG:32632'

In [None]:
# Compute the nearest distance from each point to each river segment.
#
# The previous implementation looped over every (river, point) pair in Python,
# wrote each value with `.at`, and inserted one column at a time, which was very
# slow and triggered pandas' DataFrame-fragmentation PerformanceWarning.
# `GeoSeries.distance` already returns the minimum distance between each point
# and the segment, so the explicit `nearest_points` step is unnecessary.
def add_river_distances(points_gdf, rivers_gdf):
    """Return ``points_gdf`` with one ``distance_to_river<Objectid>`` column per river row.

    Parameters
    ----------
    points_gdf : GeoDataFrame
        Frame whose ``'geometry'`` column holds the sample points.
    rivers_gdf : GeoDataFrame
        Frame with an ``'Objectid'`` column and a ``'geometry'`` column of river segments.

    Returns
    -------
    A new frame (``points_gdf`` itself is not modified) containing the original
    columns followed by float distance columns. Distances are expressed in the
    units of the geometries' coordinates.
    NOTE(review): both layers are assumed to share the same projected CRS - confirm.
    """
    point_geometries = gpd.GeoSeries(points_gdf['geometry'])

    # Build every distance column first, then attach them in a single concat so
    # the result is not fragmented. Duplicate Objectid values keep the last
    # segment's distances, as before.
    distance_columns = {
        f'distance_to_river{river_id}': point_geometries.distance(river_segment)
        for river_id, river_segment in zip(rivers_gdf['Objectid'], rivers_gdf['geometry'])
    }
    distances = pd.DataFrame(distance_columns, index=points_gdf.index)

    # Drop columns left over from a previous run of this cell so re-running
    # overwrites them instead of duplicating column names.
    stale_columns = [name for name in distances.columns if name in points_gdf.columns]
    return pd.concat([points_gdf.drop(columns=stale_columns), distances], axis=1)


## Training points
tr_points_gdf = add_river_distances(tr_points_gdf, river_net_gdf)

## Testing points
te_gdf = add_river_distances(te_gdf, river_net_gdf)

  super().__setitem__(key, value)
  super().__setitem__(key, value)
  super().__setitem__(key, value)
  super().__setitem__(key, value)
  super().__setitem__(key, value)
  super().__setitem__(key, value)
  super().__setitem__(key, value)
  super().__setitem__(key, value)
  super().__setitem__(key, value)
  super().__setitem__(key, value)
  super().__setitem__(key, value)
  super().__setitem__(key, value)
  super().__setitem__(key, value)
  super().__setitem__(key, value)
  super().__setitem__(key, value)
  super().__setitem__(key, value)
  super().__setitem__(key, value)
  super().__setitem__(key, value)
  super().__setitem__(key, value)
  super().__setitem__(key, value)
  super().__setitem__(key, value)
  super().__setitem__(key, value)
  super().__setitem__(key, value)
  super().__setitem__(key, value)
  super().__setitem__(key, value)
  super().__setitem__(key, value)
  super().__setitem__(key, value)
  super().__setitem__(key, value)
  super().__setitem__(key, value)
  super().__se

In [None]:
# Extract the land cover information for each point using the land_use_land_cover gdf.
#
# `predicate=` replaces the deprecated `op=` keyword of `gpd.sjoin` (the old
# spelling emits a FutureWarning and is rejected by newer geopandas releases).
tr_with_landcover = gpd.sjoin(tr_points_gdf, lulc_gdf, how='left', predicate='within')
tr_with_landcover = tr_with_landcover.drop(columns=['index_right'])

te_with_landcover = gpd.sjoin(te_gdf, lulc_gdf, how='left', predicate='within')
te_with_landcover = te_with_landcover.drop(columns=['index_right'])

# A left join repeats a point once per matching polygon. If land-cover polygons
# overlap, that would duplicate samples and make the test frame longer than the
# submission file, so keep only the first match for each original point.
tr_with_landcover = tr_with_landcover[~tr_with_landcover.index.duplicated(keep='first')]
te_with_landcover = te_with_landcover[~te_with_landcover.index.duplicated(keep='first')]

  if await self.run_code(code, result, async_=asy):
  if await self.run_code(code, result, async_=asy):


In [None]:
# Encoding the extracted land use/cover.
#
# Train and test must be encoded against the SAME category list. Encoding each
# frame independently assigns integer codes from that frame's own set of
# classes, so a given code could mean different land-cover classes in train and
# test whenever one frame is missing a class the other has.
landcover_column = '2-DESCRIZIONE'

# Derive the shared categories with pandas' own category inference so that the
# codes are unchanged whenever train and test already contain the same classes.
shared_categories = (
    pd.concat(
        [tr_with_landcover[landcover_column], te_with_landcover[landcover_column]],
        ignore_index=True,
    )
    .astype('category')
    .cat.categories
)
landcover_dtype = pd.CategoricalDtype(categories=shared_categories)

# Points that matched no land-cover polygon (missing value) are encoded as -1.
tr_with_landcover['land_encoded'] = (
    tr_with_landcover[landcover_column].astype(landcover_dtype).cat.codes
)
te_with_landcover['land_encoded'] = (
    te_with_landcover[landcover_column].astype(landcover_dtype).cat.codes
)

tr_with_landcover = tr_with_landcover.drop(columns=[landcover_column])
te_with_landcover = te_with_landcover.drop(columns=[landcover_column])

### Modeling

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from catboost import CatBoostClassifier

In [None]:
# Separate the model inputs from the identifier, geometry and label columns.
y = tr_with_landcover['Target']

X1 = tr_with_landcover.drop(columns=['ID', 'geometry', 'Target'])
X_test1 = te_with_landcover.drop(columns=['ID', 'geometry'])

In [None]:
# Cast every feature column of both matrices to float.
X1, X_test1 = (features.astype(float) for features in (X1, X_test1))

In [None]:
# Stratified 5-fold cross-validation: fit one CatBoost model per fold, record
# its validation accuracy and collect its class probabilities for the test set.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2023)
catscores, catpreds = [], []

for i, (train_idx, val_idx) in enumerate(skf.split(X1, y)):
    print(f'########### Fold number {i+1} ')

    # Rows for this fold's training and validation partitions.
    x_train, y_train = X1.iloc[train_idx], y.iloc[train_idx]
    x_val, y_val = X1.iloc[val_idx], y.iloc[val_idx]

    clf = CatBoostClassifier(
        iterations=30000,
        learning_rate=0.05,
        has_time=True,
        bootstrap_type='No',
        random_strength=0,
        use_best_model=True,
        random_seed=2023,
    )
    # The validation fold doubles as the early-stopping / best-model eval set.
    clf.fit(
        x_train,
        y_train,
        eval_set=(x_val, y_val),
        verbose=500,
        early_stopping_rounds=300,
    )

    # Validation accuracy for this fold.
    score = accuracy_score(y_val, clf.predict(x_val))
    print(f'Accuracy Score: {score}')
    catscores.append(score)

    # Test-set class probabilities from this fold's model.
    catpreds.append(clf.predict_proba(X_test1))


print(f'Mean accuracy: {np.mean(catscores)}')

########### Fold number 1 
0:	learn: 0.6497458	test: 0.6508709	best: 0.6508709 (0)	total: 75.6ms	remaining: 37m 46s
500:	learn: 0.1841604	test: 0.2709119	best: 0.2709119 (500)	total: 6.97s	remaining: 6m 50s
1000:	learn: 0.1348277	test: 0.2648398	best: 0.2647349 (997)	total: 13.8s	remaining: 6m 38s
Stopped by overfitting detector  (300 iterations wait)

bestTest = 0.263896489
bestIteration = 1133

Shrink model to first 1134 iterations.
Accuracy Score: 0.9023887973640856
########### Fold number 2 
0:	learn: 0.6487569	test: 0.6484940	best: 0.6484940 (0)	total: 18.5ms	remaining: 9m 14s
500:	learn: 0.1862589	test: 0.2616308	best: 0.2616308 (500)	total: 6.93s	remaining: 6m 47s
1000:	learn: 0.1371856	test: 0.2538114	best: 0.2537624 (990)	total: 14s	remaining: 6m 46s
Stopped by overfitting detector  (300 iterations wait)

bestTest = 0.2536292159
bestIteration = 1009

Shrink model to first 1010 iterations.
Accuracy Score: 0.9069192751235585
########### Fold number 3 
0:	learn: 0.6503108	test: 0

In [None]:
# Average the per-fold class probabilities, take the most probable class index
# for each test row, and write the result into the sample submission layout.
submission = pd.read_csv('/kaggle/input/landslide/SampleSubmission.csv')

catpreds_mean = np.stack(catpreds).mean(axis=0)
predictions = catpreds_mean.argmax(axis=1)

submission['Target'] = predictions
submission.to_csv('predictions.csv', index=False)