In [1]:
pip install geopandas

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import geopandas as gpd
import rasterio
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


In [3]:
geological_faults = gpd.read_file("/Users/muhammadluay/Desktop/Zindi/GEO AI Italy/datasets/geological_faults.gpkg")
land_use = gpd.read_file("/Users/muhammadluay/Desktop/Zindi/GEO AI Italy/datasets/land_use_land_cover.gpkg")
river_network = gpd.read_file("/Users/muhammadluay/Desktop/Zindi/GEO AI Italy/datasets/river_network.gpkg")
road_network = gpd.read_file("/Users/muhammadluay/Desktop/Zindi/GEO AI Italy/datasets/road_network.gpkg")
train_data = gpd.read_file("/Users/muhammadluay/Desktop/Zindi/GEO AI Italy/datasets/Train.gpkg")
test_data = gpd.read_file("/Users/muhammadluay/Desktop/Zindi/GEO AI Italy/datasets/Test.gpkg")


In [4]:
data_train = pd.read_csv('Train.csv')

In [5]:
data_train

Unnamed: 0,ID,Target
0,ID_000001,1
1,ID_000002,1
2,ID_000003,1
3,ID_000004,1
4,ID_000005,1
...,...,...
12135,ID_012136,0
12136,ID_012137,0
12137,ID_012138,0
12138,ID_012139,0


In [6]:
data_test = pd.read_csv('Test.csv')

In [7]:
data_test

Unnamed: 0,ID
0,ID_000001
1,ID_000002
2,ID_000003
3,ID_000004
4,ID_000005
...,...
39995,ID_039996
39996,ID_039997
39997,ID_039998
39998,ID_039999


In [8]:
gdf = train_data

In [9]:
# Load raster data using Rasterio
dtm_dataset = rasterio.open('/Users/muhammadluay/Desktop/Zindi/GEO AI Italy/datasets/dtm.tif')
average_precip_2020_dataset = rasterio.open('/Users/muhammadluay/Desktop/Zindi/GEO AI Italy/datasets/average_precipitation_2020.tif')
perc_90_precip_2020_dataset = rasterio.open('/Users/muhammadluay/Desktop/Zindi/GEO AI Italy/datasets/90_perc_precipitation_2020.tif')

In [10]:
# Read data as 2D arrays
dtm_data = dtm_dataset.read(1)
average_precip_2020_data = average_precip_2020_dataset.read(1)
perc_90_precip_2020_data = perc_90_precip_2020_dataset.read(1)

In [11]:
no_landslides = train_data[train_data['Target'] == 1]

# Compute centroids for landslides MultiPolygons
no_landslides_centroids = no_landslides.geometry.centroid

In [12]:
landslides = train_data[train_data['Target'] == 0]

# Compute centroids for landslides MultiPolygons
landslides_centroids = landslides.geometry.centroid

In [13]:
dtm_transform = dtm_dataset.transform

In [14]:
# import numpy as np
import rasterio

# Extracting the x and y coordinates from the centroids
x_coords = np.array(landslides_centroids.x)
y_coords = np.array(landslides_centroids.y)

# Convert these points to row, col in the DTM
rows_cols = [~dtm_transform * (x,y) for x, y in zip(x_coords, y_coords)]
rows, cols = zip(*rows_cols)
rows = np.array(rows, dtype=int)
cols = np.array(cols, dtype=int)

# Using Rasterio's sample method to get the elevation for all points at once
landslide_elevations = list(dtm_dataset.sample(zip(x_coords, y_coords)))
landslide_elevations = [elevation[0] for elevation in landslide_elevations]


In [15]:
# Extracting the x and y coordinates from no_landslides_centroids
x_coords_no_landslides = np.array(no_landslides_centroids.x)
y_coords_no_landslides = np.array(no_landslides_centroids.y)

# Convert these points to row, col in the DTM
rows_cols_no_landslides = [~dtm_transform * (x,y) for x, y in zip(x_coords_no_landslides, y_coords_no_landslides)]
rows_no_landslides, cols_no_landslides = zip(*rows_cols_no_landslides)
rows_no_landslides = np.array(rows_no_landslides, dtype=int)
cols_no_landslides = np.array(cols_no_landslides, dtype=int)

# Using Rasterio's sample method to get the elevation for all points at once for no_landslides_centroids
no_landslide_elevations = list(dtm_dataset.sample(zip(x_coords_no_landslides, y_coords_no_landslides)))
no_landslide_elevations = [elevation[0] for elevation in no_landslide_elevations]


In [16]:
from rasterstats import zonal_stats

# Extract DTM values
train_data['dtm_mean'] = [stat['mean'] for stat in zonal_stats(train_data, dtm_data, affine=dtm_dataset.transform, stats='mean')]

# Extract average precipitation 2020 values
train_data['avg_precip_2020_mean'] = [stat['mean'] for stat in zonal_stats(train_data, average_precip_2020_data, affine=average_precip_2020_dataset.transform, stats='mean')]

# Extract 90th percentile precipitation 2020 values
train_data['perc_90_precip_2020_mean'] = [stat['mean'] for stat in zonal_stats(train_data, perc_90_precip_2020_data, affine=perc_90_precip_2020_dataset.transform, stats='mean')]




In [17]:
# Calculate the distance from each train_data centroid to the nearest geological fault
train_data['distance_to_fault'] = train_data.geometry.centroid.distance(geological_faults.unary_union)

# Repeat for other datasets
train_data['distance_to_river'] = train_data.geometry.centroid.distance(river_network.unary_union)
train_data['distance_to_road'] = train_data.geometry.centroid.distance(road_network.unary_union)


In [18]:
new_elevations = []

for target in train_data['Target']:
    if target == 0:  # landslide
        new_elevations.append(landslide_elevations.pop(0))
    else:  # non-landslide
        new_elevations.append(no_landslide_elevations.pop(0))

train_data['Elevation'] = new_elevations

train_data

Unnamed: 0,Target,ID,geometry,dtm_mean,avg_precip_2020_mean,perc_90_precip_2020_mean,distance_to_fault,distance_to_river,distance_to_road,Elevation
0,1,ID_000001,"MULTIPOLYGON (((607152.916 5124458.395, 607137...",1500.036196,0.154537,0.265434,3501.188735,99.164823,7313.460316,1499.408203
1,1,ID_000002,"MULTIPOLYGON (((611957.101 5131543.071, 611970...",2370.732285,0.137018,0.223390,821.129744,557.910919,2730.168237,2369.408447
2,1,ID_000003,"MULTIPOLYGON (((612895.966 5130875.565, 612880...",2131.777161,0.133313,0.213740,267.602421,290.728799,1520.080009,2103.769043
3,1,ID_000004,"MULTIPOLYGON (((609366.882 5131249.149, 609361...",2485.924128,0.143612,0.243227,619.451164,255.350102,3473.022264,2484.286621
4,1,ID_000005,"MULTIPOLYGON (((611814.848 5132646.039, 611808...",2625.375500,0.136794,0.234096,306.035376,106.918126,2430.413317,2624.262207
...,...,...,...,...,...,...,...,...,...,...
12135,0,ID_012136,"MULTIPOLYGON (((540821.869 5097076.248, 540821...",2063.098693,0.284925,0.427346,67.710481,45.770766,3413.327766,2070.000000
12136,0,ID_012137,"MULTIPOLYGON (((542231.869 5096911.248, 542231...",1858.095930,0.285867,0.454944,61.403031,1320.683706,2461.103774,1858.089966
12137,0,ID_012138,"MULTIPOLYGON (((542636.869 5096726.248, 542641...",2048.554309,0.286188,0.455146,322.126622,1650.549467,2460.870050,2043.433960
12138,0,ID_012139,"MULTIPOLYGON (((541576.869 5096291.248, 541581...",2255.769422,0.287747,0.456125,25.794079,647.508181,3317.534037,2254.370117


In [19]:
train_data['x'] = train_data.geometry.centroid.x

In [20]:
train_data['y'] = train_data.geometry.centroid.y

In [21]:
df = train_data.drop(columns=['ID', 'geometry'])


In [22]:
df = df.dropna()

In [23]:
from osgeo import gdal, gdalconst

def calculate_slope_aspect(dtm_file, slope_output_file, aspect_output_file):
    # Open the DTM file
    ds = gdal.Open(dtm_file, gdalconst.GA_ReadOnly)
    
    # Get the band of the DTM (assuming it's the first band)
    band = ds.GetRasterBand(1)
    
    # Get the spatial resolution of the raster
    x_res, y_res = ds.GetGeoTransform()[1], ds.GetGeoTransform()[5]
    
    # Calculate slope
    slope = gdal.DEMProcessing(slope_output_file, ds, 'slope', format='GTiff', computeEdges=True, band=1, alg='ZevenbergenThorne')
    slope = None  # Close the dataset
    
    # Calculate aspect
    aspect = gdal.DEMProcessing(aspect_output_file, ds, 'aspect', format='GTiff', computeEdges=True, band=1, trigonometric=True)
    aspect = None  # Close the dataset
    
    ds = None  # Close the original DTM dataset

# Use the function
dtm_file = '/Users/muhammadluay/Desktop/Zindi/GEO AI Italy/datasets/dtm.tif'
calculate_slope_aspect(dtm_file, 'slope_output.tif', 'aspect_output.tif')




In [24]:
def extract_raster_values(points_gdf, raster_file):
    """
    Extract raster values at point locations.
    
    Parameters:
    - points_gdf: A GeoDataFrame containing geometries (either Point or MultiPolygon).
    - raster_file: Path to the raster file.
    
    Returns:
    - A list of raster values at the geometry centroids.
    """
    with rasterio.open(raster_file) as src:
        # Extract centroids of the geometries for sampling
        centroids = points_gdf.geometry.centroid
        coords = [(pt.x, pt.y) for pt in centroids]
        return [val[0] for val in src.sample(coords)]

# Try the extraction again with the modified function
slope_values = extract_raster_values(train_data, 'slope_output.tif')
aspect_values = extract_raster_values(train_data, 'aspect_output.tif')


In [25]:
# Add the extracted values as new columns to the training data
train_data['slope'] = slope_values
train_data['aspect'] = aspect_values


In [26]:
df = train_data

In [27]:
train_data

Unnamed: 0,Target,ID,geometry,dtm_mean,avg_precip_2020_mean,perc_90_precip_2020_mean,distance_to_fault,distance_to_river,distance_to_road,Elevation,x,y,slope,aspect
0,1,ID_000001,"MULTIPOLYGON (((607152.916 5124458.395, 607137...",1500.036196,0.154537,0.265434,3501.188735,99.164823,7313.460316,1499.408203,607174.077140,5.124483e+06,16.891533,215.264526
1,1,ID_000002,"MULTIPOLYGON (((611957.101 5131543.071, 611970...",2370.732285,0.137018,0.223390,821.129744,557.910919,2730.168237,2369.408447,611796.307967,5.131481e+06,13.441610,6.810957
2,1,ID_000003,"MULTIPOLYGON (((612895.966 5130875.565, 612880...",2131.777161,0.133313,0.213740,267.602421,290.728799,1520.080009,2103.769043,613139.606886,5.131229e+06,34.834660,44.501762
3,1,ID_000004,"MULTIPOLYGON (((609366.882 5131249.149, 609361...",2485.924128,0.143612,0.243227,619.451164,255.350102,3473.022264,2484.286621,609347.488060,5.131305e+06,14.806323,312.484772
4,1,ID_000005,"MULTIPOLYGON (((611814.848 5132646.039, 611808...",2625.375500,0.136794,0.234096,306.035376,106.918126,2430.413317,2624.262207,611761.133347,5.132713e+06,33.089279,328.243805
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12135,0,ID_012136,"MULTIPOLYGON (((540821.869 5097076.248, 540821...",2063.098693,0.284925,0.427346,67.710481,45.770766,3413.327766,2070.000000,540783.524896,5.097145e+06,0.000000,225.000000
12136,0,ID_012137,"MULTIPOLYGON (((542231.869 5096911.248, 542231...",1858.095930,0.285867,0.454944,61.403031,1320.683706,2461.103774,1858.089966,542179.737191,5.096990e+06,0.000000,-9999.000000
12137,0,ID_012138,"MULTIPOLYGON (((542636.869 5096726.248, 542641...",2048.554309,0.286188,0.455146,322.126622,1650.549467,2460.870050,2043.433960,542594.970638,5.096728e+06,20.209124,155.115326
12138,0,ID_012139,"MULTIPOLYGON (((541576.869 5096291.248, 541581...",2255.769422,0.287747,0.456125,25.794079,647.508181,3317.534037,2254.370117,541684.956441,5.096290e+06,0.000000,-9999.000000


In [28]:
df.replace(-9999, df['aspect'].median(), inplace=True)  # Here I'm replacing with median. Choose as per your analysis.
df = df.drop(columns=["geometry", "ID"])
df = df.dropna()

In [29]:
# Split data into features (X) and target (y)
X = df.drop(columns="Target")
y = df["Target"]

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)  # Note: We only transform (not fit) the test data to avoid data leakage

In [30]:
df.columns

Index(['Target', 'dtm_mean', 'avg_precip_2020_mean',
       'perc_90_precip_2020_mean', 'distance_to_fault', 'distance_to_river',
       'distance_to_road', 'Elevation', 'x', 'y', 'slope', 'aspect'],
      dtype='object')

In [31]:
from sklearn.ensemble import RandomForestClassifier

# Initialize the Random Forest Classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the classifier
clf.fit(X_train, y_train)


In [32]:
from sklearn.metrics import accuracy_score, classification_report

# Predict the target for the test set
y_pred = clf.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Print a classification report
print(classification_report(y_test, y_pred))

Accuracy: 96.03%
              precision    recall  f1-score   support

           0       0.88      0.82      0.84       320
           1       0.97      0.98      0.98      2099

    accuracy                           0.96      2419
   macro avg       0.92      0.90      0.91      2419
weighted avg       0.96      0.96      0.96      2419



In [33]:
train_data[['ID']]

Unnamed: 0,ID
0,ID_000001
1,ID_000002
2,ID_000003
3,ID_000004
4,ID_000005
...,...
12135,ID_012136
12136,ID_012137
12137,ID_012138
12138,ID_012139


In [34]:
# Assuming the raster datasets for the test data are similar
test_data['dtm_mean'] = [stat['mean'] for stat in zonal_stats(test_data, dtm_data, affine=dtm_dataset.transform, stats='mean')]
test_data['avg_precip_2020_mean'] = [stat['mean'] for stat in zonal_stats(test_data, average_precip_2020_data, affine=average_precip_2020_dataset.transform, stats='mean')]
test_data['perc_90_precip_2020_mean'] = [stat['mean'] for stat in zonal_stats(test_data, perc_90_precip_2020_data, affine=perc_90_precip_2020_dataset.transform, stats='mean')]
test_data['distance_to_fault'] = test_data.geometry.centroid.distance(geological_faults.unary_union)
test_data['distance_to_river'] = test_data.geometry.centroid.distance(river_network.unary_union)
test_data['distance_to_road'] = test_data.geometry.centroid.distance(road_network.unary_union)
test_data['slope'] = extract_raster_values(test_data, 'slope_output.tif')
test_data['aspect'] = extract_raster_values(test_data, 'aspect_output.tif')
test_data.replace(-9999, test_data['aspect'].median(), inplace=True)

In [35]:
test_data

Unnamed: 0,ID,geometry,dtm_mean,avg_precip_2020_mean,perc_90_precip_2020_mean,distance_to_fault,distance_to_river,distance_to_road,slope,aspect
0,ID_000001,POINT (541862.336 5103652.266),1303.143921,0.226309,0.335290,84.145427,1000.917702,532.189622,41.934853,27.244064
1,ID_000002,POINT (566456.496 5131798.978),2743.524658,0.175368,0.461412,102.287223,1649.903253,4006.930094,35.809925,316.567444
2,ID_000003,POINT (584598.972 5109016.391),1535.253906,0.194696,0.383039,827.982355,584.534148,773.740503,31.549198,121.869400
3,ID_000004,POINT (542414.162 5125941.301),1938.368408,0.170589,0.214643,791.679670,436.181599,5840.225672,30.327429,131.787659
4,ID_000005,POINT (532099.144 5133370.588),2013.937744,0.173891,0.294433,1813.340365,2311.669383,2461.286859,61.452034,317.859100
...,...,...,...,...,...,...,...,...,...,...
39995,ID_039996,POINT (605509.423 5146469.307),1204.146973,0.159315,0.377356,829.073174,191.197953,6.852835,8.188457,164.908310
39996,ID_039997,POINT (526815.199 5147832.968),1891.869995,0.156467,0.242401,1437.766857,1090.397403,279.211348,0.000000,174.141434
39997,ID_039998,POINT (526873.190 5147470.852),1891.869995,0.171749,0.286767,1182.744639,898.942591,293.134691,0.000000,174.141434
39998,ID_039999,POINT (569507.854 5101121.584),2015.124756,0.253332,0.526724,23.201835,695.121443,4489.687645,13.672082,112.259399


In [36]:
test_data['x'] = test_data.geometry.x

In [37]:
test_data['y'] = test_data.geometry.y

In [38]:
test_data

Unnamed: 0,ID,geometry,dtm_mean,avg_precip_2020_mean,perc_90_precip_2020_mean,distance_to_fault,distance_to_river,distance_to_road,slope,aspect,x,y
0,ID_000001,POINT (541862.336 5103652.266),1303.143921,0.226309,0.335290,84.145427,1000.917702,532.189622,41.934853,27.244064,541862.335807,5.103652e+06
1,ID_000002,POINT (566456.496 5131798.978),2743.524658,0.175368,0.461412,102.287223,1649.903253,4006.930094,35.809925,316.567444,566456.495993,5.131799e+06
2,ID_000003,POINT (584598.972 5109016.391),1535.253906,0.194696,0.383039,827.982355,584.534148,773.740503,31.549198,121.869400,584598.971887,5.109016e+06
3,ID_000004,POINT (542414.162 5125941.301),1938.368408,0.170589,0.214643,791.679670,436.181599,5840.225672,30.327429,131.787659,542414.162385,5.125941e+06
4,ID_000005,POINT (532099.144 5133370.588),2013.937744,0.173891,0.294433,1813.340365,2311.669383,2461.286859,61.452034,317.859100,532099.143919,5.133371e+06
...,...,...,...,...,...,...,...,...,...,...,...,...
39995,ID_039996,POINT (605509.423 5146469.307),1204.146973,0.159315,0.377356,829.073174,191.197953,6.852835,8.188457,164.908310,605509.422777,5.146469e+06
39996,ID_039997,POINT (526815.199 5147832.968),1891.869995,0.156467,0.242401,1437.766857,1090.397403,279.211348,0.000000,174.141434,526815.198540,5.147833e+06
39997,ID_039998,POINT (526873.190 5147470.852),1891.869995,0.171749,0.286767,1182.744639,898.942591,293.134691,0.000000,174.141434,526873.189669,5.147471e+06
39998,ID_039999,POINT (569507.854 5101121.584),2015.124756,0.253332,0.526724,23.201835,695.121443,4489.687645,13.672082,112.259399,569507.853816,5.101122e+06


In [39]:
# 1. Extracting the x and y coordinates from test_data
x_coords_test_data = np.array(test_data['x'])
y_coords_test_data = np.array(test_data['y'])

# 2. Convert these points to row, col in the DTM
rows_cols_test_data = [~dtm_transform * (x, y) for x, y in zip(x_coords_test_data, y_coords_test_data)]
rows_test_data, cols_test_data = zip(*rows_cols_test_data)
rows_test_data = np.array(rows_test_data, dtype=int)
cols_test_data = np.array(cols_test_data, dtype=int)

# 3. Using Rasterio's sample method to get the elevation for all points at once
test_data_elevations = list(dtm_dataset.sample(zip(x_coords_test_data, y_coords_test_data)))
test_data_elevations = [elevation[0] for elevation in test_data_elevations]

# 4. Adding the extracted elevations to test_data DataFrame
test_data['Elevation'] = test_data_elevations


In [40]:
test_data.isna().sum()

ID                           0
geometry                     0
dtm_mean                     0
avg_precip_2020_mean        93
perc_90_precip_2020_mean    93
distance_to_fault            0
distance_to_river            0
distance_to_road             0
slope                        0
aspect                       0
x                            0
y                            0
Elevation                    0
dtype: int64

In [41]:
for column in ['avg_precip_2020_mean', 'perc_90_precip_2020_mean']:
    mode_value = test_data[column].mode()[0]  # Mode returns a series, so we get the first value
    test_data[column].fillna(mode_value, inplace=True)

# Check for remaining missing values
print(test_data.isna().sum())

ID                          0
geometry                    0
dtm_mean                    0
avg_precip_2020_mean        0
perc_90_precip_2020_mean    0
distance_to_fault           0
distance_to_river           0
distance_to_road            0
slope                       0
aspect                      0
x                           0
y                           0
Elevation                   0
dtype: int64


In [42]:
test_df = test_data.drop(columns=["geometry", "ID"])

In [43]:
test_df

Unnamed: 0,dtm_mean,avg_precip_2020_mean,perc_90_precip_2020_mean,distance_to_fault,distance_to_river,distance_to_road,slope,aspect,x,y,Elevation
0,1303.143921,0.226309,0.335290,84.145427,1000.917702,532.189622,41.934853,27.244064,541862.335807,5.103652e+06,1303.143921
1,2743.524658,0.175368,0.461412,102.287223,1649.903253,4006.930094,35.809925,316.567444,566456.495993,5.131799e+06,2743.524658
2,1535.253906,0.194696,0.383039,827.982355,584.534148,773.740503,31.549198,121.869400,584598.971887,5.109016e+06,1535.253906
3,1938.368408,0.170589,0.214643,791.679670,436.181599,5840.225672,30.327429,131.787659,542414.162385,5.125941e+06,1938.368408
4,2013.937744,0.173891,0.294433,1813.340365,2311.669383,2461.286859,61.452034,317.859100,532099.143919,5.133371e+06,2013.937744
...,...,...,...,...,...,...,...,...,...,...,...
39995,1204.146973,0.159315,0.377356,829.073174,191.197953,6.852835,8.188457,164.908310,605509.422777,5.146469e+06,1204.146973
39996,1891.869995,0.156467,0.242401,1437.766857,1090.397403,279.211348,0.000000,174.141434,526815.198540,5.147833e+06,1891.869995
39997,1891.869995,0.171749,0.286767,1182.744639,898.942591,293.134691,0.000000,174.141434,526873.189669,5.147471e+06,1891.869995
39998,2015.124756,0.253332,0.526724,23.201835,695.121443,4489.687645,13.672082,112.259399,569507.853816,5.101122e+06,2015.124756


In [44]:
feature_columns = df.drop(columns=['Target']).columns.tolist()


In [45]:
test_df = test_df[feature_columns]


In [46]:
# test_df = test_df.dropna().reset_index()

In [47]:
test_df

Unnamed: 0,dtm_mean,avg_precip_2020_mean,perc_90_precip_2020_mean,distance_to_fault,distance_to_river,distance_to_road,Elevation,x,y,slope,aspect
0,1303.143921,0.226309,0.335290,84.145427,1000.917702,532.189622,1303.143921,541862.335807,5.103652e+06,41.934853,27.244064
1,2743.524658,0.175368,0.461412,102.287223,1649.903253,4006.930094,2743.524658,566456.495993,5.131799e+06,35.809925,316.567444
2,1535.253906,0.194696,0.383039,827.982355,584.534148,773.740503,1535.253906,584598.971887,5.109016e+06,31.549198,121.869400
3,1938.368408,0.170589,0.214643,791.679670,436.181599,5840.225672,1938.368408,542414.162385,5.125941e+06,30.327429,131.787659
4,2013.937744,0.173891,0.294433,1813.340365,2311.669383,2461.286859,2013.937744,532099.143919,5.133371e+06,61.452034,317.859100
...,...,...,...,...,...,...,...,...,...,...,...
39995,1204.146973,0.159315,0.377356,829.073174,191.197953,6.852835,1204.146973,605509.422777,5.146469e+06,8.188457,164.908310
39996,1891.869995,0.156467,0.242401,1437.766857,1090.397403,279.211348,1891.869995,526815.198540,5.147833e+06,0.000000,174.141434
39997,1891.869995,0.171749,0.286767,1182.744639,898.942591,293.134691,1891.869995,526873.189669,5.147471e+06,0.000000,174.141434
39998,2015.124756,0.253332,0.526724,23.201835,695.121443,4489.687645,2015.124756,569507.853816,5.101122e+06,13.672082,112.259399


In [48]:
# 2. Use your trained model to predict the test data
X_test_new = scaler.transform(test_df) # Standardize the test data using the scaler trained on train data
predictions = clf.predict(X_test_new)

# 3. Create the submission file
submission = pd.DataFrame({'ID': test_data['ID'], 'Target': predictions})
submission.to_csv('submission.csv', index=False)