In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the dataset CSV into a DataFrame.
data = pd.read_csv('tvol_lidarLiveP.csv')

# Preview the leading rows to check which columns were parsed.
print(data.head())

# Positional split: the final column is taken as the regression target and
# every column before it as a predictor (adjust if the layout differs).
y = data.iloc[:, -1]
X = data.iloc[:, :-1]

# Hold out 10% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)


   Status  origin      zmax     zmean       zsd     zskew     zkurt  \
0       1       0  0.482936  0.393575  0.640931  0.417088  0.000799   
1       1       0  0.619445  0.744484  0.669192  0.315335  0.035721   
2       1       0  0.590965  0.552324  0.719417  0.387989  0.005110   
3       1       0  0.585318  0.735132  0.211343  0.251785  0.194261   
4       1       0  0.297815  0.381137  0.203122  0.251367  0.095055   

   pzabovezmean  pzabove2       zq5  ...    zpcum1    zpcum2    zpcum3  \
0      0.653559  0.543409  0.000000  ...  0.228816  0.217799  0.216589   
1      0.972526  0.832260  0.000000  ...  0.066177  0.098509  0.116252   
2      0.749769  0.738540  0.000000  ...  0.217498  0.280967  0.292151   
3      0.687752  0.986705  0.679823  ...  0.001473  0.000000  0.000000   
4      0.959945  0.893755  0.000000  ...  0.050555  0.047751  0.052850   

     zpcum4    zpcum5    zpcum6    zpcum7    zpcum8    zpcum9      Target  
0  0.214684  0.208958  0.211244  0.275474  0.485716 

In [14]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
import xgboost as xgb

# Instantiate the three candidate regressors. No hyperparameters are tuned
# here; the two tree ensembles are given a fixed seed so runs are repeatable.
lr_model = LinearRegression()
rf_model, xgb_model = (
    RandomForestRegressor(random_state=42),
    xgb.XGBRegressor(random_state=42),
)


In [16]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

def evaluate_model(model, X_train, y_train):
    """Cross-validate ``model`` and report its mean R2 score.

    Runs ``cross_val_score`` with ``cv=5`` and ``scoring='r2'`` on the given
    data, prints the average together with the estimator's class name, and
    returns that average.

    Parameters
    ----------
    model : estimator handed to ``cross_val_score``.
    X_train : feature matrix used for cross-validation.
    y_train : target values aligned with ``X_train``.

    Returns
    -------
    The mean of the per-fold scores.
    """
    fold_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='r2')
    mean_score = fold_scores.mean()
    print(f"Average R2-score for {model.__class__.__name__}: {mean_score}")
    return mean_score

# Cross-validate every candidate on the training split, in the order
# linear regression -> random forest -> XGBoost, keeping each mean score.
lr_score, rf_score, xgb_score = (
    evaluate_model(candidate, X_train, y_train)
    for candidate in (lr_model, rf_model, xgb_model)
)


Average R2-score for LinearRegression: 0.614460784807388
Average R2-score for RandomForestRegressor: 0.7439567231548766
Average R2-score for XGBRegressor: 0.6709056273741324


In [18]:
import pickle

# Linear Regression is fitted and stored as the baseline model. It is NOT the
# top scorer in the cross-validation results printed earlier (Random Forest
# has the higher mean R2), so `best_model` here denotes the baseline that the
# following cells evaluate; swap in another estimator if that is not intended.
best_model = lr_model.fit(X_train, y_train)

# Persist the fitted baseline. The context manager guarantees the file handle
# is flushed and closed even if pickling raises, instead of leaking it.
with open('LR_Baseline.pkl', 'wb') as model_file:
    pickle.dump(best_model, model_file)


In [20]:
# In-sample predictions: the fitted model is scored on the same rows it was
# trained on, so the errors below are training errors, not hold-out estimates.
y_pred = best_model.predict(X_train)

# Mean squared error and mean absolute error of those predictions.
mse = mean_squared_error(y_train, y_pred)
mae = mean_absolute_error(y_train, y_pred)

print(f"MAE: {mae}, MSE: {mse}")


MAE: 57.14074194409077, MSE: 5906.6569810718


A. Random Forest Feature Importance

In [22]:
from sklearn.ensemble import RandomForestRegressor

# Fit a fresh random forest on the training split (rebinds `rf_model`).
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)

# Per-feature importance scores exposed by the fitted forest.
importances = rf_model.feature_importances_

# Pair each feature name with its score, then order from most to least important.
rf_importance_table = pd.DataFrame(
    {'Feature': X.columns, 'Importance': importances}
)
feature_importance_rf = rf_importance_table.sort_values(
    by='Importance',
    ascending=False,
)
print(feature_importance_rf.head())


   Feature  Importance
21    zq65    0.315345
22    zq70    0.130729
23    zq75    0.098889
24    zq80    0.062002
20    zq60    0.061912


B. Recursive Feature Elimination with RF

In [26]:
from sklearn.feature_selection import RFE

# Recursive feature elimination wrapped around the random forest, asked to
# keep 10 features (tune `n_features_to_select` as needed).
rfe = RFE(estimator=rf_model, n_features_to_select=10).fit(X_train, y_train)

# One row per feature with the rank RFE assigned to it; printed in ascending
# rank order, so the retained features (rank 1) come first.
feature_importance_rfe = pd.DataFrame(
    {
        'Feature': X.columns,
        'Ranking': rfe.ranking_,
    }
)
print(feature_importance_rfe.sort_values(by='Ranking'))


         Feature  Ranking
3          zmean        1
25          zq85        1
23          zq75        1
22          zq70        1
20          zq60        1
21          zq65        1
18          zq50        1
17          zq45        1
24          zq80        1
36        zpcum9        1
29        zpcum2        2
19          zq55        3
26          zq90        4
15          zq35        5
9            zq5        6
32        zpcum5        7
2           zmax        8
13          zq25        9
7   pzabovezmean       10
35        zpcum8       11
10          zq10       12
27          zq95       13
33        zpcum6       14
28        zpcum1       15
4            zsd       16
16          zq40       17
34        zpcum7       18
31        zpcum4       19
8       pzabove2       20
12          zq20       21
1         origin       22
30        zpcum3       23
6          zkurt       24
5          zskew       25
14          zq30       26
11          zq15       27
0         Status       28


C. XGBoost Feature Importance

In [29]:
import xgboost as xgb

# Fit a fresh XGBoost regressor on the training split (rebinds `xgb_model`).
xgb_model = xgb.XGBRegressor(random_state=42)
xgb_model.fit(X_train, y_train)

# Per-feature importance scores exposed by the fitted booster.
importance_xgb = xgb_model.feature_importances_

# Pair each feature name with its score, then order from most to least important.
xgb_importance_table = pd.DataFrame(
    {'Feature': X.columns, 'Importance': importance_xgb}
)
feature_importance_xgb = xgb_importance_table.sort_values(
    by='Importance',
    ascending=False,
)
print(feature_importance_xgb.head())


   Feature  Importance
21    zq65    0.629753
22    zq70    0.130930
23    zq75    0.082204
19    zq55    0.017560
17    zq45    0.014145


In [31]:
# Write each feature table to its own JSON file in the working directory.
for importance_table, json_path in (
    (feature_importance_rf, 'Features_RF_RF.json'),
    (feature_importance_rfe, 'Features_RFE_RF_RF.json'),
    (feature_importance_xgb, 'Features_RF_XGB.json'),
):
    importance_table.to_json(json_path)


In [33]:
# Keep only the features whose random-forest importance exceeds 0.01
# (adjust the threshold as needed) and restrict the training matrix to them.
selected_features = feature_importance_rf[feature_importance_rf['Importance'] > 0.01]['Feature']
X_train_selected = X_train[selected_features]

# Refit a random forest on the reduced feature set.
rf_model_selected = RandomForestRegressor(random_state=42)
rf_model_selected.fit(X_train_selected, y_train)

# Persist the fitted model. The context manager guarantees the file handle is
# flushed and closed even if pickling raises, instead of leaking it.
with open('RF_Selected_Features.pkl', 'wb') as model_file:
    pickle.dump(rf_model_selected, model_file)
