# Random Forest Cloud Detection
1. Data Preprocessing

In [1]:
import numpy as np
# Load the comma-separated dataset from disk into a 2-D NumPy array
# (genfromtxt parses to float by default).
data = np.genfromtxt(fname='data.csv', delimiter=',')

In [2]:
# Column 0 of `data` is the label; columns 1-21 are the 21 input features.
# NaN feature values are replaced with 0.0. nan_to_num returns a new array, so
# `data` itself is left untouched (by default it also maps +/-inf to very large
# finite values).
y, x = data[:, 0], np.nan_to_num(data[:, 1:22], nan=0.0)

In [3]:
# Collapse the labels to a binary target: every label >= 1 (i.e. the
# multi-layer cloud classes as well) becomes 1 = cloud. Labels below 1 are
# passed through unchanged, so 0 stays 0 = no cloud.
y = np.where(np.greater_equal(y, 1), 1, y)
# Quick sanity check: show the first label after the conversion.
print(y[0])

0.0


In [4]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples as the test set and train on the remaining 80%.
# Only a train/test split is made here (no separate validation set); the fixed
# random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [5]:
from sklearn.preprocessing import StandardScaler
# Standardise the features. The scaler is fitted on the training split only and
# the same fitted transform is then applied to both splits, so no statistics
# from the test split enter the scaling.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

2. Build Random Forest
3. Train

In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Fit a 300-tree random forest (fixed seed for reproducibility) on the
# training split; fit() returns the estimator itself, so it can be chained.
rf_classifier = RandomForestClassifier(n_estimators=300, random_state=42).fit(X_train, y_train)

# Score the model on the held-out test split.
y_pred = rf_classifier.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9255038937242327


4. Gather Feature Importance

In [7]:
# Rank the features of the fitted forest from most to least important.
# "Feature k" is column k of `x` (0-based), i.e. column k + 1 of `data`.
feature_importances = rf_classifier.feature_importances_
num_features = len(feature_importances)
# argsort is ascending, so reverse it to get a descending ranking.
sorted_indices = np.argsort(feature_importances)[::-1]

print("\nFeature Importances:")
for feature_idx, importance in zip(sorted_indices, feature_importances[sorted_indices]):
    print(f"Feature {feature_idx}: Importance = {importance}")


Feature Importances:
Feature 0: Importance = 0.09943824244065133
Feature 6: Importance = 0.08837987997691926
Feature 1: Importance = 0.0725464287022406
Feature 2: Importance = 0.06715528563599818
Feature 5: Importance = 0.06563014463732963
Feature 10: Importance = 0.06502393093231333
Feature 7: Importance = 0.06490915699675105
Feature 3: Importance = 0.06128350836477842
Feature 4: Importance = 0.060584334088559635
Feature 9: Importance = 0.050566976910904296
Feature 8: Importance = 0.04953102266289154
Feature 19: Importance = 0.0382666117141439
Feature 14: Importance = 0.034968024593306064
Feature 17: Importance = 0.03262088151205535
Feature 12: Importance = 0.02455793332431899
Feature 18: Importance = 0.0231378361515029
Feature 13: Importance = 0.02239466332140489
Feature 15: Importance = 0.021842891330709248
Feature 16: Importance = 0.020911899545999327
Feature 11: Importance = 0.019690800180332868
Feature 20: Importance = 0.0165595469768891


## Downsample to compare to QSVM

In [8]:
# Draw 1250 distinct row indices at random (seeded, so the subset is reproducible).
np.random.seed(7)
random_indices = np.random.choice(x.shape[0], 1250, replace=False)

# Keep only the sampled rows, using the same indices for features and labels so
# they stay aligned.
# NOTE: this overwrites `x` and `y`, so the cell is not idempotent; re-run the
# notebook from the top to get the full dataset back.
x, y = x[random_indices, :], y[random_indices]

In [9]:
# Reduce the feature set to columns 0 and 6 of `x`, the two features ranked
# highest by the random forest's feature importances.
# NOTE: this overwrites `x`; running the cell a second time fails because only
# two columns remain.
x = np.take(x, [0, 6], axis=1)

In [10]:
from sklearn.model_selection import train_test_split
# Re-split the downsampled, two-feature data: 20% held out for testing, 80% for
# training. Only a train/test split is made (no separate validation set); the
# fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [11]:
from sklearn.preprocessing import StandardScaler
# Standardise the reduced feature set with a fresh scaler, fitted on the new
# training split only and then applied unchanged to both splits.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Train a new 300-tree random forest (same seed as before) on the downsampled,
# two-feature training split. This replaces the earlier `rf_classifier`.
rf_classifier = RandomForestClassifier(n_estimators=300, random_state=42).fit(X_train, y_train)

# Score the reduced model on its held-out test split.
y_pred = rf_classifier.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.728
