In [1]:
import pandas as pd
import numpy as np

# Download the gzipped covtype CSV from the UCI repository. Because explicit
# column labels (the integers 0..54) are supplied via `names`, pandas treats
# the first line of the file as data rather than as a header row.
COVTYPE_URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.data.gz'
column_labels = range(55)
df = pd.read_csv(COVTYPE_URL, names=column_labels)

In [14]:
# Work on a 10,000-row random subset so the model fits below don't take as long.
# The size guard makes this cell safe to re-run: without it, a second execution
# would re-sample (i.e. reshuffle) the already-reduced frame and silently change
# which rows land in the train/test split produced further down.
SAMPLE_SIZE = 10000
if len(df) > SAMPLE_SIZE:
    df = df.sample(SAMPLE_SIZE, random_state=8)

In [15]:
# Split the frame into NumPy arrays: column 54 is used as the target vector,
# columns 0..53 form the feature matrix.
feature_columns = range(54)
y = df[54].to_numpy()
X = df[feature_columns].to_numpy()

In [16]:
# Sanity check: display the dimensions of the feature matrix and target vector.
(X.shape, y.shape)

((10000, 54), (10000,))

In [18]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# repeatable for a given X and y.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

In [19]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Sweep the SVC regularisation parameter C over seven powers of ten and keep
# the value that scores the highest accuracy on the held-out split.
# NOTE(review): C is chosen on the same X_test/y_test that best_acc is reported
# on, so best_acc is an optimistic estimate -- consider selecting C on a
# separate validation split or with cross-validation.
C_grid = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]
best_acc = 0
best_C = None

for i, C_value in enumerate(C_grid):
    clf_SVC = SVC(C=C_value)
    clf_SVC.fit(X_train, y_train)  # fits on the whole training split
    SVC_pred = clf_SVC.predict(X_test)
    # accuracy_score's documented argument order is (y_true, y_pred).
    acc = accuracy_score(y_test, SVC_pred)

    # Strict '>' means the smallest C wins when several values tie.
    if acc > best_acc:
        best_acc = acc
        best_C = C_value

    print(f'Loop {i}/{len(C_grid) - 1}')

best_acc, best_C

Loop 0/6
Loop 1/6
Loop 2/6
Loop 3/6
Loop 4/6
Loop 5/6
Loop 6/6


(0.725, 1000.0)

In [20]:
from sklearn.ensemble import RandomForestClassifier

# Sweep the random forest's max_depth over 2, 4, ..., 24 and keep the depth
# that scores the highest accuracy on the held-out split.
# NOTE(review): the depth is chosen on the same X_test/y_test that best_acc is
# reported on, so best_acc is an optimistic estimate -- consider selecting on a
# separate validation split or with cross-validation.
depth_grid = range(2, 26, 2)
best_acc = 0
best_depth = None

for i, depth in enumerate(depth_grid):
    # Seed the forest so the sweep gives the same result on every run;
    # an unseeded forest can pick a different "best" depth each time.
    clf_RF = RandomForestClassifier(max_depth=depth, random_state=42)
    clf_RF.fit(X_train, y_train)
    RF_pred = clf_RF.predict(X_test)
    # accuracy_score's documented argument order is (y_true, y_pred).
    acc = accuracy_score(y_test, RF_pred)

    # Strict '>' means the shallowest depth wins when several values tie.
    if acc > best_acc:
        best_acc = acc
        best_depth = depth

    print(f'Loop {i}/{len(depth_grid) - 1}')

best_acc, best_depth

Loop 0/11
Loop 1/11
Loop 2/11
Loop 3/11
Loop 4/11
Loop 5/11
Loop 6/11
Loop 7/11
Loop 8/11
Loop 9/11
Loop 10/11
Loop 11/11


(0.789, 24)

Saving final models retrained on the full 10,000-row sample (train and test splits combined)

In [22]:
import pickle

# Refit an SVC with the selected C on all of X, y and persist it to disk.
SVC_final = SVC(C=best_C)
SVC_final.fit(X, y)

model_name = 'SVC_clf.sav'
# Use a context manager so the file is flushed and closed even if dump() fails;
# a bare open() leaves the handle dangling until garbage collection.
# Only unpickle this file from a trusted location -- pickle can execute code.
with open(model_name, 'wb') as model_file:
    pickle.dump(SVC_final, model_file)

In [23]:
# Refit a random forest with the selected max_depth on all of X, y and persist
# it to disk. The seed makes the saved model reproducible across runs.
RF_final = RandomForestClassifier(max_depth=best_depth, random_state=42)
RF_final.fit(X, y)

rf_model_name = 'RF_clf.sav'
# Use a context manager so the file is flushed and closed even if dump() fails;
# a bare open() leaves the handle dangling until garbage collection.
# Only unpickle this file from a trusted location -- pickle can execute code.
with open(rf_model_name, 'wb') as model_file:
    pickle.dump(RF_final, model_file)