In [2]:
# Import Libraries
import pandas as pd
import pickle
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import accuracy_score


In [3]:
# Load the datasets
x_train = pd.read_csv('x-train.csv')
x_test = pd.read_csv('x-test.csv')
y_train = pd.read_csv('y-train.csv')
y_test = pd.read_csv('y-test.csv')

y_train = y_train['labels']
y_test = y_test['labels']


In [4]:
# Preprocessing
x_train.fillna(0, inplace=True)
x_test.fillna(0, inplace=True)

scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)


In [5]:
print(x_train_scaled.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(520952, 21)
(65120, 21)
(520952,)
(65120,)


In [6]:
# 1. Train the SVM model
svm_model = LinearSVC(C=1) # Regularization parameter

svm_model.fit(x_train_scaled, y_train.values.ravel())  # Fit the model


In [9]:
# 2. Train the XGBoost model
xgb_model = xgb.XGBClassifier(
    eval_metric='mlogloss', 
    max_depth=6,              # Deeper trees for more complex patterns
    learning_rate=0.05,       # Lower learning rate for more training iterations
    n_estimators=1000,        # More boosting rounds
    subsample=0.8,            # Sample 80% of the data
    colsample_bytree=0.8,     # Use 80% of features for each tree
    n_jobs=-1                 # Use all CPUs for faster computation
)

xgb_model.fit(x_train_scaled, y_train)  # Fit the model

In [None]:
# 3. Train the Random Forest model
rf_model = RandomForestClassifier(
    n_estimators=200,         # Increase number of trees
    max_depth=10,             # Limit depth to avoid overfitting
    min_samples_split=2,      # Controls the minimum number of samples required to split an internal node
    min_samples_leaf=1,       # Minimum number of samples required to be at a leaf node
    max_features='sqrt',      # Use a subset of features to split at each node
    n_jobs=-1,                # Use all CPUs for faster computation
    random_state=42
)

rf_model.fit(x_train_scaled, y_train)  # Fit the model


In [7]:
# 4. Train the LightGBM model
lgb_model = lgb.LGBMClassifier(
    num_leaves=50,             # Larger number of leaves for more complex trees
    learning_rate=0.05,        # Learning rate, smaller for more boosting rounds
    n_estimators=1000,         # More boosting rounds
    max_depth=10,              # Limit depth of each tree
    bagging_fraction=0.8,      # Use 80% of data for each iteration (prevent overfitting)
    feature_fraction=0.8,      # Use 80% of features for each tree
    n_jobs=-1                  # Use all CPUs for faster computation
)

lgb_model.fit(x_train_scaled, y_train)  # Fit the model


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.027151 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1418
[LightGBM] [Info] Number of data points in the train set: 520952, number of used features: 21
[LightGBM] [Info] Start training from score -0.419982
[LightGBM] [Info] Start training from score -1.909440
[LightGBM] [Info] Start training from score -2.995947
[LightGBM] [Info] Start training from score -1.932479


In [11]:
# Save the trained models with pickle
with open('svm_model.pkl', 'wb') as svm_model_file:
    pickle.dump(svm_model, svm_model_file)

with open('xgb_model.pkl', 'wb') as xgb_model_file:
    pickle.dump(xgb_model, xgb_model_file)

with open('rf_model.pkl', 'wb') as rf_model_file:
    pickle.dump(rf_model, rf_model_file)

with open('lgb_model.pkl', 'wb') as lgb_model_file:
    pickle.dump(lgb_model, lgb_model_file)

# Save the scaler for all models
with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)


In [12]:
# Evaluate the SVM model
svm_train_accuracy = accuracy_score(y_train, svm_model.predict(x_train_scaled))
svm_test_accuracy = accuracy_score(y_test, svm_model.predict(x_test_scaled))

# Evaluate the XGBoost model
xgb_train_accuracy = accuracy_score(y_train, xgb_model.predict(x_train_scaled))
xgb_test_accuracy = accuracy_score(y_test, xgb_model.predict(x_test_scaled))

# Evaluate the Random Forest model
rf_train_accuracy = accuracy_score(y_train, rf_model.predict(x_train_scaled))
rf_test_accuracy = accuracy_score(y_test, rf_model.predict(x_test_scaled))

# Evaluate the LightGBM model
lgb_train_accuracy = accuracy_score(y_train, lgb_model.predict(x_train_scaled))
lgb_test_accuracy = accuracy_score(y_test, lgb_model.predict(x_test_scaled))









In [13]:
# Print out the results for all models
print(f'SVM Accuracy on training data: {svm_train_accuracy:.5f}')
print(f'SVM Accuracy on test data: {svm_test_accuracy:.5f}')

print(f'XGBoost Accuracy on training data: {xgb_train_accuracy:.5f}')
print(f'XGBoost Accuracy on test data: {xgb_test_accuracy:.5f}')

print(f'Random Forest Accuracy on training data: {rf_train_accuracy:.5f}')
print(f'Random Forest Accuracy on test data: {rf_test_accuracy:.5f}')

print(f'LightGBM Accuracy on training data: {lgb_train_accuracy:.5f}')
print(f'LightGBM Accuracy on test data: {lgb_test_accuracy:.5f}')


SVM Accuracy on training data: 0.86909
SVM Accuracy on test data: 0.87062
XGBoost Accuracy on training data: 0.96767
XGBoost Accuracy on test data: 0.96511
Random Forest Accuracy on training data: 0.94638
Random Forest Accuracy on test data: 0.94730
LightGBM Accuracy on training data: 0.97332
LightGBM Accuracy on test data: 0.96685
