# Dependencies


In [65]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from imblearn.combine import SMOTEENN, SMOTETomek
from xgboost import XGBClassifier

# Read file

In [10]:
# Get the path of all CSV files in the folder
folder_path = '../Q1_data/train'
file_list = os.listdir(folder_path)
csv_files = [file for file in file_list if file.endswith('.csv')]
print(csv_files)

In [16]:
# Read and merge all CSV files for training
data_list = []
for file in csv_files:
    file_path = os.path.join(folder_path, file)
    data = pd.read_csv(file_path)
    data_list.append(data)

df = pd.concat(data_list)

# Check the DataFrame
print(df.head())
print(df.shape)

           x          y          z  a  b  c  d  Is_Falling
0  18.495860  13.766527  14.362624  0  0  0  1           0
1  18.501072  13.827225  14.270268  0  0  1  0           0
2  18.405950  13.868976  14.094804  1  0  0  0           0
3  18.444572  13.910701  14.116078  0  1  0  0           0
4  18.418470  13.933917  14.320566  0  0  0  1           0
(134229, 8)


In [18]:
test_df = pd.read_csv('../Q1_data/test/test_set.csv')
print(test_df.head())
print(test_df.shape)

   ID         x          y          z  a  b  c  d  Is_Falling
0   1  6.912997  11.518698  15.471855  0  0  0  1           0
1   2  6.936432  11.574586  15.446939  0  0  1  0           0
2   3  6.935274  11.571790  15.437505  1  0  0  0           0
3   4  6.886688  11.561593  15.704019  0  0  0  1           0
4   5  6.921823  11.597728  15.634435  0  0  1  0           0
(6623, 9)


In [26]:
# Exploring the numerical distribution of labels
zero = len(df[df['Is_Falling'] == 1])
zero_test = len(test_df[test_df['Is_Falling'] == 1])
print(zero/df.shape[0])
print(zero_test/test_df.shape[0])

0.04896855374024987
0.05178921938698475


# Method 1: oversampling

In [148]:
# Format data
X = df.iloc[:, :7]
y = df.iloc[:, 7] 

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Using SMOTETomek to balance data
sm = SMOTETomek(random_state=0)
X_res, y_res = sm.fit_resample(X_train, y_train)

# Using the Random Forest Model
model = RandomForestClassifier(n_estimators=100, random_state=0, max_depth=20)
model.fit(X_res, y_res)

# Predict
y_pred = model.predict(X_val)

# Evaluate
print('Accuracy:', accuracy_score(y_val, y_pred))
print('Confusion Matrix:\n', confusion_matrix(y_val, y_pred))
print('Classification Report:\n', classification_report(y_val, y_pred))

Accuracy: 0.9320941667287491
Confusion Matrix:
 [[24014  1543]
 [  280  1009]]
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.94      0.96     25557
           1       0.40      0.78      0.53      1289

    accuracy                           0.93     26846
   macro avg       0.69      0.86      0.74     26846
weighted avg       0.96      0.93      0.94     26846


In [149]:
# Do it on test set
X_test = test_df.iloc[:, 1:8]
y_test = test_df.iloc[:, 8]

y_pred = model.predict(X_test)

print('Accuracy:', accuracy_score(y_test, y_pred))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_pred))
print('Classification Report:\n', classification_report(y_test, y_pred))

Accuracy: 0.912124414917711
Confusion Matrix:
 [[5936  344]
 [ 238  105]]
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.95      0.95      6280
           1       0.23      0.31      0.27       343

    accuracy                           0.91      6623
   macro avg       0.60      0.63      0.61      6623
weighted avg       0.92      0.91      0.92      6623


# Method 2: Ensemble model

In [152]:
# Format data
X = df.iloc[:, :7]
y = df.iloc[:, 7] 

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize XGBoost model
model = XGBClassifier(scale_pos_weight=95/5)

# Predict
model.fit(X_train, y_train)
y_pred = model.predict(X_val)

# Evaluate
print('Accuracy:', accuracy_score(y_val, y_pred))
print('Confusion Matrix:\n', confusion_matrix(y_val, y_pred))
print('Classification Report:\n', classification_report(y_val, y_pred))

Accuracy: 0.906541011696342
Confusion Matrix:
 [[23225  2332]
 [  177  1112]]
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.91      0.95     25557
           1       0.32      0.86      0.47      1289

    accuracy                           0.91     26846
   macro avg       0.66      0.89      0.71     26846
weighted avg       0.96      0.91      0.93     26846


In [153]:
# Do it on test set
X_test = test_df.iloc[:, 1:8]
y_test = test_df.iloc[:, 8]

y_pred = model.predict(X_test)

print('Accuracy:', accuracy_score(y_test, y_pred))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_pred))
print('Classification Report:\n', classification_report(y_test, y_pred))

Accuracy: 0.9291861694096331
Confusion Matrix:
 [[6034  246]
 [ 223  120]]
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.96      0.96      6280
           1       0.33      0.35      0.34       343

    accuracy                           0.93      6623
   macro avg       0.65      0.66      0.65      6623
weighted avg       0.93      0.93      0.93      6623


# Output the Result

In [154]:
test_df['prediction'] = y_pred
outputs = test_df[['ID', 'prediction']]
outputs.to_csv('../Q1_output.csv', index=False)