# Initial Model Evaluation
For this test phase, we will import one somewhat large dataset (in this case, 8_dataset.csv) among the ones in the datasets folder and prepare it, followed by running a few models on it to see how they perform. The models we will be using are:
- Logistic Regression
- Random Forest
- Support Vector Machine
- K-Nearest Neighbors
- Naive Bayes
- Decision Tree

#### Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
import os
from imblearn.over_sampling import SMOTE
from collections import Counter
import re

#### Preprocessing

In [2]:
os.chdir('C:/Users/hifia/Projects/ML Semester project/datasets')
df = pd.read_csv('8_dataset.csv', low_memory=False)

In [3]:
# Delete all entries where neo = nan
df = df[df.neo.notna()]

In [7]:
x = df.iloc[:, 8:]
neo = df.iloc[:, 6]
pha = df.iloc[:, 7]

print(x.shape)
print(neo.shape)
print(pha.shape)

print(x.head())

(958520, 37)
(958520,)
(958520,)
      H  diameter  albedo  diameter_sigma orbit_id      epoch  epoch_mjd  \
0  3.40   939.400  0.0900           0.200   JPL 47  2458600.5      58600   
1  4.20   545.000  0.1010          18.000   JPL 37  2459000.5      59000   
2  5.33   246.596  0.2140          10.594  JPL 112  2459000.5      59000   
3  3.00   525.400  0.4228           0.200   JPL 35  2458600.5      58600   
4  6.90   106.699  0.2740           3.140  JPL 114  2459000.5      59000   

    epoch_cal equinox         e  ...       sigma_i      sigma_om  \
0  20190427.0   J2000  0.076009  ...  4.608900e-09  6.168800e-08   
1  20200531.0   J2000  0.229972  ...  3.469400e-06  6.272400e-06   
2  20200531.0   J2000  0.256936  ...  3.223100e-06  1.664600e-05   
3  20190427.0   J2000  0.088721  ...  2.170600e-07  3.880800e-07   
4  20200531.0   J2000  0.190913  ...  2.740800e-06  2.894900e-05   

        sigma_w      sigma_ma      sigma_ad       sigma_n      sigma_tp  \
0  6.624800e-08  7.820700e

In [8]:
numeric_columns_indexes = df.select_dtypes(include=np.number).columns
numeric_columns_numeric_indexes = [df.columns.get_loc(col) for col in numeric_columns_indexes]
print(df.iloc[:, numeric_columns_numeric_indexes].corr())

                   spkid         H  diameter    albedo  diameter_sigma  \
spkid           1.000000  0.146078 -0.095362 -0.179656        0.023419   
H               0.146078  1.000000 -0.572648 -0.221658       -0.070651   
diameter       -0.095362 -0.572648  1.000000 -0.108880        0.337145   
albedo         -0.179656 -0.221658 -0.108880  1.000000       -0.080525   
diameter_sigma  0.023419 -0.070651  0.337145 -0.080525        1.000000   
epoch           0.006816 -0.175728  0.058475  0.094071       -0.005169   
epoch_mjd       0.006816 -0.175728  0.058475  0.094071       -0.005169   
epoch_cal       0.006999 -0.176366  0.058539  0.094114       -0.005120   
e               0.010955  0.345547 -0.050649 -0.020403       -0.016542   
a               0.000132 -0.037301  0.146799 -0.114484        0.206718   
q              -0.002115 -0.437418  0.329223 -0.267607        0.381335   
i               0.013449 -0.099026  0.054963 -0.086802        0.039580   
om              0.003874  0.000896  0.

In [9]:
# numeric_columns = x.select_dtypes(include=np.number).columns
# imputer = SimpleImputer(missing_values=np.nan, strategy='median')
# x[:, numeric_columns] = imputer.fit_transform(x[:, numeric_columns])
# print(Counter(x))

numeric_columns_indexes = x.select_dtypes(include=np.number).columns
numeric_columns_numeric_indexes = [x.columns.get_loc(col) for col in numeric_columns_indexes]
imputer = SimpleImputer(strategy='mean')
x.iloc[:, numeric_columns_numeric_indexes] = imputer.fit_transform(x.iloc[:, numeric_columns_numeric_indexes])

In [10]:
categorical_columns = x.select_dtypes(include='object').columns
categorical_columns_numeric_indexes = [x.columns.get_loc(col) for col in categorical_columns]
print(categorical_columns, categorical_columns_numeric_indexes)

Index(['orbit_id', 'equinox', 'class'], dtype='object') [4, 8, 35]


In [11]:
def process(string):
    if type(string) == str:
        return int(''.join(re.findall(r'\d+', string)))
    return string

x['orbit_id'] = x['orbit_id'].apply(process)
print(x['orbit_id'].head())

0     47
1     37
2    112
3     35
4    114
Name: orbit_id, dtype: int64


In [12]:
x['equinox'] = x['equinox'].apply(process)
print(x['equinox'].head())

0    2000
1    2000
2    2000
3    2000
4    2000
Name: equinox, dtype: int64


In [13]:
le = LabelEncoder()
x['class'] = le.fit_transform(x['class'])
print(x['class'].head())

0    7
1    7
2    7
3    7
4    7
Name: class, dtype: int32


#### Train test splits and smote

In [30]:
x_train, x_test, y_train, y_test = train_test_split(x, neo, test_size=0.3, random_state=42)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

# Smote
print("Class distribution in training set before SMOTE:", Counter(y_train), Counter(y_test))


(766816, 37) (191704, 37) (766816,) (191704,)
Class distribution in training set before SMOTE: Counter({'N': 748512, 'Y': 18304}) Counter({'N': 187113, 'Y': 4591})


In [31]:
smote = SMOTE(random_state=0)
x_traina, y_traina = smote.fit_resample(x_train, y_train)
print("Class distribution in training set after SMOTE:", Counter(y_traina))

Class distribution in training set after SMOTE: Counter({'N': 748512, 'Y': 748512})


In [32]:
print(x_traina.shape, y_traina.shape)
print(x_traina.info())

(1497024, 37) (1497024,)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1497024 entries, 0 to 1497023
Data columns (total 37 columns):
 #   Column          Non-Null Count    Dtype  
---  ------          --------------    -----  
 0   H               1497024 non-null  float64
 1   diameter        1497024 non-null  float64
 2   albedo          1497024 non-null  float64
 3   diameter_sigma  1497024 non-null  float64
 4   orbit_id        1497024 non-null  int64  
 5   epoch           1497024 non-null  float64
 6   epoch_mjd       1497024 non-null  int64  
 7   epoch_cal       1497024 non-null  float64
 8   equinox         1497024 non-null  int64  
 9   e               1497024 non-null  float64
 10  a               1497024 non-null  float64
 11  q               1497024 non-null  float64
 12  i               1497024 non-null  float64
 13  om              1497024 non-null  float64
 14  w               1497024 non-null  float64
 15  ma              1497024 non-null  float64
 16  ad         

In [33]:
os.chdir('C:/Users/hifia/Projects/ML Semester project/Model testing')
# Export the DataFrame to a CSV file
x_traina.to_csv('x_traina.csv', index=False)  # Set index=False to exclude the DataFrame index
print("Done")
y_traina.to_csv('y_traina.csv', index=False)  # Set index=False to exclude the DataFrame index
print("Done")
x_test.to_csv('x_test.csv', index=False)  # Set index=False to exclude the DataFrame index
print("Done")
y_test.to_csv('y_test.csv', index=False)  # Set index=False to exclude the DataFrame index
print("Done")

print(f'DataFrames have been saved to {os.getcwd()}')

Done
Done
Done
Done
DataFrames have been saved to C:\Users\hifia\Projects\ML Semester project\Model testing


### TRAINING BEGINS

In [34]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
import os
from imblearn.over_sampling import SMOTE
from collections import Counter
import re

We will use a bunch of different models to assess which model performs best, and then accordingly proceed with the remaining datasets.

In [35]:
x_train = pd.read_csv('x_traina.csv')
y_train = pd.read_csv('y_traina.csv')
x_test = pd.read_csv('x_test.csv')
y_test = pd.read_csv('y_test.csv')


In [36]:
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(1497024, 37) (191704, 37) (1497024, 1) (191704, 1)


#### K Fold Cross Validation

In [48]:
from sklearn.model_selection import StratifiedKFold, KFold, cross_val_score, LeavePOut, TimeSeriesSplit
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import warnings
warnings.filterwarnings('ignore')

folds = [5, 10, 15]

#### Default K-Fold:

Logistic Regression

In [49]:
sc = StandardScaler()
logistic_x_train = sc.fit_transform(x_train)
logistic_x_test = sc.transform(x_test)

regressor = LogisticRegression(max_iter=100000)

for fold in folds:
    # Create a StratifiedKFold object to ensure class balance in each fold
    k_fold = KFold(n_splits=fold, shuffle=True, random_state=42)

    # Perform K-fold cross-validation and get accuracy scores
    accuracy_scores = cross_val_score(regressor, x_train, y_train, cv=k_fold, scoring='accuracy')

    # Print the accuracy scores for each fold
    for fold_num, accuracy in enumerate(accuracy_scores, start=1):
        print(f"Fold {fold_num} Accuracy: {accuracy:.4f}")

    # Calculate the mean and standard deviation of the accuracy scores
    mean_accuracy = np.mean(accuracy_scores)
    std_accuracy = np.std(accuracy_scores)

    print(f"Mean Accuracy: {mean_accuracy:.4f}")
    print(f"Standard Deviation of Accuracy: {std_accuracy:.4f}")

    # Now, you can fit the model on the entire training dataset and evaluate on your test dataset
    regressor.fit(x_train, y_train)
    test_accuracy = regressor.score(x_test, y_test)
    print(f"Test Accuracy: {test_accuracy:.4f} with {fold} folds\n\n---------------------\n")

Fold 1 Accuracy: 0.6130
Fold 2 Accuracy: 0.7889
Fold 3 Accuracy: 0.6345
Fold 4 Accuracy: 0.7834
Fold 5 Accuracy: 0.7456
Mean Accuracy: 0.7131
Standard Deviation of Accuracy: 0.0748
Test Accuracy: 0.9565 with 5 folds

---------------------

Fold 1 Accuracy: 0.9976
Fold 2 Accuracy: 0.7353
Fold 3 Accuracy: 0.7171
Fold 4 Accuracy: 0.7253
Fold 5 Accuracy: 0.7239
Fold 6 Accuracy: 0.6755
Fold 7 Accuracy: 0.7483
Fold 8 Accuracy: 0.8088
Fold 9 Accuracy: 0.6521
Fold 10 Accuracy: 0.9249
Mean Accuracy: 0.7709
Standard Deviation of Accuracy: 0.1043
Test Accuracy: 0.9565 with 10 folds

---------------------

Fold 1 Accuracy: 0.9833
Fold 2 Accuracy: 0.6132
Fold 3 Accuracy: 0.6697
Fold 4 Accuracy: 0.7321
Fold 5 Accuracy: 0.7247
Fold 6 Accuracy: 0.7409
Fold 7 Accuracy: 0.7175
Fold 8 Accuracy: 0.7375
Fold 9 Accuracy: 0.6327
Fold 10 Accuracy: 0.6159
Fold 11 Accuracy: 0.9083
Fold 12 Accuracy: 0.7326
Fold 13 Accuracy: 0.7158
Fold 14 Accuracy: 0.9368
Fold 15 Accuracy: 0.9212
Mean Accuracy: 0.7588
Standard D

Random Forest

In [50]:
sc = StandardScaler()
forest_x_train = sc.fit_transform(x_train)
forest_x_test = sc.transform(x_test)

classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)

for fold in folds:
    k_fold = KFold(n_splits=fold, shuffle=True, random_state=42)

    # Perform K-fold cross-validation and get accuracy scores
    accuracy_scores = cross_val_score(regressor, x_train, y_train, cv=k_fold, scoring='accuracy')

    # Print the accuracy scores for each fold
    for fold_num, accuracy in enumerate(accuracy_scores, start=1):
        print(f"Fold {fold_num} Accuracy: {accuracy:.4f}")

    # Calculate the mean and standard deviation of the accuracy scores
    mean_accuracy = np.mean(accuracy_scores)
    std_accuracy = np.std(accuracy_scores)

    print(f"Mean Accuracy: {mean_accuracy:.4f}")
    print(f"Standard Deviation of Accuracy: {std_accuracy:.4f}")

    # Now, you can fit the model on the entire training dataset and evaluate on your test dataset
    regressor.fit(x_train, y_train)
    test_accuracy = regressor.score(x_test, y_test)
    print(f"Test Accuracy: {test_accuracy:.4f} with {fold} folds\n\n---------------------\n")

Fold 1 Accuracy: 0.6130
Fold 2 Accuracy: 0.7889
Fold 3 Accuracy: 0.6345
Fold 4 Accuracy: 0.7834
Fold 5 Accuracy: 0.7456
Mean Accuracy: 0.7131
Standard Deviation of Accuracy: 0.0748
Test Accuracy: 0.9565 with 5 folds

---------------------

Fold 1 Accuracy: 0.9976
Fold 2 Accuracy: 0.7353
Fold 3 Accuracy: 0.7171
Fold 4 Accuracy: 0.7253
Fold 5 Accuracy: 0.7239
Fold 6 Accuracy: 0.6755
Fold 7 Accuracy: 0.7483
Fold 8 Accuracy: 0.8088
Fold 9 Accuracy: 0.6521
Fold 10 Accuracy: 0.9249
Mean Accuracy: 0.7709
Standard Deviation of Accuracy: 0.1043
Test Accuracy: 0.9565 with 10 folds

---------------------

Fold 1 Accuracy: 0.9833
Fold 2 Accuracy: 0.6132
Fold 3 Accuracy: 0.6697
Fold 4 Accuracy: 0.7321
Fold 5 Accuracy: 0.7247
Fold 6 Accuracy: 0.7409
Fold 7 Accuracy: 0.7175
Fold 8 Accuracy: 0.7375
Fold 9 Accuracy: 0.6327
Fold 10 Accuracy: 0.6159
Fold 11 Accuracy: 0.9083
Fold 12 Accuracy: 0.7326
Fold 13 Accuracy: 0.7158
Fold 14 Accuracy: 0.9368
Fold 15 Accuracy: 0.9212
Mean Accuracy: 0.7588
Standard D

Support Vector Machine

In [51]:
# # sc = StandardScaler()
# # svm_x_train = sc.fit_transform(x_train)
# # svm_x_test = sc.transform(x_test)
# # print("Finished standard scaling")

# classifier = SVC(kernel = 'linear', random_state = 0)
# classifier.fit(logistic_x_train, y_train)
# print("Finished fitting")

# y_pred = classifier.predict(logistic_x_test)
# print("Finished predicting")

# cm = confusion_matrix(y_test, y_pred)
# print("Finished confusion matrix")

# print(cm)
# print(accuracy_score(y_test, y_pred))
# print("Finished accuracy score")

In [None]:
# Better feature extraction based on CNN patterns
# LSTM, RNN, GRU
# Ablation study logging all the parameters I'm using
# Deep learning models with plots
# Deeper into the literature survey