Libraries Import 


In [31]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, recall_score, confusion_matrix, f1_score
import seaborn as sb

Loading the CSV files 


In [2]:
train_df = pd.read_csv("train_set.csv")
test_df = pd.read_csv("test_set.csv")
blind_df = pd.read_csv("blinded_test_set.csv")

Working on Training Set Data 


In [3]:
print(train_df.head())

     ID     Feature_1  Feature_2     Feature_3  Feature_4  Feature_5  \
0  ID_1  18281.541667    18432.0   9409.650391   0.514708   0.011300   
1  ID_2  20010.083333    20100.0   8303.049072   0.417707   0.014959   
2  ID_3  27260.125000    27437.0  12189.649414   0.447160   0.011428   
3  ID_4  41938.125000    42138.0  17866.433594   0.426019   0.009908   
4  ID_5  41274.125000    41439.0  14315.041992   0.346828   0.013596   

   Feature_6  Feature_7  Feature_8  Feature_9  ...  Feature_3230  \
0   0.045369   2.803803   0.356658   1.803803  ...    382.968383   
1   0.080294   2.338398   0.429532   1.338398  ...    452.986164   
2   0.046402   2.782842   0.359345   1.782842  ...    419.781765   
3   0.034878   3.060655   0.326727   2.060655  ...    439.023968   
4   0.065680   2.478506   0.403469   1.478506  ...    485.209184   

   Feature_3231  Feature_3232  Feature_3233  Feature_3234  Feature_3235  \
0        2214.0           1.0    136.625113      0.061710           0.0   
1       

In [4]:
print(train_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 315 entries, 0 to 314
Columns: 3240 entries, ID to CLASS
dtypes: float64(3238), int64(1), object(1)
memory usage: 7.8+ MB
None


In [5]:
print(train_df.describe())

           Feature_1      Feature_2     Feature_3   Feature_4   Feature_5  \
count     315.000000     315.000000    315.000000  315.000000  315.000000   
mean    36401.611839   36558.978836  13421.797935    0.399783    0.013326   
std     23979.228698   24006.711019   5229.346354    0.064272    0.002885   
min      4601.166667    4646.000000   2420.351481    0.137726    0.008904   
25%     23287.562500   23443.500000  10245.704590    0.357646    0.011459   
50%     34818.166667   35028.000000  13894.792969    0.394076    0.012477   
75%     45575.708333   45750.000000  16633.839844    0.434799    0.014242   
max    332120.750000  332379.000000  45741.601562    0.643473    0.025418   

        Feature_6   Feature_7   Feature_8   Feature_9    Feature_10  ...  \
count  315.000000  315.000000  315.000000  315.000000    315.000000  ...   
mean     0.066770    2.572654    0.395949    1.572654    355.140036  ...   
std      0.034442    0.305500    0.054492    0.305500   5460.014132  ...   
mi

In [7]:
print(train_df.isnull()) #Check for missing values 

        ID  Feature_1  Feature_2  Feature_3  Feature_4  Feature_5  Feature_6  \
0    False      False      False      False      False      False      False   
1    False      False      False      False      False      False      False   
2    False      False      False      False      False      False      False   
3    False      False      False      False      False      False      False   
4    False      False      False      False      False      False      False   
..     ...        ...        ...        ...        ...        ...        ...   
310  False      False      False      False      False      False      False   
311  False      False      False      False      False      False      False   
312  False      False      False      False      False      False      False   
313  False      False      False      False      False      False      False   
314  False      False      False      False      False      False      False   

     Feature_7  Feature_8  Feature_9  .

In [9]:
print(train_df.columns)

Index(['ID', 'Feature_1', 'Feature_2', 'Feature_3', 'Feature_4', 'Feature_5',
       'Feature_6', 'Feature_7', 'Feature_8', 'Feature_9',
       ...
       'Feature_3230', 'Feature_3231', 'Feature_3232', 'Feature_3233',
       'Feature_3234', 'Feature_3235', 'Feature_3236', 'Feature_3237',
       'Feature_3238', 'CLASS'],
      dtype='object', length=3240)


In [25]:
x = train_df.drop(columns=['ID', 'CLASS'])
y = train_df['CLASS']


Data Split and Scale 

In [28]:
X_train, X_val, y_train, y_val = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)

print("NaNs:", X_train.isnull().sum().sum())

print("Infs:", np.isinf(X_train).sum().sum())


NaNs: 2231
Infs: 4


In [29]:
# Replace Inf and -Inf with NaN
X_train.replace([np.inf, -np.inf], np.nan, inplace=True)
X_val.replace([np.inf, -np.inf], np.nan, inplace=True)

# Fill all NaNs with the column mean
X_train.fillna(X_train.mean(), inplace=True)
X_val.fillna(X_val.mean(), inplace=True)


In [30]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)


Train Logistic Regression 

In [32]:
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train_scaled, y_train)

y_pred = logreg.predict(X_val_scaled)
y_prob = logreg.predict_proba(X_val_scaled)[:, 1]  # Probability of class 1

cm = confusion_matrix(y_val, y_pred)
tn, fp, fn, tp = cm.ravel()

print("Accuracy:", accuracy_score(y_val, y_pred))
print("AUROC:", roc_auc_score(y_val, y_prob))
print("Sensitivity (Recall):", recall_score(y_val, y_pred))
print("Specificity:", tn / (tn + fp))
print("F1 Score:", f1_score(y_val, y_pred))

Accuracy: 0.6031746031746031
AUROC: 0.5884210526315788
Sensitivity (Recall): 0.44
Specificity: 0.7105263157894737
F1 Score: 0.46808510638297873
