In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score, roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the hackathon training set from the Kaggle input directory.
train_path = "/kaggle/input/week4-practice-hackathon-2024/Train.csv"
train_df = pd.read_csv(train_path)

# Report, per column, how many entries are missing.
missing_per_column = train_df.isna().sum()
print(missing_per_column)

V1       0
V2       0
V3       0
V4       0
V5       0
V6       0
V7       0
V8       0
V9       0
V10      0
V11      0
V13      0
V19      0
V22      0
V30      0
V33      0
V35      0
V36      0
V40      0
V41      0
V42      0
V43      0
V45      0
V47      0
V50      0
V51      0
V52      0
V59      0
V63      0
V72      0
V75      0
class    0
dtype: int64


In [3]:
# Impute any missing entries with the mean of their own column.
column_means = train_df.mean()
train_df = train_df.fillna(column_means)

# Quick look at the imputed frame: dimensions, first rows, column labels.
for summary in (train_df.shape, train_df.head(), train_df.columns):
    print(summary)

(40776, 32)
     V1     V2     V3  V4      V5     V6    V7  V8     V9  V10  ...  V45  V47  \
0   0.0    1.0  243.0  29  4528.0  138.0   3.0  29  201.0  0.0  ...    2    3   
1   0.0   20.0    8.0  14  4183.0  183.0   4.0  26  156.0  0.0  ...    5    7   
2  14.0    2.0    2.0   9  1113.0   15.0  37.0  31   78.0  1.0  ...    7    5   
3   0.0  175.0   17.0  16  4250.0   68.0   8.0  30  152.0  0.0  ...    1    3   
4   2.0   -1.0    1.0   3     0.0    0.0   2.0   5    6.0  1.0  ...    1    5   

    V50  V51  V52   V59  V63  V72  V75  class  
0  71.0    0    0  46.0    1    2    0      1  
1  20.0    0    2  13.0    9    6    0      1  
2  11.0    0    0   7.0    9    2    1      0  
3   0.0    1    1  31.0    6    4    5      1  
4   0.0    0    0   2.0    8    0    1      1  

[5 rows x 32 columns]
Index(['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11',
       'V13', 'V19', 'V22', 'V30', 'V33', 'V35', 'V36', 'V40', 'V41', 'V42',
       'V43', 'V45', 'V47', 'V50', 'V5

In [4]:
# The 'class' column is the target; every other column is a feature.
y = train_df['class']
X = train_df.drop(columns=['class'])

# Hold out 30% of the rows for validation, with a fixed seed so the split
# is the same on every run.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Show how many training rows fall in each class.
print(y_train.value_counts())


class
1    14375
0    14168
Name: count, dtype: int64


In [5]:
# Pipeline: standardize the features, then fit logistic regression.
# Keeping the scaler inside the pipeline means that, under cross-validation,
# it is fitted only on the training part of each fold.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    # random_state pins the data shuffling used by the 'liblinear' and 'saga'
    # solvers, so repeated runs of the grid search give the same result.
    ('lr', LogisticRegression(max_iter=1000, random_state=42))
])

# Hyperparameter grid for the logistic-regression step only (the scaler is
# left at its defaults): inverse regularization strength C and the solver.
param_grid = {
    'lr__C': [0.001, 0.01, 0.1, 1, 10],
    'lr__solver': ['liblinear', 'lbfgs', 'saga']
}

In [6]:
# Exhaustive search over param_grid using 5-fold cross-validation,
# ranking candidates by F1.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='f1',
    cv=5,
)

# Run the search on the training split only; the validation split stays unseen.
grid_search.fit(X_train, y_train)


In [7]:
# Report the winning hyperparameters. Note that best_score_ is the mean
# cross-validated F1 over the folds of X_train, not a score on the hold-out
# validation split, so it is labelled accordingly.
print(f"Best parameters found: {grid_search.best_params_}")
print(f"Best mean cross-validated F1 score: {grid_search.best_score_}")

# best_estimator_ is the pipeline refitted with the best parameters on the
# data passed to fit(); use it to predict the hold-out validation rows.
best_pipeline = grid_search.best_estimator_
y_val_pred = best_pipeline.predict(X_val)

# F1 on the hold-out validation split (rows never seen during the search).
f1 = f1_score(y_val, y_val_pred)
print(f"F1 score for Logistic Regression after tuning and scaling: {f1}")


Best parameters found: {'lr__C': 1, 'lr__solver': 'lbfgs'}
Best F1 score on validation set: 0.6378784545980591
F1 score for Logistic Regression after tuning and scaling: 0.6387391124014931


In [10]:
# Additional hold-out metrics for the tuned pipeline, computed from the same
# validation predictions that were used for the F1 score.
accuracy = accuracy_score(y_val, y_val_pred)
recall = recall_score(y_val, y_val_pred)
precision = precision_score(y_val, y_val_pred)

# Display the metrics; without this the cell computes them but shows nothing.
print(f"Accuracy on validation set: {accuracy}")
print(f"Recall on validation set: {recall}")
print(f"Precision on validation set: {precision}")



In [11]:
# Read the hackathon test set from the Kaggle input directory.
test_path = "/kaggle/input/week4-practice-hackathon-2024/Test.csv"
test_df = pd.read_csv(test_path)


In [12]:
# Report, per column, how many entries are missing in the test set.
print(test_df.isna().sum())

# Impute with the training-set column means rather than the test set's own,
# so test rows are filled with the same values the training rows were.
# fillna matches the means to columns by label.
train_means = train_df.mean()
test_df = test_df.fillna(train_means)


Index    0
V1       0
V2       0
V3       0
V4       0
V5       0
V6       0
V7       0
V8       0
V9       0
V10      0
V11      0
V13      0
V19      0
V22      0
V30      0
V33      0
V35      0
V36      0
V40      0
V41      0
V42      0
V43      0
V45      0
V47      0
V50      0
V51      0
V52      0
V59      0
V63      0
V72      0
V75      0
dtype: int64


In [13]:
# Model inputs: the test frame without its 'Index' identifier column.
test_data = test_df.drop(columns=['Index'])

# Keep the identifiers aside; they are needed again for the submission file.
Index = test_df['Index']


In [14]:
# Predict a class for every test row with the tuned pipeline selected by the
# grid search; the pipeline applies its own scaling step before classifying.
y_test_pred = best_pipeline.predict(test_data)


In [15]:
# Pair each test row's identifier with its predicted class.
submission_columns = {'Index': Index, 'class': y_test_pred}
submission = pd.DataFrame(submission_columns)

In [18]:
# Write the predictions to CSV, leaving out the DataFrame's own row index.
submission_path = 'submission2.csv'
submission.to_csv(submission_path, index=False)
print("Submission file saved successfully.")

Submission file saved successfully.
