In [1]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import  roc_auc_score
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier

In [18]:

train_set=pd.read_csv('train_set.csv')
test_set=pd.read_csv('test_set.csv')

In [19]:

# Find missing values in the training set
missing_values = train_set.isnull().sum()
print(missing_values[missing_values > 0])

# Find missing values in the test set
missing_values = test_set.isnull().sum()
print(missing_values[missing_values > 0])

X2     2590
X3     2139
X75     456
X76     444
X77     447
X78     447
dtype: int64
X2     1085
X3      971
X75     186
X76     198
X77     195
X78     195
dtype: int64


In [20]:
categorical_cols = []
numerical_cols = []

for column in train_set.columns:
  if train_set[column].dtype == object or train_set[column].nunique() < 10:
    categorical_cols.append(column)
  else:
    numerical_cols.append(column)

print("Categorical Columns:", categorical_cols)
print("Numerical Columns:", numerical_cols)

Categorical Columns: ['X4', 'X5', 'X6', 'X8', 'X10', 'X11', 'X16', 'Y']
Numerical Columns: ['RecordId', 'X2', 'X3', 'X7', 'X9', 'X12', 'X13', 'X14', 'X15', 'X17', 'X18', 'X19', 'X20', 'X21', 'X22', 'X23', 'X24', 'X25', 'X26', 'X27', 'X28', 'X29', 'X30', 'X31', 'X32', 'X33', 'X34', 'X35', 'X36', 'X37', 'X38', 'X39', 'X40', 'X41', 'X42', 'X43', 'X44', 'X45', 'X46', 'X47', 'X48', 'X49', 'X50', 'X51', 'X52', 'X53', 'X54', 'X55', 'X56', 'X57', 'X58', 'X59', 'X60', 'X61', 'X62', 'X63', 'X64', 'X65', 'X66', 'X67', 'X68', 'X69', 'X70', 'X71', 'X72', 'X73', 'X74', 'X75', 'X76', 'X77', 'X78']


In [21]:
# Handle missing values for both train and test datasets
for column in categorical_cols:
  if column in train_set.columns and train_set[column].isnull().any():
    mode_imputer = SimpleImputer(strategy='most_frequent')
    train_set[column] = mode_imputer.fit_transform(train_set[[column]])
  if column in test_set.columns and test_set[column].isnull().any():
    if column in train_set.columns:
      mode_imputer = SimpleImputer(strategy='most_frequent')
      test_set[column] = mode_imputer.fit_transform(test_set[[column]])
    else:
      print(f"Warning: Column '{column}' is missing in the training set and cannot be imputed in the test set.")

for column in numerical_cols:
  if column in train_set.columns and train_set[column].isnull().any():
    mean_imputer = SimpleImputer(strategy='mean')
    train_set[column] = mean_imputer.fit_transform(train_set[[column]])
  if column in test_set.columns and test_set[column].isnull().any():
    if column in train_set.columns:
      mean_imputer = SimpleImputer(strategy='mean')
      test_set[column] = mean_imputer.fit_transform(test_set[[column]])
    else:
      print(f"Warning: Column '{column}' is missing in the training set and cannot be imputed in the test set.")

In [22]:
# Find missing values in the training set
missing_values = train_set.isnull().sum()
print(missing_values[missing_values > 0])

# Find missing values in the test set
missing_values = test_set.isnull().sum()
print(missing_values[missing_values > 0])

Series([], dtype: int64)
Series([], dtype: int64)


In [23]:
# Calculate the correlation matrix
correlation_matrix = train_set.corr()

# Get the correlation of all columns with 'Y'
correlation_with_y = correlation_matrix['Y']

# Print the correlation values
print(correlation_with_y)

RecordId    0.002391
X2          0.021354
X3         -0.000553
X4         -0.010512
X5         -0.002148
              ...   
X75         0.012872
X76         0.005523
X77         0.013847
X78         0.012811
Y           1.000000
Name: Y, Length: 79, dtype: float64


In [24]:

# Keep only columns with correlation > 0 with 'Y' in the training set
relevant_features = correlation_with_y[correlation_with_y > 0.01].index.tolist()

# Print the number of relevant features
print(f'Number of relevant features: {len(relevant_features)}')

# Print the relevant features
print('Relevant features:', relevant_features)

# Set Y
y = train_set['Y']

if 'Y' in relevant_features:
    relevant_features.remove('Y')

# Select features for training
X = train_set[relevant_features]

# Test data
X_testdata = test_set[relevant_features]

Number of relevant features: 24
Relevant features: ['X2', 'X10', 'X12', 'X19', 'X21', 'X22', 'X24', 'X26', 'X31', 'X37', 'X51', 'X52', 'X57', 'X58', 'X65', 'X67', 'X69', 'X70', 'X72', 'X74', 'X75', 'X77', 'X78', 'Y']


In [25]:
if 'RecordId' in X.columns:
  X = X.drop('RecordId', axis=1)
if 'RecordId' in X_testdata.columns:
  X_testdata = X_testdata.drop('RecordId', axis=1)

In [26]:
scalar=MinMaxScaler()
X=scalar.fit_transform(X)
X_testdata=scalar.fit_transform(X_testdata)




In [27]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

In [28]:
clf = DecisionTreeClassifier(random_state=1)
param_grid = {
    'max_depth': [5, 7, 9, 10],
    'min_samples_split': [5, 10, 7, 20],
    'min_samples_leaf': [50, 250, 550,600]
}


# Create a GridSearchCV instance
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, scoring='accuracy')

# Fit the grid search to your data
grid_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

KeyboardInterrupt: 

In [29]:
#adjust hyperparameters
clf.set_params(max_depth=5, min_samples_split=50, min_samples_leaf=5)

#train model
clf.fit(X_train, y_train)

In [30]:
from sklearn.metrics import roc_curve, roc_auc_score, accuracy_score
import matplotlib.pyplot as plt

y_pred = clf.predict(X_test)
y_probs = clf.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_probs)



# Calculate AUC
auc_score = roc_auc_score(y_test, y_probs)
print(f'AUC: {auc_score}')

AUC: 0.8571874912177164


In [31]:

# Predict probabilities for the test set
y_test_probs = clf.predict_proba(X_testdata)[:, 1]

# Create a DataFrame with RecordId and predicted probabilities
test_set['Y_probability'] = y_test_probs

# Save the DataFrame to a CSV file
test_set[['RecordId', 'Y_probability']].to_csv('test_set_with_probabilities.csv', index=False)


import pandas as pd

# Read the generated CSV file
csv_file_path = 'test_set_with_probabilities.csv'
test_set_with_probabilities = pd.read_csv(csv_file_path)

# Get unique values in the Y_probability column
unique_y_probabilities = test_set_with_probabilities['Y_probability'].unique()

# Print the unique values
print(unique_y_probabilities)
print(test_set_with_probabilities)


[0.00072189 0.00421246 0.02015217 0.00490405 0.01836735 0.06140351
 0.5        0.1292517  0.00266312 0.24       0.         0.40540541
 0.38461538 0.4        0.14285714 0.2        0.42857143]
        RecordId  Y_probability
0         300001       0.000722
1         300002       0.004212
2         300003       0.000722
3         300004       0.000722
4         300005       0.000722
...          ...            ...
105477    405478       0.000722
105478    405479       0.129252
105479    405480       0.000722
105480    405481       0.004212
105481    405482       0.000722

[105482 rows x 2 columns]
