In [36]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import  roc_auc_score
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier


In [60]:

train_set=pd.read_csv('train_set.csv')
test_set=pd.read_csv('test_set.csv')

In [61]:

# Find missing values in the training set
missing_values = train_set.isnull().sum()
print(missing_values[missing_values > 0])

# Find missing values in the test set
missing_values = test_set.isnull().sum()
print(missing_values[missing_values > 0])

X2     2590
X3     2139
X75     456
X76     444
X77     447
X78     447
dtype: int64
X2     1085
X3      971
X75     186
X76     198
X77     195
X78     195
dtype: int64


In [62]:


categorical_cols = []
numerical_cols = []

for column in train_set.columns:
  if train_set[column].dtype == object or train_set[column].nunique() < 10:
    categorical_cols.append(column)
  else:
    numerical_cols.append(column)

print("Categorical Columns:", categorical_cols)
print("Numerical Columns:", numerical_cols)

Categorical Columns: ['X4', 'X5', 'X6', 'X8', 'X10', 'X11', 'X16', 'Y']
Numerical Columns: ['RecordId', 'X2', 'X3', 'X7', 'X9', 'X12', 'X13', 'X14', 'X15', 'X17', 'X18', 'X19', 'X20', 'X21', 'X22', 'X23', 'X24', 'X25', 'X26', 'X27', 'X28', 'X29', 'X30', 'X31', 'X32', 'X33', 'X34', 'X35', 'X36', 'X37', 'X38', 'X39', 'X40', 'X41', 'X42', 'X43', 'X44', 'X45', 'X46', 'X47', 'X48', 'X49', 'X50', 'X51', 'X52', 'X53', 'X54', 'X55', 'X56', 'X57', 'X58', 'X59', 'X60', 'X61', 'X62', 'X63', 'X64', 'X65', 'X66', 'X67', 'X68', 'X69', 'X70', 'X71', 'X72', 'X73', 'X74', 'X75', 'X76', 'X77', 'X78']


In [63]:

# Handle missing values for both train and test datasets
for column in categorical_cols:
  if column in train_set.columns and train_set[column].isnull().any():
    mode_imputer = SimpleImputer(strategy='most_frequent')
    train_set[column] = mode_imputer.fit_transform(train_set[[column]])
  if column in test_set.columns and test_set[column].isnull().any():
    if column in train_set.columns:
      mode_imputer = SimpleImputer(strategy='most_frequent')
      test_set[column] = mode_imputer.fit_transform(test_set[[column]])
    else:
      print(f"Warning: Column '{column}' is missing in the training set and cannot be imputed in the test set.")

for column in numerical_cols:
  if column in train_set.columns and train_set[column].isnull().any():
    mean_imputer = SimpleImputer(strategy='mean')
    train_set[column] = mean_imputer.fit_transform(train_set[[column]])
  if column in test_set.columns and test_set[column].isnull().any():
    if column in train_set.columns:
      mean_imputer = SimpleImputer(strategy='mean')
      test_set[column] = mean_imputer.fit_transform(test_set[[column]])
    else:
      print(f"Warning: Column '{column}' is missing in the training set and cannot be imputed in the test set.")

In [64]:


# Find missing values in the training set
missing_values = train_set.isnull().sum()
print(missing_values[missing_values > 0])

# Find missing values in the test set
missing_values = test_set.isnull().sum()
print(missing_values[missing_values > 0])

Series([], dtype: int64)
Series([], dtype: int64)


In [65]:
# prompt: do corelationanalyss of train_set  with column Y

# Calculate the correlation matrix
correlation_matrix = train_set.corr()

# Get the correlation of all columns with 'Y'
correlation_with_y = correlation_matrix['Y']

# Print the correlation values
print(correlation_with_y)

RecordId    0.002391
X2          0.021354
X3         -0.000553
X4         -0.010512
X5         -0.002148
              ...   
X75         0.012872
X76         0.005523
X77         0.013847
X78         0.012811
Y           1.000000
Name: Y, Length: 79, dtype: float64


In [144]:
# Keep only columns with correlation > 0 with 'Y' in the training set
relevant_features = correlation_with_y[correlation_with_y > 0.01].index.tolist()

# Print the number of relevant features
print(f'Number of relevant features: {len(relevant_features)}')

# Print the relevant features
print('Relevant features:', relevant_features)

# Set Y
y = train_set['Y']

if 'Y' in relevant_features:
    relevant_features.remove('Y')

# Select features for training
X = train_set[relevant_features]

# Test data
X_testdata = test_set[relevant_features]


Number of relevant features: 24
Relevant features: ['X2', 'X10', 'X12', 'X19', 'X21', 'X22', 'X24', 'X26', 'X31', 'X37', 'X51', 'X52', 'X57', 'X58', 'X65', 'X67', 'X69', 'X70', 'X72', 'X74', 'X75', 'X77', 'X78', 'Y']


In [145]:
# prompt: drop the RecordId column from train_set

if 'RecordId' in X.columns:
  X = X.drop('RecordId', axis=1)
if 'RecordId' in X_testdata.columns:
  X_testdata = X_testdata.drop('RecordId', axis=1)

In [146]:
X.columns

Index(['X2', 'X10', 'X12', 'X19', 'X21', 'X22', 'X24', 'X26', 'X31', 'X37',
       'X51', 'X52', 'X57', 'X58', 'X65', 'X67', 'X69', 'X70', 'X72', 'X74',
       'X75', 'X77', 'X78'],
      dtype='object')

In [147]:
X_testdata.columns

Index(['X2', 'X10', 'X12', 'X19', 'X21', 'X22', 'X24', 'X26', 'X31', 'X37',
       'X51', 'X52', 'X57', 'X58', 'X65', 'X67', 'X69', 'X70', 'X72', 'X74',
       'X75', 'X77', 'X78'],
      dtype='object')

In [148]:
scalar=MinMaxScaler()
X=scalar.fit_transform(X)
X_testdata=scalar.fit_transform(X_testdata)




In [149]:
X

array([[0.97260274, 1.        , 0.13628285, ..., 0.        , 0.        ,
        0.        ],
       [0.90920936, 0.        , 0.13666074, ..., 0.        , 0.        ,
        0.        ],
       [0.46575342, 1.        , 0.13618852, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.56811069, 0.        , 0.14294789, ..., 0.4120134 , 0.        ,
        0.4120134 ],
       [0.68493151, 0.        , 0.13788276, ..., 0.        , 0.        ,
        0.        ],
       [0.46928274, 0.        , 0.13602519, ..., 0.        , 0.        ,
        0.        ]])

In [150]:
X_testdata

array([[0.8630137 , 0.        , 0.09938696, ..., 0.        , 0.        ,
        0.        ],
       [0.30136986, 0.        , 0.09876615, ..., 0.        , 0.        ,
        0.        ],
       [0.2748524 , 0.        , 0.09873065, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.62839055, 0.        , 0.10032052, ..., 0.        , 0.        ,
        0.        ],
       [0.58524416, 0.        , 0.10059368, ..., 0.        , 0.        ,
        0.        ],
       [0.77843722, 0.        , 0.10440861, ..., 0.        , 0.        ,
        0.        ]])

In [151]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

In [155]:
# Train the Decision Tree model
#create decision tree classifier
clf = DecisionTreeClassifier(random_state=1, max_depth=5)
clf.fit(X_train, y_train)

In [156]:
# Evaluate the model on test data
from sklearn.metrics import roc_curve, roc_auc_score, accuracy_score
import matplotlib.pyplot as plt

y_pred = clf.predict(X_test)
y_probs = clf.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_probs)


In [157]:
# Calculate AUC
auc_score = roc_auc_score(y_test, y_probs)
print(f'AUC: {auc_score}')

AUC: 0.8510792372762275


In [117]:
X_testdata

array([[0.8630137 , 0.7027027 , 0.        , ..., 0.11904762, 0.54314721,
        0.52820513],
       [0.30136986, 0.07432432, 1.        , ..., 0.42857143, 0.53299492,
        0.53333333],
       [0.2748524 , 0.47375977, 0.        , ..., 0.18521694, 0.511141  ,
        0.51638348],
       ...,
       [0.62839055, 0.31711844, 0.        , ..., 0.30952381, 0.59526952,
        0.55364542],
       [0.58524416, 0.55877583, 1.        , ..., 0.3102409 , 0.50807286,
        0.51312941],
       [0.77843722, 0.49685031, 0.        , ..., 0.18633135, 0.55481152,
        0.5306254 ]])

In [118]:
# prompt: test this model on train_set and use the result to give probabilty of positive class for each row. Then geenerate a csv and store the values for probabilitiess in  a column

# Predict probabilities for the test set
y_test_probs = clf.predict_proba(X_testdata)[:, 1]

# Create a DataFrame with RecordId and predicted probabilities
test_set['Y_probability'] = y_test_probs

# Save the DataFrame to a CSV file
test_set[['RecordId', 'Y_probability']].to_csv('test_set_with_probabilities.csv', index=False)

In [140]:
import pandas as pd

# Read the generated CSV file
csv_file_path = 'test_set_with_probabilities.csv'
test_set_with_probabilities = pd.read_csv(csv_file_path)

# Get unique values in the Y_probability column
unique_y_probabilities = test_set_with_probabilities['Y_probability'].unique()

# Print the unique values
print(unique_y_probabilities)


[7.76008811e-04 4.25840794e-03 2.01521694e-02 4.90405117e-03
 6.40608035e-02 0.00000000e+00 1.44654088e-01 2.65251989e-03
 7.14285714e-02 4.54545455e-01 5.38461538e-01 2.50000000e-01
 2.85714286e-01 5.00000000e-01 1.11111111e-01 1.00000000e+00]
