In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
import numpy as np

In [None]:

# Read the labelled training file first, then the unlabelled scoring file.
train_set, test_set = (
    pd.read_csv(csv_name) for csv_name in ('train_set.csv', 'test_set.csv')
)

In [None]:
# Tag every training column once: a column counts as categorical when it is
# stored as object dtype or takes fewer than 10 distinct values; everything
# else is treated as numerical.
column_kinds = [
  (name, train_set[name].dtype == object or train_set[name].nunique() < 10)
  for name in train_set.columns
]

categorical_cols = [name for name, is_categorical in column_kinds if is_categorical]
numerical_cols = [name for name, is_categorical in column_kinds if not is_categorical]

print("Categorical Columns:", categorical_cols)
print("Numerical Columns:", numerical_cols)

In [None]:
# Fill missing values in both frames, one column at a time.
#
# Categorical columns use the most frequent value, numerical columns the mean.
# Each imputer is fitted on the TRAINING column only and then applied to both
# train and test, so the two frames are filled with the same value and no
# statistic computed from the test set leaks into preprocessing.
for columns, strategy in ((categorical_cols, 'most_frequent'), (numerical_cols, 'mean')):
  for column in columns:
    test_has_gaps = column in test_set.columns and test_set[column].isnull().any()

    if column not in train_set.columns:
      # Only possible if train_set was modified after the column lists were built.
      if test_has_gaps:
        print(f"Warning: Column '{column}' is missing in the training set and cannot be imputed in the test set.")
      continue

    train_has_gaps = train_set[column].isnull().any()
    if not (train_has_gaps or test_has_gaps):
      continue

    imputer = SimpleImputer(strategy=strategy)
    imputer.fit(train_set[[column]])

    # transform() returns an (n_rows, 1) array; flatten it before assigning
    # back to a single column.
    if train_has_gaps:
      train_set[column] = imputer.transform(train_set[[column]]).ravel()
    if test_has_gaps:
      test_set[column] = imputer.transform(test_set[[column]]).ravel()


# Report any columns that still contain missing values in the training set
missing_values = train_set.isnull().sum()
print(missing_values[missing_values > 0])

# Report any columns that still contain missing values in the test set
missing_values = test_set.isnull().sum()
print(missing_values[missing_values > 0])



In [None]:


# Split the label from the predictors.
# 'Y' is the target; 'RecordId' is removed from the predictors of both frames
# whenever it is present (errors='ignore' skips labels a frame does not have).
non_feature_cols = ['Y', 'RecordId']

y = train_set['Y']
X = train_set.drop(columns=non_feature_cols, errors='ignore')

# Apply the same column removal to the data that will be scored later.
X_testdata = test_set.drop(columns=non_feature_cols, errors='ignore')

# Show which predictors remain.
X.columns

In [None]:

# Hold out 25% of the labelled rows for validation; the fixed seed keeps the
# split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

# Fit a random forest on all labelled rows; it is used here only to rank
# features by importance for SelectFromModel.
clf = RandomForestClassifier(
    n_estimators=1000,     # number of trees
    max_depth=5,           # limit tree depth to control overfitting
    min_samples_split=10,  # minimum samples required to split a node
    min_samples_leaf=2,    # minimum samples required at a leaf node
    max_features='sqrt',   # random subset of features considered per split
    random_state=42,
    n_jobs=-1
)
clf.fit(X, y)

# Keep the features whose importance is at least `custom_threshold`.
# NOTE(review): per the scikit-learn docs, forest importances sum to 1, so a
# cut-off of 0.5 only keeps a feature that holds half of the total importance
# and may select nothing - confirm this value is intended. Alternatives are
# threshold="mean" or a percentile of clf.feature_importances_.
custom_threshold = 0.5
selector_custom = SelectFromModel(clf, threshold=custom_threshold, prefit=True)
X_selected_custom = selector_custom.transform(X)
X_test_selected_custom = selector_custom.transform(X_testdata)

# Map the selector's boolean mask back to column names. This must be assigned
# here, before the print, so the cell runs on a fresh kernel.
selected_features_custom = X.columns[selector_custom.get_support()]
print("Selected features with custom threshold:", selected_features_custom)


In [None]:
# Report the selected features by their real column names.
# Placeholder labels such as 'X0', 'X1', ... must not be used here: they need
# not match the columns of X, and pd.DataFrame(X, columns=...) reindexes to the
# requested labels, so unmatched labels yield all-NaN columns and the printed
# names would not identify the actual features.
feature_names = X.columns.tolist()
X_df = pd.DataFrame(X, columns=feature_names)

selected_features_custom = X_df.columns[selector_custom.get_support()]

print("Selected features with custom threshold:", selected_features_custom)


In [None]:
# Random forest used for validation: trained on the training split only.
from sklearn.ensemble import RandomForestClassifier

forest_params = {
    'n_estimators': 1500,      # number of trees
    'max_depth': 5,            # cap tree depth to limit overfitting
    'min_samples_split': 10,   # minimum samples needed to split a node
    'min_samples_leaf': 2,     # minimum samples allowed in a leaf
    'max_features': 'sqrt',    # features considered at each split
    'random_state': 42,
    'n_jobs': -1,
}
clf = RandomForestClassifier(**forest_params)
clf.fit(X_train, y_train)

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score, accuracy_score
import matplotlib.pyplot as plt

# Hard class predictions for the hold-out split.
y_pred = clf.predict(X_test)

# Scores taken from column 1 of predict_proba are used for ranking.
holdout_proba = clf.predict_proba(X_test)
y_probs = holdout_proba[:, 1]

# ROC curve points for the hold-out split.
roc_points = roc_curve(y_test, y_probs)
fpr, tpr, thresholds = roc_points

# Area under the ROC curve on the hold-out split.
auc_score = roc_auc_score(y_test, y_probs)
print(f'AUC: {auc_score}')

In [None]:
# Refit the classifier on all labelled rows (X, y) instead of only the
# X_train split it was fitted on for validation.
clf.fit(X, y)

In [None]:

# Score the test rows. Column 1 of predict_proba is used as the probability,
# presumably the positive class of a binary target - confirm with clf.classes_.
y_test_probs = clf.predict_proba(X_testdata)[:, 1]

# Build the submission as its own frame instead of adding a column to
# test_set. Mutating test_set would make the feature-building cell pick up
# 'Y_probability' as a predictor if it is re-run after this cell.
csv_file_path = 'test_set_with_probabilities.csv'
submission = test_set[['RecordId']].assign(Y_probability=y_test_probs)
submission.to_csv(csv_file_path, index=False)

# Read the file back to check what was actually written.
test_set_with_probabilities = pd.read_csv(csv_file_path)

# Distinct predicted probabilities, as a quick sanity check of the output.
unique_y_probabilities = test_set_with_probabilities['Y_probability'].unique()

print(unique_y_probabilities)
print(test_set_with_probabilities)
