In [26]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
import numpy as np

In [42]:
train_data = pd.read_csv('/content/drive/MyDrive/datasets/Participants_Data_analytics_olympiad_2023/train.csv')
test_data = pd.read_csv('/content/drive/MyDrive/datasets/Participants_Data_analytics_olympiad_2023/test.csv')

In [13]:
# train['is_zero_loans_over_30_days']
column_names = train.columns.tolist()
column_names

['customer_id',
 'firstname',
 'lastname',
 'record_number',
 'days_since_opened',
 'days_since_confirmed',
 'primary_term',
 'final_term',
 'days_till_primary_close',
 'days_till_final_close',
 'loans_credit_limit',
 'loans_next_payment_summary',
 'loans_outstanding_balance',
 'loans_max_overdue_amount',
 'loans_credit_cost_rate',
 'loans_within_5_days',
 'loans_within_5_to_30_days',
 'loans_within_30_to_60_days',
 'loans_within_60_to_90_days',
 'loans_over_90_days',
 'is_zero_loans_within_5_days',
 'is_zero_loans_within_5_to_30_days',
 'is_zero_loans_within_30_to_60_days',
 'is_zero_loans_within_60_to_90_days',
 'is_zero_loans_over_90_days',
 'utilization',
 'over_limit_count',
 'max_over_limit_count',
 'is_zero_utilization',
 'is_zero_over_limit_count',
 'is_zero_max_over_limit_count',
 'encoded_payment_0',
 'encoded_payment_1',
 'encoded_payment_2',
 'encoded_payment_3',
 'encoded_payment_4',
 'encoded_payment_5',
 'encoded_payment_6',
 'encoded_payment_7',
 'encoded_payment_8',
 '

In [28]:
# Mandatory features for analysis
mandatory_features = [
    'days_since_opened',
    'primary_term',
    'final_term',
    'days_till_primary_close',
    'days_till_final_close',
    'loans_credit_limit',
    'loans_outstanding_balance',
    'loans_max_overdue_amount',
    'loans_credit_cost_rate',
    'loans_within_5_days',
    'loans_over_90_days',
    'utilization',
    'over_limit_count',
    'max_over_limit_count',
    'encoded_payment_0',
    'encoded_payment_1',
    'encoded_loans_account_holder_type',
    'encoded_loans_credit_status',
    'encoded_loans_credit_type',
    'encoded_loans_account_currency'
]

In [46]:
# Select mandatory features and target variable
X_train = train_data[mandatory_features]
y_train = train_data['final_close_flag']

In [45]:
X_test = test_data[mandatory_features]

In [47]:
# Data preprocessing: Impute missing values and standardize features
numeric_imputer = SimpleImputer(strategy='mean')
X_train_imputed = numeric_imputer.fit_transform(X_train)
X_test_imputed = numeric_imputer.transform(X_test)

In [48]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imputed)
X_test_scaled = scaler.transform(X_test_imputed)

In [49]:
# Model selection and training: RandomForestClassifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_scaled, y_train)

In [50]:
# Prediction
y_pred = model.predict(X_test_scaled)

In [51]:
# Evaluation
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

ValueError: ignored

In [52]:
# Create the final submission DataFrame
submission_df = pd.DataFrame({
    'primary_close_flag': [0] * len(y_pred),
    'final_close_flag': y_pred
})

In [55]:
# Save the submission DataFrame to a CSV file
submission_df.to_csv('/content/drive/MyDrive/datasets/Participants_Data_analytics_olympiad_2023/Machine_hack_submissions.csv', index=False, header=True)

In [36]:
# Example prediction for a new customer
new_customer_data = {
    'days_since_opened': 150,
    'primary_term': 365,
    'final_term': 365,
    'days_till_primary_close': 215,
    'days_till_final_close': 215,
    'loans_credit_limit': 5000,
    'loans_outstanding_balance': 2000,
    'loans_max_overdue_amount': 100,
    'loans_credit_cost_rate': 0.1,
    'loans_within_5_days': 1,
    'loans_over_90_days': 0,
    'utilization': 0.4,
    'over_limit_count': 2,
    'max_over_limit_count': 1,
    'encoded_payment_0': 0,
    'encoded_payment_1': 1,
    'encoded_loans_account_holder_type': 2,
    'encoded_loans_credit_status': 1,
    'encoded_loans_credit_type': 0,
    'encoded_loans_account_currency': 1
}

In [37]:
# Create a DataFrame with the new customer data
new_customer_df = pd.DataFrame(new_customer_data, index=[0])

In [38]:
# Preprocess and scale the new customer data
new_customer_imputed = numeric_imputer.transform(new_customer_df)
new_customer_scaled = scaler.transform(new_customer_imputed)

In [40]:
# Make a prediction for the new customer
prediction = model.predict(new_customer_scaled)
print(f'Predicted final_close_flag: {prediction[0]}')

Predicted final_close_flag: 0


In [41]:
# Make a prediction for the new customer
prediction = model.predict(new_customer_scaled)
print(f'Predicted final_close_flag: {prediction[0]}')

Predicted final_close_flag: 0
