# Data Preprocessing

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# Load the training and testing datasets
train_url = "https://raw.githubusercontent.com/dsrscientist/dataset5/main/termdeposit_train.csv"
test_url = "https://raw.githubusercontent.com/dsrscientist/dataset5/main/termdeposit_test.csv"

train_data = pd.read_csv(train_url)
test_data = pd.read_csv(test_url)

# Display basic information about the datasets.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly instead of being wrapped in print() (which would print a stray "None").
print("Training Data Info:")
train_data.info()
print("\nTesting Data Info:")
test_data.info()

# Handle missing or null values (drop or impute based on your analysis).
# Rebinding instead of inplace=True keeps the loaded frames untouched by side effects.
train_data = train_data.dropna()
test_data = test_data.dropna()

# Encode categorical variables (one-hot encoding).
# Only columns that actually exist in the training data are encoded; a requested
# column that is absent would otherwise abort the cell with a KeyError.
requested_categorical = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome']
categorical_features = [col for col in requested_categorical if col in train_data.columns]
missing_categorical = [col for col in requested_categorical if col not in train_data.columns]
if missing_categorical:
    print(f"\nSkipping categorical columns not found in the training data: {missing_categorical}")

# The encoder's default output is a sparse matrix; .toarray() densifies it and
# avoids the `sparse=` / `sparse_output=` keyword, whose name depends on the
# installed scikit-learn version.
encoder = OneHotEncoder(drop='first')
train_encoded_values = encoder.fit_transform(train_data[categorical_features]).toarray()
test_encoded_values = encoder.transform(test_data[categorical_features]).toarray()
encoded_feature_names = encoder.get_feature_names_out(categorical_features)

# Reuse each source frame's index: after dropna() the index may have gaps, and
# pd.concat(axis=1) aligns on index labels, so a fresh 0..n-1 index would pair
# encoded rows with the wrong records and introduce NaN rows.
train_encoded = pd.DataFrame(train_encoded_values, columns=encoded_feature_names, index=train_data.index)
test_encoded = pd.DataFrame(test_encoded_values, columns=encoded_feature_names, index=test_data.index)

# Replace the original categorical columns with their encoded counterparts
train_data = pd.concat([train_data.drop(columns=categorical_features), train_encoded], axis=1)
test_data = pd.concat([test_data.drop(columns=categorical_features), test_encoded], axis=1)

# Convert categorical target variable to numerical format
label_encoder = LabelEncoder()
train_data['subscribed'] = label_encoder.fit_transform(train_data['subscribed'])

# Explore the data to gain insights (optional)
# You can use functions like train_data.head(), train_data.describe(), train_data['subscribed'].value_counts(), etc.

# Display the first few rows of the modified training dataset
print("\nModified Training Data:")
print(train_data.head())


# Feature Engineering

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Assumes the Data Preprocessing cell has run: train_data / test_data hold the
# one-hot encoded frames and `encoder` / `categorical_features` are defined.
# Run this cell once per preprocessing run: the scaling step below overwrites
# the raw numeric columns that the binary features are derived from.

# Feature engineering: contact x month interaction.
# The raw 'contact' and 'month' columns were dropped during preprocessing, so
# the interaction is built from their one-hot indicator columns instead. This
# also keeps every feature numeric, which the downstream model requires.
encoded_columns = list(encoder.get_feature_names_out(categorical_features))
contact_columns = [name for name in encoded_columns if name.startswith('contact_')]
month_columns = [name for name in encoded_columns if name.startswith('month_')]


def add_contact_month_interactions(frame):
    """Return a copy of `frame` with one product column per (contact, month) indicator pair."""
    interactions = {
        f'{contact_col}__{month_col}': frame[contact_col] * frame[month_col]
        for contact_col in contact_columns
        for month_col in month_columns
    }
    # assign() overwrites same-named columns, so the interaction step can be repeated safely.
    return frame.assign(**interactions)


train_data = add_contact_month_interactions(train_data)
test_data = add_contact_month_interactions(test_data)

# Example: Create a binary feature 'recent_contact' based on the 'pdays' (number of days since last contact)
train_data['recent_contact'] = (train_data['pdays'] >= 0).astype(int)
test_data['recent_contact'] = (test_data['pdays'] >= 0).astype(int)

# Example: Create a binary feature 'previous_contact' based on the 'previous' (number of contacts before this campaign)
train_data['previous_contact'] = (train_data['previous'] > 0).astype(int)
test_data['previous_contact'] = (test_data['previous'] > 0).astype(int)

# Scale numerical features. The scaler is fitted on the training data only and
# then applied unchanged to the test data.
numeric_features = ['age', 'duration', 'campaign', 'pdays', 'previous']

scaler = StandardScaler()
train_data[numeric_features] = scaler.fit_transform(train_data[numeric_features])
test_data[numeric_features] = scaler.transform(test_data[numeric_features])

# Display the first few rows of the modified training dataset
print("\nModified Training Data with Feature Engineering:")
print(train_data.head())


# Train-Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Requires train_data as produced by the preprocessing and feature-engineering cells.

# The 'subscribed' column is the target (y); every other column is a feature (X).
target_column = 'subscribed'
y = train_data[target_column]
X = train_data.drop(target_column, axis=1)

# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the shape of each resulting set
print("Shapes of the sets:")
for set_name, subset in (
    ("X_train", X_train),
    ("X_val", X_val),
    ("y_train", y_train),
    ("y_val", y_val),
):
    print(f"{set_name}:", subset.shape)


# Model Selection

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Requires X_train, X_val, y_train and y_val from the Train-Test Split cell.
from sklearn.metrics import accuracy_score, classification_report

# Build a random forest with default hyperparameters and a fixed seed, then fit
# it on the training split (fit() returns the fitted estimator itself).
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predict labels for the held-out validation split
y_val_pred = model.predict(X_val)

# Report overall accuracy followed by the per-class classification report
accuracy = accuracy_score(y_val, y_val_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(y_val, y_val_pred))


# Model Training

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Requires X_train, X_val, y_train and y_val from the Train-Test Split cell.
# Note: this fits the same RandomForestClassifier configuration as the
# Model Selection cell and rebinds `model` to the newly fitted estimator.

# Random Forest with default hyperparameters and a fixed seed
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Score the model on the validation split
y_val_pred = model.predict(X_val)
accuracy = accuracy_score(y_val, y_val_pred)
validation_report = classification_report(y_val, y_val_pred)

# Accuracy first, then the per-class report
print(f'Accuracy on the validation set: {accuracy:.4f}')
print("\nClassification Report:")
print(validation_report)


# Model Evaluation

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
import matplotlib.pyplot as plt

# Requires the fitted `model` plus X_val / y_val from the earlier cells.

# Hard label predictions for the threshold-based metrics
y_val_pred = model.predict(X_val)

# Second column of predict_proba, used as the ranking score for ROC-AUC and the
# ROC curve. Computed once and reused instead of calling predict_proba twice.
y_val_score = model.predict_proba(X_val)[:, 1]

# Calculate evaluation metrics
accuracy = accuracy_score(y_val, y_val_pred)
precision = precision_score(y_val, y_val_pred)
recall = recall_score(y_val, y_val_pred)
f1 = f1_score(y_val, y_val_pred)
roc_auc = roc_auc_score(y_val, y_val_score)

# Display the metrics
print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')
print(f'ROC-AUC Score: {roc_auc:.4f}')

# Plot ROC curve; the dashed diagonal is the reference line for a random classifier
fpr, tpr, _ = roc_curve(y_val, y_val_score)
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve')
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC) Curve')
ax.legend(loc='lower right')
plt.show()


# Prediction on Test Data

In [None]:
# Requires the fitted `model`, X_train, and the test_data frame produced by the
# Data Preprocessing and Feature Engineering cells.

# test_data has already been one-hot encoded (and its raw categorical columns
# dropped) in the Data Preprocessing cell, and its numeric columns were scaled
# in the Feature Engineering cell. Repeating those steps here would fail on the
# dropped columns and would apply the scaler a second time, so they are not
# re-run.

# Select exactly the columns the model was trained on, in the same order.
# A KeyError here means test_data is missing a training feature.
X_test = test_data[X_train.columns]

# Display the first few rows of the modified test dataset
print("\nModified Test Data:")
print(X_test.head())

# Use the trained model to predict the target variable for the test set.
# `model` is the estimator fitted in the Model Training cell.
test_predictions = model.predict(X_test)

# You can further process test_predictions as needed (e.g., converting to the required format)

# Display the predicted values
print("\nPredicted Values on Test Data:")
print(test_predictions)
