<div align="center">


</div>

<h1 align="center">Portuguese Bank Marketing Analysis</h1>

<div align="center">

---

</div>


### 1: Data Loading and Exploration

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, OneHotEncoder , LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score

In [None]:
# Location of the campaign CSV in the project's GitHub repository
# NOTE(review): read with pandas' default comma delimiter -- confirm this
# mirror is comma-separated.
DATA_URL = (
    'https://raw.githubusercontent.com/GangasrinivasKatraji/'
    'Predictive-Analysis-and-Strategic-Insights-for-Portuguese-Bank-Marketing-Campaigns/'
    'main/Dataset/bank-additional-full.csv'
)

# Read the remote file into the working DataFrame
df = pd.read_csv(DATA_URL)


In [None]:
# Render the freshly loaded DataFrame as the cell output for a first look
df

In [None]:
# Preview the top of the table: the first five rows
df.head(n=5)

In [None]:
# Print the DataFrame summary: column names, non-null counts, dtypes and memory usage
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

In [None]:
# Count the missing entries in every column
df.isna().sum()

In [None]:
# Show the DataFrame once more before preprocessing begins
df

In [None]:
# Re-print the column / dtype summary as a reference for the preprocessing step
df.info()

### 2: Data Preprocessing

In [None]:
# Dropping rows with missing values (idempotent, so re-running is harmless)
df = df.dropna()

# Encode a *copy* of the cleaned frame. `df` keeps its original string labels,
# so the EDA plots show category names instead of integer codes, and this cell
# can be re-run safely: mapping an already-encoded 'y' in place would turn
# every target value into NaN.
df_model = df.copy()

# Encoding categorical variables. One fitted LabelEncoder is kept per column so
# the integer codes can be translated back with `inverse_transform`.
categorical_vars = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome']
label_encoders = {}
for column in categorical_vars:
    le = LabelEncoder()
    df_model[column] = le.fit_transform(df_model[column])
    label_encoders[column] = le

# Encoding the target variable
df_model['y'] = df_model['y'].map({'yes': 1, 'no': 0})

# Separate features and target. 'duration' (length of the call) is only known
# after the call has been made, so it is excluded from the features;
# errors='ignore' keeps the cell working if the column is already absent.
X = df_model.drop(columns=['y', 'duration'], errors='ignore')
y = df_model['y']

# Splitting the data into training and testing sets. `stratify=y` keeps the
# yes/no ratio identical in both parts, which matters for an imbalanced target.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scaling the data: fit on the training part only, then apply the same
# transformation to the test part so no test statistics leak into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


### 3: Exploratory Data Analysis (EDA)

In [None]:
# Class balance of the target: one bar per value of 'y'
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=df, x='y', ax=ax)
ax.set_title('Distribution of Target Variable (y)')
plt.show()

In [None]:
# Mean client age within each job category
average_age_by_job = df.groupby('job')['age'].mean()

# Draw the per-job means as a bar chart on a 10x6 inch figure
fig, ax = plt.subplots(figsize=(10, 6))
average_age_by_job.plot.bar(ax=ax)
ax.set_title('Average Age by Job')
ax.set_xlabel('Job')
ax.set_ylabel('Average Age')
plt.show()

In [None]:
# Distribution Comparison (Box Plot)
# A box plot (rather than a bar plot of means) shows the median, quartiles and
# outliers of age for each marital status, which is what the title describes.
sns.boxplot(x='marital', y='age', data=df)
plt.title('Age Distribution by Marital Status')
plt.show()

In [None]:
# Number of clients at each education level
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x='education', data=df, ax=ax)
ax.set_title('Distribution of Education Levels')
plt.show()

In [None]:
# Frequency of each marital status
marital_counts = df['marital'].value_counts()

# One bar per status on an 8x6 inch figure
fig, ax = plt.subplots(figsize=(8, 6))
ax.bar(marital_counts.index, marital_counts.values, color='mediumspringgreen')
ax.set(
    xlabel='Marital Status',
    ylabel='Frequency',
    title='Distribution of Marital Status',
)
plt.show()

In [None]:
# Frequency of each value in the 'contact' column
contact = df['contact'].value_counts()

# One bar per contact value on an 8x6 inch figure
fig, ax = plt.subplots(figsize=(8, 6))
ax.bar(contact.index, contact.values, color='crimson')
ax.set(
    xlabel='Contact Status',
    ylabel='Frequency',
    title='Distribution of Contact Status',
)
plt.show()


In [None]:
# Histogram of client ages: 20 bins on an 8x6 inch figure
fig, ax = plt.subplots(figsize=(8, 6))
ax.hist(df['age'], bins=20, color='orange', edgecolor='black')
ax.set(
    xlabel='Age',
    ylabel='Frequency',
    title='Histogram: Distribution of Age',
)
ax.grid(True)  # gridlines make the bin heights easier to read off
plt.show()

In [None]:
# Mean age per education level, as a two-column frame ('education', 'age')
average_age = df.groupby('education')['age'].mean().reset_index()

# Bar plot of those means on a 10x8 inch figure
fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(x='education', y='age', data=average_age, palette='Set2', ax=ax)
ax.set_xlabel('Education Level', fontsize=14)
ax.set_ylabel('Average Age', fontsize=14)
ax.set_title('Average Age by Education Level', fontsize=16)

# Tilt the category labels so they do not overlap
plt.xticks(rotation=45)
plt.show()

In [None]:
# Grouped counts: education level (hue) within each job category
fig, ax = plt.subplots(figsize=(12, 8))
sns.countplot(data=df, x='job', hue='education', ax=ax)
ax.set(
    xlabel='Job',
    ylabel='Count',
    title='Distribution of Education Levels by Job',
)
plt.xticks(rotation=45)
ax.legend(title='Education')
fig.tight_layout()
plt.show()

In [None]:
# Grouped counts: previous-campaign outcome (hue) within each contact method
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, x='contact', hue='poutcome', ax=ax)
ax.set(
    xlabel='Contact Method',
    ylabel='Count',
    title='Distribution of Contact Methods by Outcome',
)
ax.legend(title='Outcome')
fig.tight_layout()
plt.show()

### 4: Model Building

In [None]:
# Logistic Regression: linear baseline fitted on the scaled training features
log_reg = LogisticRegression()
log_reg.fit(X=X_train, y=y_train)

In [None]:
# Decision Tree Classifier
# random_state pins the tie-breaking between equally good splits so that a
# Restart & Run All reproduces the same tree and the same metrics.
tree_clf = DecisionTreeClassifier(random_state=42)
tree_clf.fit(X_train, y_train)

In [None]:
# Random Forest Classifier
# random_state fixes the bootstrap samples and feature subsets drawn for each
# tree, making the fitted forest (and its reported metrics) reproducible.
forest_clf = RandomForestClassifier(random_state=42)
forest_clf.fit(X_train, y_train)

In [None]:
# Gradient Boosting Classifier
# random_state seeds the randomness used while building the individual trees
# so repeated runs give the same model.
gb_clf = GradientBoostingClassifier(random_state=42)
gb_clf.fit(X_train, y_train)

In [None]:
# Support Vector Machine Classifier with scikit-learn's default settings,
# fitted on the scaled training features
svm_clf = SVC()
svm_clf.fit(X=X_train, y=y_train)

### 5: Model Evaluation

In [None]:
# Logistic Regression Evaluation
y_pred_log_reg = log_reg.predict(X_test)
# ROC AUC measures how well the model *ranks* positives above negatives, so it
# needs a continuous score. Use the predicted probability of class 1; hard 0/1
# labels collapse the curve to a single point and understate the AUC.
y_score_log_reg = log_reg.predict_proba(X_test)[:, 1]
print("Logistic Regression")
print(classification_report(y_test, y_pred_log_reg))
print("ROC AUC Score:", roc_auc_score(y_test, y_score_log_reg))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_log_reg))

In [None]:
# Decision Tree Classifier Evaluation
y_pred_tree_clf = tree_clf.predict(X_test)
# ROC AUC needs a continuous score, so pass the predicted probability of
# class 1 instead of the hard 0/1 labels.
y_score_tree_clf = tree_clf.predict_proba(X_test)[:, 1]
print("\nDecision Tree Classifier")
print(classification_report(y_test, y_pred_tree_clf))
print("ROC AUC Score:", roc_auc_score(y_test, y_score_tree_clf))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_tree_clf))

In [None]:
# Random Forest Classifier Evaluation
y_pred_forest_clf = forest_clf.predict(X_test)
# ROC AUC needs a continuous score, so pass the predicted probability of
# class 1 instead of the hard 0/1 labels.
y_score_forest_clf = forest_clf.predict_proba(X_test)[:, 1]
print("\nRandom Forest Classifier")
print(classification_report(y_test, y_pred_forest_clf))
print("ROC AUC Score:", roc_auc_score(y_test, y_score_forest_clf))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_forest_clf))

In [None]:
# Gradient Boosting Classifier Evaluation
y_pred_gb_clf = gb_clf.predict(X_test)
# ROC AUC needs a continuous score, so pass the predicted probability of
# class 1 instead of the hard 0/1 labels.
y_score_gb_clf = gb_clf.predict_proba(X_test)[:, 1]
print("\nGradient Boosting Classifier")
print(classification_report(y_test, y_pred_gb_clf))
print("ROC AUC Score:", roc_auc_score(y_test, y_score_gb_clf))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_gb_clf))

In [None]:
# Support Vector Machine Classifier Evaluation
y_pred_svm_clf = svm_clf.predict(X_test)
# SVC() is created without probability=True, so predict_proba is unavailable.
# The signed distance to the separating boundary from decision_function is a
# valid continuous score for ROC AUC (hard 0/1 labels would understate it).
y_score_svm_clf = svm_clf.decision_function(X_test)
print("\nSupport Vector Machine")
print(classification_report(y_test, y_pred_svm_clf))
# Labels below use the same wording as the other evaluation cells
print("ROC AUC Score:", roc_auc_score(y_test, y_score_svm_clf))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_svm_clf))

### 6: Suggestions to the Bank Marketing Team

In [None]:
# Recommendations for the marketing team, assembled one numbered item per line.
# The text starts and ends with a newline so it prints as a padded block.
suggestions = "\n" + "\n".join((
    "1. **Focus on specific job sectors**: Certain job types such as management, technician, and entrepreneur show a higher likelihood of subscribing to the product.",
    "2. **Target specific age groups**: Age groups around 30-60 years tend to have a higher conversion rate.",
    "3. **Optimize call timings**: Days of the week and months with higher conversion rates should be targeted more aggressively.",
    "4. **Leverage previous campaign data**: Use the outcome of previous campaigns (poutcome) to identify and target potential customers more effectively.",
    "5. **Economic indicators**: Pay attention to economic indicators such as employment variation rate and euribor 3-month rate which might influence customer decisions.",
)) + "\n"

# Heading first, then the list, each followed by a newline
print("Suggestions for the marketing team:", suggestions, sep="\n")


In [None]:
# Section 7: Challenges and Techniques

# Summary of the obstacles met during the analysis and how each was handled.
# Item 5 names all five classifiers that the notebook trains and evaluates.
challenges = """
1. **Handling missing values**: There were no missing values in the dataset, which simplified the preprocessing step.
2. **Encoding categorical variables**: Used Label Encoding to convert categorical variables into numeric values.
3. **Imbalanced dataset**: The dataset might be imbalanced, affecting the performance of the models. Techniques such as oversampling, undersampling, or using balanced class weights in models can be applied.
4. **Feature selection**: Dropped the 'duration' feature as it is not realistic to use it for prediction purposes before the call is made.
5. **Model selection and tuning**: Evaluated multiple models (Logistic Regression, Decision Tree, Random Forest, Gradient Boosting, Support Vector Machine) to identify the best performing model. Hyperparameter tuning can further improve model performance.
"""

print("Challenges faced and techniques used:")
print(challenges)


### Performance Summary Table

| Model                    | Accuracy | Precision (Class 1) | Recall (Class 1) | F1-Score (Class 1) | ROC AUC |
| ------------------------- | -------- | ------------------- | ---------------- | ------------------ | ------- |
| Logistic Regression     | 0.90     | 0.63                | 0.21             | 0.32               | 0.597   |
| Decision Tree Classifier | 0.83     | 0.29                | 0.34             | 0.31               | 0.616   |
| Random Forest Classifier | 0.89     | 0.54                | 0.30             | 0.39               | 0.635   |
| Gradient Boosting        | 0.90     | 0.64                | 0.24             | 0.35               | 0.613   |

### Key Observations

* **Accuracy:** All models except the Decision Tree have high accuracy (around 90%). However, this is misleading due to the imbalanced dataset (more class 0 samples).
* **Class 1 Performance:**  All models struggle with class 1 (minority class). This is evident in the low precision, recall, and F1-score for class 1.
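* **ROC AUC:**  ROC AUC scores are relatively low, indicating that the models aren't particularly good at discriminating between the two classes. Note that the values in the table were computed from hard class predictions (0/1 labels) rather than predicted probabilities, so they reflect a single decision threshold and understate each model's ranking ability.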
* **Best Model:** The Random Forest Classifier achieves the highest F1-score for class 1 (0.39) and a good ROC AUC (0.635), making it a relatively better choice for this imbalanced dataset. However, its performance on the minority class still needs improvement.

<div align="center">

---

</div>

<h1 align="center">Thank You!</h1>

<div align="center">

---

</div>
