<a href="https://colab.research.google.com/github/NdumisoMbili/Python-and-Application/blob/main/Ndumiso_Mbili_Python%26App.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Section A: Data Visualization

In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc

In [None]:
# Uploading the data
# Colab-only helper: files.upload() is called here and its return value is
# not stored in a variable.
from google.colab import files
files.upload()

In [None]:
# Load the dataset
# NOTE(review): the path is a hard-coded literal (including the ". (1)"
# suffix); presumably it must match the name of the uploaded file exactly --
# confirm before re-running.
df = pd.read_csv('TRAQDataScienceTest. (1).csv')

In [None]:
# Display the first few rows of the dataset (rendered as the cell output
# because it is the last expression in the cell)
df.head()

In [None]:
# Remove the columns that are not used in the rest of the notebook.
unused_columns = [
    'Previous_Payment_Amount',
    'Previous_Total_Due',
    'Director_Status',
    'Contact_Score',
    'Credit_Risk',
]
df = df.drop(columns=unused_columns)

In [None]:
# (rows, columns) of df at this point in the notebook
df.shape

In [None]:
# Column labels currently present in df
df.columns

In [None]:
# Per-column dtype and non-null count summary
df.info()

In [None]:
#1. Plotting Monthly Account Balances
# Plain white seaborn theme for the figures below
sns.set(style="white")

# Mean Opening_Balance per Month, flattened back into a two-column frame
monthly_balance = (
    df.groupby('Month')['Opening_Balance']
    .mean()
    .reset_index()
)

# Line chart of the monthly averages, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(
    monthly_balance['Month'],
    monthly_balance['Opening_Balance'],
    marker='o',
    linestyle='-',
)
ax.set_title('Trend of Opening Balance Over Months')
ax.set_xlabel('Month')
ax.set_ylabel('Average Opening Balance')
ax.legend(['Opening Balance'])
ax.grid(True)
plt.show()

In [None]:
#2. Visualizing Arrears
# Sum of Opening_Arrears per Month
monthly_arrears = (
    df.groupby('Month')['Opening_Arrears']
    .sum()
    .reset_index()
)

# One bar per month showing the summed arrears
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    x='Month',
    y='Opening_Arrears',
    data=monthly_arrears,
    palette='viridis',
    ax=ax,
)
ax.set_title('Total Opening Arrears for Each Month')
ax.set_xlabel('Month')
ax.set_ylabel('Total Opening Arrears')
plt.show()

In [None]:
#3. Distribution of Account Age
# Switch to the dark seaborn theme from here on
sns.set(style="dark")

# Histogram (20 bins) of Opening_acc_age with a KDE curve overlaid
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['Opening_acc_age'], bins=20, kde=True, color='blue', ax=ax)
ax.set_title('Distribution of Opening Account Age')
ax.set_xlabel('Opening Account Age')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Section B: Data Science

In [None]:
# 1. Summary Statistics
# Mean, median and standard deviation for the three monetary columns,
# one row per column.
stat_columns = ['Opening_Balance', 'Opening_Arrears', 'Payment_Amount']
selected = df[stat_columns]
summary_stats = (
    selected.describe().T
    .assign(median=selected.median())
    [['mean', 'median', 'std']]
)
print(summary_stats)

In [None]:
#2. Correlation Matrix
# Pearson correlation is only defined for numeric data. Calling df.corr() on
# a frame that still holds object (string) columns raises in pandas >= 2.0,
# so the numeric (and boolean) columns are selected explicitly first.
plt.figure(figsize=(18, 12))
corr_matrix = df.select_dtypes(include=['number', 'bool']).corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', linewidths=0.5, annot_kws={"size": 10}, cbar_kws={"shrink": 0.8})
plt.title('Correlation Heatmap', fontsize=16)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.show()

In [None]:
#3. Feature Importance
# Fits a random forest on every column except the target and ranks the
# (preprocessed) features by impurity-based importance.

from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder


# Separate features and target variable
X = df.drop(columns=['Current_Payment'])
y = df['Current_Payment']

# Define categorical and numerical features.
# 'number' covers every integer/float width, not only the platform default.
numeric_features = X.select_dtypes(include=['number']).columns
categorical_features = X.select_dtypes(include=['object']).columns

# Pipeline for preprocessing: mean-impute numerics, mode-impute and one-hot
# encode categoricals
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean'))
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop='first'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Append classifier to preprocessing pipeline
rf_model = Pipeline(steps=[('preprocessor', preprocessor),
                            ('classifier', RandomForestClassifier(random_state=42))])

# Fit the model
rf_model.fit(X, y)

# Get feature importances (one value per column of the *transformed* matrix)
feature_importances = rf_model.named_steps['classifier'].feature_importances_

# The forest sees the preprocessor's output, not X itself: columns are
# reordered (numeric block first, then categorical) and every categorical
# column is expanded into several one-hot columns. Pairing the importances
# with X.columns therefore mislabels them or fails on a length mismatch, so
# the names are taken from the fitted preprocessor instead.
encoded_feature_names = rf_model.named_steps['preprocessor'].get_feature_names_out()
assert len(encoded_feature_names) == len(feature_importances), \
    "feature names and importances are out of sync"

# Create a DataFrame to display feature importances
feature_importances_df = pd.DataFrame({
    'Feature': encoded_feature_names,
    'Importance': feature_importances
})

# Sort by importance descending
feature_importances_df = feature_importances_df.sort_values(by='Importance', ascending=False)

# Display the top 5 most important features
top_features = feature_importances_df.head(5)
print("Top 5 most important features:")
print(top_features)


In [None]:
# Share of missing entries in every column, expressed as a percentage
import numpy as np
null_percentage = df.isna().mean().mul(100)
print(null_percentage)

In [None]:
# Keep only the columns whose missing-value share is at most 50%
# (columns above 50% are discarded).
sparse_enough = null_percentage.le(50)
columns_to_keep = null_percentage.index[sparse_enough.to_numpy()]
df = df[columns_to_keep]

In [None]:
# 4. Missing Values
# Percentage of missing entries per column, listing only the columns that
# still have gaps, worst first.
pct_missing = df.isna().mean().mul(100)
missing_values = pct_missing.loc[pct_missing.gt(0)].sort_values(ascending=False)
print(missing_values)

In [None]:
# Imputation strategy: replace missing numeric values with the column mean.
# The previous form, df[column].fillna(..., inplace=True), mutates an
# intermediate Series: pandas warns about it, and under Copy-on-Write it does
# not update df at all. Building a new frame avoids both problems and keeps
# the cell safe to re-run.
numeric_columns = df.select_dtypes(include='number').columns
# fillna with a Series fills column-by-column using the matching label, so
# non-numeric columns are left untouched.
df = df.fillna(df[numeric_columns].mean())

In [None]:
# Count the nulls left in each column after imputation, then show the frame
remaining_nulls = df.isnull().sum()
print("\nMissing values after imputation:\n", remaining_nulls)
print("\nDataFrame after imputation:\n", df)

In [None]:
# 5. Distribution Analysis for Delinquency Stage
# Bar chart of how many rows fall into each 'Delinquency Stage' value
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, x='Delinquency Stage', ax=ax)
ax.set_title('Distribution of Delinquency Stage')
ax.set_xlabel('Delinquency Stage')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Section C: Data Cleaning and Machine Learning

In [None]:
# 1. Handle extreme values (IQR rule)
# Works on a copy (df1) so df itself is left untouched. Only numeric columns
# are considered: quantiles are undefined for object (string) columns, and
# calling .quantile() on one raises a TypeError.
df1 = df.copy()
numeric_columns = df1.select_dtypes(include='number').columns
for column in numeric_columns:
    # Columns with fewer than 12 distinct values are skipped.
    if df1[column].nunique() >= 12:
        # Quartiles are taken from the rows that survived the previous
        # columns' filters, i.e. the filtering is sequential.
        q1 = df1[column].quantile(0.25)
        q3 = df1[column].quantile(0.75)
        iqr = q3 - q1
        # Keep rows inside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]; rows with a missing
        # value in this column compare False and are dropped as well.
        within_fences = df1[column].between(q1 - 1.5 * iqr, q3 + 1.5 * iqr)
        df1 = df1[within_fences]
df1 = df1.reset_index(drop=True)
display(df1.head())
print('\n\033[1mInference:\033[0m Before removal of outliers, The dataset had {} samples.'.format(df.shape[0]))
print('\033[1mInference:\033[0m After removal of outliers, The dataset now has {} samples.'.format(df1.shape[0]))

In [None]:
# Fill missing values with mean
# numeric_only=True restricts the mean to numeric columns; without it
# df.mean() raises on frames that still contain object (string) columns in
# pandas >= 2.0. Non-numeric columns are left as they are.
df = df.fillna(df.mean(numeric_only=True))

In [None]:
# Columns of df1 that hold exactly one distinct (non-null) value
distinct_counts = df1.nunique()
constant_features = distinct_counts[distinct_counts == 1].index.tolist()
print("Constant features:", constant_features)

# Drop those columns into a new frame and show the result
df_filtered = df1.drop(columns=constant_features)
print("\nDataFrame after removing constant features:\n", df_filtered)

In [None]:
# Rebuilds df_filtered from df1 without the constant columns; this is the
# same statement the previous cell already executes.
df_filtered = df1.drop(columns=constant_features)

In [None]:
# Encode Categorical Variables
# One-hot encodes df, dropping the first level of each encoded column.
# NOTE(review): this operates on df, not on df1 / df_filtered built in the
# preceding cells -- confirm that is intended.
df = pd.get_dummies(df, drop_first=True)

In [None]:
# 2. Feature Engineering
# NOTE(review): Payment_Difference is computed from Current_Payment, which a
# later cell selects as the prediction target. Unless this column is excluded
# from the feature matrix, the model can reconstruct the target from it
# (target leakage).
df['Payment_Difference'] = df['Current_Payment'] - df['Previous_Payment']
# Justification: This feature might indicate the change in payment behavior

In [None]:
# 4. Identify the Target Variable
# Name of the df column used as the label in the cells below
target = 'Current_Payment'
print(f'Target variable selected for predicting customer payment: {target}')

In [None]:
# 5. Train-Test Split
# Payment_Difference is defined as Current_Payment - Previous_Payment, i.e. it
# is derived from the target. Leaving it in X lets a model recover the label
# directly, so it is excluded from the feature matrix.
# errors='ignore' keeps the cell runnable if that column was never created.
leaky_features = ['Payment_Difference']
X = df.drop(columns=[target] + leaky_features, errors='ignore')
y = df[target]
# 80/20 split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

# Uses X_train, X_test, y_train and y_test from the train-test split cell.

# Show which class labels are present on each side of the split
print("Unique classes in y_train:", sorted(pd.unique(y_train)))
print("Unique classes in y_test:", sorted(pd.unique(y_test)))

# Map the class labels to consecutive integers starting at 0; the mapping is
# learned from the training labels and then applied to both splits
label_encoder = LabelEncoder().fit(y_train)
y_train_encoded = label_encoder.transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Train the gradient-boosted classifier on the encoded labels
xgb_model = XGBClassifier(random_state=42).fit(X_train, y_train_encoded)

# Accuracy on the training and held-out splits
xgb_train_accuracy = xgb_model.score(X_train, y_train_encoded)
xgb_test_accuracy = xgb_model.score(X_test, y_test_encoded)

print(f"XGBoost Train Accuracy: {xgb_train_accuracy:.4f}")
print(f"XGBoost Test Accuracy: {xgb_test_accuracy:.4f}")

In [None]:
# 7. Model Evaluation
# The earlier comparison (xgb_test_accuracy > xgb_test_accuracy) was always
# False, so rf_model was always picked -- and that pipeline was fitted on a
# differently shaped feature frame, so it cannot score X_test. A random forest
# is therefore trained here on the same split and encoded labels as XGBoost,
# which makes the two test accuracies comparable.
rf_candidate = RandomForestClassifier(random_state=42)
rf_candidate.fit(X_train, y_train_encoded)
rf_test_accuracy = rf_candidate.score(X_test, y_test_encoded)
print(f"Random Forest Test Accuracy: {rf_test_accuracy:.4f}")

best_model = xgb_model if xgb_test_accuracy > rf_test_accuracy else rf_candidate

# Both candidates predict encoded class ids; map them back to the original
# labels so they can be compared against y_test.
y_pred = label_encoder.inverse_transform(best_model.predict(X_test))
print(classification_report(y_test, y_pred))

# Confusion Matrix (rows = actual, columns = predicted, in encoder class order)
class_labels = label_encoder.classes_
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_labels, yticklabels=class_labels)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()