In [29]:
import pandas as pd
from datetime import timedelta
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# Load your dataset
df = pd.read_csv('C:\\Users\\Hiwi\\Documents\\week5\\data.csv')
df['TransactionStartTime'] = pd.to_datetime(df['TransactionStartTime'])

# Set snapshot date to one day after the most recent transaction
snapshot_date = df['TransactionStartTime'].max() + timedelta(days=1)


In [30]:
rfm = df.groupby('CustomerId').agg({
    'TransactionStartTime': lambda x: (snapshot_date - x.max()).days,  
    'TransactionId': 'count',                                          
    'Amount': 'sum'                                                    
}).rename(columns={
    'TransactionStartTime': 'Recency',
    'TransactionId': 'Frequency',
    'Amount': 'Monetary'
}).reset_index()


In [31]:
scaler = StandardScaler()
rfm_scaled = scaler.fit_transform(rfm[['Recency', 'Frequency', 'Monetary']])


In [32]:
kmeans = KMeans(n_clusters=3, random_state=42)
rfm['Cluster'] = kmeans.fit_predict(rfm_scaled)


  super()._check_params_vs_input(X, default_n_init=10)


In [33]:
# See cluster profiles
print(rfm.groupby('Cluster')[['Recency', 'Frequency', 'Monetary']].mean())

# Choose the cluster with highest Recency and lowest Frequency/Monetary as high-risk
high_risk_cluster = rfm.groupby('Cluster')[['Recency', 'Frequency', 'Monetary']].mean().idxmin()['Frequency']
rfm['is_high_risk'] = (rfm['Cluster'] == high_risk_cluster).astype(int)


           Recency    Frequency      Monetary
Cluster                                      
0        61.877279     7.720196  8.172068e+04
1        12.726566    34.800000  2.725741e+05
2        29.000000  4091.000000 -1.049000e+08


In [34]:
from src.features.proxy_target import engineer_proxy_target

df_with_target = engineer_proxy_target(df)

# Save if needed
df_with_target.to_csv('data_with_proxy_target.csv', index=False)

  super()._check_params_vs_input(X, default_n_init=10)


In [35]:
df_with_target = engineer_proxy_target(df)

  super()._check_params_vs_input(X, default_n_init=10)


In [38]:
df.columns

Index(['TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId',
       'CurrencyCode', 'CountryCode', 'ProviderId', 'ProductId',
       'ProductCategory', 'ChannelId', 'Amount', 'Value',
       'TransactionStartTime', 'PricingStrategy', 'FraudResult'],
      dtype='object')

In [47]:
# Drop existing 'is_high_risk' column to avoid suffix conflict
if 'is_high_risk' in df.columns:
    df = df.drop(columns=['is_high_risk'])

# Merge risk labels back
df = df.merge(rfm[['CustomerId', 'is_high_risk']], on='CustomerId', how='left')

# Replace any missing values (in case of customers not in RFM) with 0
df['is_high_risk'] = df['is_high_risk'].fillna(0).astype(int)


In [48]:
df


Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy,FraudResult,is_high_risk_x,is_high_risk_y,is_high_risk
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,UGX,256,ProviderId_6,ProductId_10,airtime,ChannelId_3,1000.0,1000,2018-11-15 02:18:49+00:00,2,0,0,0,0
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-20.0,20,2018-11-15 02:19:08+00:00,2,0,0,0,0
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,UGX,256,ProviderId_6,ProductId_1,airtime,ChannelId_3,500.0,500,2018-11-15 02:44:21+00:00,2,0,1,1,1
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,UGX,256,ProviderId_1,ProductId_21,utility_bill,ChannelId_3,20000.0,21800,2018-11-15 03:32:55+00:00,2,0,0,0,0
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-644.0,644,2018-11-15 03:34:21+00:00,2,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95657,TransactionId_89881,BatchId_96668,AccountId_4841,SubscriptionId_3829,CustomerId_3078,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-1000.0,1000,2019-02-13 09:54:09+00:00,2,0,0,0,0
95658,TransactionId_91597,BatchId_3503,AccountId_3439,SubscriptionId_2643,CustomerId_3874,UGX,256,ProviderId_6,ProductId_10,airtime,ChannelId_3,1000.0,1000,2019-02-13 09:54:25+00:00,2,0,0,0,0
95659,TransactionId_82501,BatchId_118602,AccountId_4841,SubscriptionId_3829,CustomerId_3874,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-20.0,20,2019-02-13 09:54:35+00:00,2,0,0,0,0
95660,TransactionId_136354,BatchId_70924,AccountId_1346,SubscriptionId_652,CustomerId_1709,UGX,256,ProviderId_6,ProductId_19,tv,ChannelId_3,3000.0,3000,2019-02-13 10:01:10+00:00,2,0,0,0,0


In [125]:
# List your dependencies here
requirements = [
    "mlflow",
    "pytest",
    "scikit-learn",
    "fastapi",
    "uvicorn",
    "pydantic",
    "flake8"


    # add any other packages you need
]

# Write to requirements.txt
with open("requirements.txt", "w") as f:
    for package in requirements:
        f.write(package + "\n")

print("requirements.txt created with the following packages:")
print("\n".join(requirements))


requirements.txt created with the following packages:
mlflow
pytest
scikit-learn
fastapi
uvicorn
pydantic
flake8


In [126]:
!git -C "C:\\Users\\Hiwi\\Documents\\week5" init

Reinitialized existing Git repository in C:/Users/Hiwi/Documents/week5/.git/


In [127]:
!git -C "C:/Users/Hiwi/src" add .
!git commit -m "Initial commit: feature engineering and model training"



On branch main
Your branch is ahead of 'origin/main' by 1 commit.
  (use "git push" to publish your local commits)

Changes not staged for commit:
  (use "git add/rm <file>..." to update what will be committed)
  (use "git restore <file>..." to discard changes in working directory)
	modified:   Documents/week5/CreditScoring.ipynb
	deleted:    src/__init__.py
	deleted:    src/feature_engineering.py
	deleted:    src/features/__pycache__/proxy_target.cpython-311.pyc
	deleted:    src/features/proxy_target.py
	deleted:    src/main.py
	deleted:    src/pipelines.py
	deleted:    src/requirements.txt
	deleted:    src/woe_iv.py

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	.VirtualBox/
	.anaconda/
	.android/
	.bash_history
	.cache/
	.conda/
	.condarc
	.continuum/
	.ganttproject
	.gitconfig
	.gradio/
	.idlerc/
	.ipynb_checkpoints/
	.ipython/
	.jupyter/
	.keras/
	.kivy/
	.lesshst
	.matplotlib/
	.nbi/
	.python_history
	.spyder-py3/
	.streamlit/
	.vscode/
	20190

In [128]:
#!git remote add origin https://github.com/HiwotWonago/credit-scoring-system.git
#!git branch -M main
#!git push -u origin main

In [107]:
CATEGORICAL_COLS = [
    'CurrencyCode', 'ProviderId', 'ProductId', 'ProductCategory', 'ChannelId', 'PricingStrategy'
]
df_encoded = pd.get_dummies(df, columns=CATEGORICAL_COLS, drop_first=True)

In [108]:
datetime_cols = X_train.select_dtypes(include='datetime').columns.tolist()
print("Datetime columns:", datetime_cols)

Datetime columns: []


In [109]:
for col in ['TransactionStartTime', 'CreatedAt']:  # Adjust to your actual datetime columns
    if col in X.columns:
        X[col] = pd.to_datetime(X[col])
        X[f"{col}_hour"] = X[col].dt.hour
        X[f"{col}_dayofweek"] = X[col].dt.dayofweek
        X = X.drop(columns=[col])


In [110]:
X_train = X_train.drop(columns=datetime_cols)
X_test = X_test.drop(columns=datetime_cols)


In [111]:
for col in datetime_cols:
    X_train[f"{col}_hour"] = X_train[col].dt.hour
    X_train[f"{col}_dayofweek"] = X_train[col].dt.dayofweek
    X_test[f"{col}_hour"] = X_test[col].dt.hour
    X_test[f"{col}_dayofweek"] = X_test[col].dt.dayofweek

X_train = X_train.drop(columns=datetime_cols)
X_test = X_test.drop(columns=datetime_cols)


In [112]:
X = df_encoded.drop(columns=['is_high_risk'])  # Features
y = df_encoded['is_high_risk']                # Target

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [113]:
import mlflow
mlflow.set_experiment("credit_scoring_model_training")


<Experiment: artifact_location='file:///C:/Users/Hiwi/mlruns/171955629711195000', creation_time=1751403466326, experiment_id='171955629711195000', last_update_time=1751403466326, lifecycle_stage='active', name='credit_scoring_model_training', tags={}>

In [152]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

def evaluate_and_log_metrics(model, X_test, y_test):
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]  # assumes binary classifier

    metrics = {
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred, zero_division=0),
        "recall": recall_score(y_test, y_pred),
        "f1_score": f1_score(y_test, y_pred),
        "roc_auc": roc_auc_score(y_test, y_proba),
    }

    for key, value in metrics.items():
        mlflow.log_metric(key, value)
    # Print metrics to console
    print("Evaluation Metrics:")
    for k, v in metrics.items():
        print(f"{k}: {v:.4f}")    
    return metrics


In [118]:
from sklearn.linear_model import LogisticRegression

with mlflow.start_run(run_name="Logistic Regression"):
    model = LogisticRegression(max_iter=1000, random_state=42)
    model.fit(X_train, y_train)

    mlflow.log_param("model_type", "LogisticRegression")
    mlflow.log_param("max_iter", 1000)

    evaluate_and_log_metrics(model, X_test, y_test)
    mlflow.sklearn.log_model(model, "model")




In [119]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

param_grid = {
    "n_estimators": [100, 200],
    "max_depth": [None, 10, 20],
    "min_samples_split": [2, 5],
}

grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    scoring="roc_auc",
    cv=3,
    n_jobs=-1
)

grid_search.fit(X_train, y_train)

best_rf = grid_search.best_estimator_
best_params = grid_search.best_params_


In [153]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

def evaluate_and_log_metrics(model, X_test, y_test):
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]

    metrics = {
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred),
        "recall": recall_score(y_test, y_pred),
        "f1_score": f1_score(y_test, y_pred),
        "roc_auc": roc_auc_score(y_test, y_proba),
    }

    # Log to MLflow
    for key, value in metrics.items():
        mlflow.log_metric(key, value)

    return metrics


In [154]:
with mlflow.start_run(run_name="Random Forest (Grid Search)"):
    # Log best parameters
    for param, val in best_params.items():
        mlflow.log_param(param, val)

    mlflow.log_param("model_type", "RandomForestClassifier")

    # Evaluate
    evaluate_and_log_metrics(best_rf, X_test, y_test)

    # Log model
    mlflow.sklearn.log_model(best_rf, "model")




In [122]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Initialize models
logreg = LogisticRegression(max_iter=1000, random_state=42)
rf = RandomForestClassifier(random_state=42)

# Train
logreg.fit(X_train, y_train)
rf.fit(X_train, y_train)


In [123]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Random Forest base model
rf = RandomForestClassifier(random_state=42)

# Grid Search: exhaustive
grid_params = {
    'n_estimators': [50, 100],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5]
}

# Random Search: quick and random
random_params = {
    'n_estimators': [10, 50, 100, 150],
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 4, 6],
    'max_features': ['auto', 'sqrt']
}


In [124]:
grid_search = GridSearchCV(estimator=rf, param_grid=grid_params, 
                           cv=5, scoring='f1', n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
best_params = grid_search.best_params_


Fitting 5 folds for each of 12 candidates, totalling 60 fits


In [132]:
!pip install fastapi uvicorn mlflow flake8 pytest



In [131]:
!pip install fastapi uvicorn
!docker build -t credit-scoring-api .



'docker' is not recognized as an internal or external command,
operable program or batch file.


In [144]:
import os

# Absolute path to where you want the .github/workflows/ci.yml file
base_path = r"C:\Users\Hiwi\Documents\week5\credit-scoring-system\src"

# Create the .github/workflows subdirectory inside that path
workflow_dir = os.path.join(base_path, ".github", "workflows")
os.makedirs(workflow_dir, exist_ok=True)

# Write the ci.yml file inside that directory
workflow_file = os.path.join(workflow_dir, "ci.yml")
with open(workflow_file, "w") as f:
    f.write("""\
name: CI

on:
  push:
    branches: [main]

jobs:
  lint-and-test:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v3

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: "3.9"

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt

      - name: Run linter (flake8)
        run: flake8 src tests

      - name: Run unit tests (pytest)
        run: pytest tests
""")


In [145]:
!git remote add origin https://github.com/HiwotWonago/credit-scoring-system.git
!git branch -M main
!git push -u origin main


error: remote origin already exists.


branch 'main' set up to track 'origin/main'.


To https://github.com/HiwotWonago/solar-challenge-week1.git
   be43c45..f262b83  main -> main


In [148]:
!git add
!git commit -m "Add CI workflow with flake8 and pytest"


Nothing specified, nothing added.
hint: Maybe you wanted to say 'git add .'?
hint: Disable this message with "git config set advice.addEmptyPathspec false"


On branch main
Your branch is up to date with 'origin/main'.

Changes not staged for commit:
  (use "git add/rm <file>..." to update what will be committed)
  (use "git restore <file>..." to discard changes in working directory)
	modified:   Documents/week5/CreditScoring.ipynb
	deleted:    src/__init__.py
	deleted:    src/feature_engineering.py
	deleted:    src/features/__pycache__/proxy_target.cpython-311.pyc
	deleted:    src/features/proxy_target.py
	deleted:    src/main.py
	deleted:    src/pipelines.py
	deleted:    src/requirements.txt
	deleted:    src/woe_iv.py

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	.VirtualBox/
	.anaconda/
	.android/
	.bash_history
	.cache/
	.conda/
	.condarc
	.continuum/
	.ganttproject
	.gitconfig
	.gradio/
	.idlerc/
	.ipynb_checkpoints/
	.ipython/
	.jupyter/
	.keras/
	.kivy/
	.lesshst
	.matplotlib/
	.nbi/
	.python_history
	.spyder-py3/
	.streamlit/
	.vscode/
	20190314_153155.jpg
	20190314_153159.jpg
	20190314_153237.j

