In [3]:
from datetime import date, timedelta
import pandas as pd
import numpy as np
import boto3
import re
import json
import string
import pysftp
from botocore.exceptions import ClientError
cnopts = pysftp.CnOpts()
cnopts.hostkeys = None

today = date.today()
yesterday = today - timedelta(days=1)
start_date = str(yesterday)
from sqlalchemy import create_engine

In [4]:
def get_aws_creds(database):
    """Get Redshift credentials from AWS Systems Manager (SSM)"""
    ssm_keys = ['prod_redshift_host', 'prod_redshift_ro_password', 'prod_redshift_port', 'prod_redshift_ro_user',
                'lending_readonly_host', 'lending_readonly_pass', 'lending_readonly_port', 'lending_readonly_user',
                database]
    ssm_parameters = boto3.client('ssm', region_name='ap-south-1').get_parameters(Names=ssm_keys, WithDecryption=True)['Parameters']
    ssm_secrets = dict()
    for ssm_parameter in ssm_parameters:
        ssm_secrets[ssm_parameter['Name']] = ssm_parameter['Value']
    return ssm_secrets
def get_engine(database):
    ssm_secrets = get_aws_creds(database)
    if database == 'prod_redshift_name':
        username = ssm_secrets.get('prod_redshift_ro_user')
        password = ssm_secrets.get('prod_redshift_ro_password')
        hostname = ssm_secrets.get('prod_redshift_host')
        port_number = ssm_secrets.get('prod_redshift_port')
        databasename = ssm_secrets.get(database)
        engine_link = 'postgresql+psycopg2://{}:{}@{}:{}/{}'.format(username, password, hostname, port_number, databasename)
        engine = create_engine(engine_link)
    elif database == 'lending_readonly_dbname':
        username = ssm_secrets.get('lending_readonly_user')
        password = ssm_secrets.get('lending_readonly_pass')
        hostname = ssm_secrets.get('lending_readonly_host')
        port_number = ssm_secrets.get('lending_readonly_port')
        databasename = ssm_secrets.get(database)
        engine_link = 'postgresql+psycopg2://{}:{}@{}:{}/{}'.format(username, password, hostname, port_number, databasename)
        engine = create_engine(engine_link)
    elif database == 'UNIFIED_READ_ONLY_DB':
        username = ssm_secrets.get('UNIFIED_READ_ONLY_USER')
        password = ssm_secrets.get('UNIFIED_READ_ONLY_PASSWORD')
        hostname = ssm_secrets.get('UNIFIED_READ_ONLY_HOST')
        port_number = ssm_secrets.get('UNIFIED_READ_ONLY_PORT')
        databasename = ssm_secrets.get(database)
        engine_link = 'postgresql+psycopg2://{}:{}@{}:{}/{}'.format(username, password, hostname, port_number, databasename)
        engine = create_engine(engine_link)
    return engine


def get_engine_unified(database):
    ssm_keys = ['UNIFIED_READ_ONLY_USER','UNIFIED_READ_ONLY_PASSWORD','UNIFIED_READ_ONLY_HOST','UNIFIED_READ_ONLY_PORT',
                database]
    ssm_parameters = boto3.client('ssm', region_name='ap-south-1').get_parameters(Names=ssm_keys, WithDecryption=True)['Parameters']
    ssm_secrets = dict()
    for ssm_parameter in ssm_parameters:
        ssm_secrets[ssm_parameter['Name']] = ssm_parameter['Value']
    username = ssm_secrets.get('UNIFIED_READ_ONLY_USER')
    password = ssm_secrets.get('UNIFIED_READ_ONLY_PASSWORD')
    hostname = ssm_secrets.get('UNIFIED_READ_ONLY_HOST')
    port_number = ssm_secrets.get('UNIFIED_READ_ONLY_PORT')
    databasename = ssm_secrets.get(database)
    engine_link = 'postgresql+psycopg2://{}:{}@{}:{}/{}'.format(username, password, hostname, port_number, databasename)
    engine = create_engine(engine_link)
    return engine

def get_engine_banking(database):
    ssm_keys = ['PORTAL_READONLY_USER','PORTAL_READONLY_HOST','PORTAL_READONLY_PASSWORD']
    ssm_parameters = boto3.client('ssm', region_name='ap-south-1').get_parameters(Names=ssm_keys, WithDecryption=True)['Parameters']
    ssm_secrets = dict()
    for ssm_parameter in ssm_parameters:
        ssm_secrets[ssm_parameter['Name']] = ssm_parameter['Value']
    username = ssm_secrets.get('PORTAL_READONLY_USER')
    password = ssm_secrets.get('PORTAL_READONLY_PASSWORD')
    hostname = ssm_secrets.get('PORTAL_READONLY_HOST')
    databasename = database
    engine_link = 'postgresql+psycopg2://{}:{}@{}/{}'.format(username, password, hostname, databasename)
    engine = create_engine(engine_link)
    return engine


engine_dw2 = get_engine('prod_redshift_name')
engine_lending = get_engine('lending_readonly_dbname')
engine_unified = get_engine_unified('UNIFIED_READ_ONLY_DB')
engine_bank_connect = get_engine_banking('finbox_dashboard')


ClientError: An error occurred (AccessDeniedException) when calling the GetParameters operation: User: arn:aws:iam::661900564209:user/srishti is not authorized to perform: ssm:GetParameters on resource: arn:aws:ssm:ap-south-1:661900564209:parameter/prod_redshift_host because no identity-based policy allows the ssm:GetParameters action

#### Load SMS

In [97]:
sms_data = pd.read_sql(''' select sms_body from alternate_pipeline_classifier_with_sms_prod_v3
where is_extracted_sms=false and predictions ilike '%%debit%%'
and sms_sender='sbiinb' and date(sms_inbox_timestamp) = current_date-10
and date(created_at) = current_date ''', engine_dw2)

In [98]:
sms_data.shape

(0, 1)

In [99]:
sms_data.head(2)

Unnamed: 0,sms_body


#### SMS Cleaning process

In [100]:
def clean_date(sms):
    cleaned = re.sub(r'\s+([0-9]{1,4}[-.\s]*[A-Za-z0-9]{1,3}[-.\s]*[0-9]{1,4})\s+', ' DDMMYYYY ', sms)
    cleaned = re.sub(r'([0-9]{2}[:]*[0-9]{2}:[0-9]{1,3})', 'HH:MM:SSS', cleaned)
    return cleaned

In [101]:
def clean_punc(sms):
    cleaned = re.sub(r'[?|!|\'|"|#]', r'',sms)
    cleaned = re.sub(r'[)|(|\|/]', r'',cleaned)
    cleaned = re.sub(r'[-:]', r' ',cleaned)
    return cleaned

In [102]:
def clean_numeric_or_name(sms):
    cleaned = re.sub(r'\s+[0-9xX.,]+\s+', ' XXXXXX ',sms)
    cleaned = re.sub(r'(?:Rs\.?|INR|I@NR)(?:\s*\.*)([-+]?(?:(?:.[0-9\s,，]+(?:.[0-9]+)?)|(?:.[0-9.]+)))',r' XXXXXX ',cleaned)
    cleaned = re.sub(r'[,|.]', ' ',cleaned)
    return cleaned

In [103]:
def extra_cleaning(sms): 
    cleaned = re.sub(r'[0-9]', 'X',sms)
    cleaned = re.sub(r'([\w.-]+[@][\w.-]+)','X', cleaned)
    return cleaned

In [104]:
def clean_sms(sms):
    for process in [clean_date, clean_punc,clean_numeric_or_name,extra_cleaning]:
        sms = process(sms)
    return sms

In [105]:
sms_data['clean_smsbody']=sms_data['sms_body'].apply(lambda sms:clean_sms(sms))

#### Using Bag Of Words for text feature extraction

In [106]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()


In [107]:
bow = count_vect.fit(sms_data['clean_smsbody'])

ValueError: empty vocabulary; perhaps the documents only contain stop words

In [108]:
bow_vector = bow.transform(sms_data['clean_smsbody'])

In [73]:
bow_vector.toarray()

array([[2, 1, 1, ..., 1, 1, 3],
       [2, 1, 1, ..., 1, 1, 3],
       [2, 1, 1, ..., 1, 1, 3],
       ...,
       [2, 1, 1, ..., 1, 1, 3],
       [2, 1, 1, ..., 1, 1, 3],
       [2, 1, 1, ..., 1, 1, 3]])

In [74]:
len(bow.vocabulary_)

25

In [75]:
len(bow.vocabulary_)

25

In [76]:
bow.get_feature_names()[1:10]

['balance',
 'bank',
 'bfil',
 'customer',
 'ddmmyyyy',
 'dear',
 'debit',
 'due',
 'ensure']

### Using all processes jobs = -1 and k means++ for starting initilization advantage

In [77]:
from sklearn.cluster import KMeans

In [78]:
model = KMeans(n_clusters = 10, init = 'k-means++', n_jobs=-1, random_state=99)

In [79]:
model.fit(bow_vector)

  """Entry point for launching an IPython kernel.


KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=10, n_init=10, n_jobs=-1, precompute_distances='auto',
       random_state=99, tol=0.0001, verbose=0)

In [80]:
labels = model.labels_

In [81]:
sms_data['labels'] = model.labels_

In [82]:
sms_data.to_excel('./sms_data.xlsx')

In [60]:
from sklearn import metrics
silhouetter_score_tf = metrics.silhouette_score(bow_vector, labels, metric='euclidean')

In [97]:
silhouetter_score_tf

0.5197965005655522

### Choose number of clusters

In [31]:
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster

In [None]:
linkage_data = linkage(bow_vector.toarray(), method='ward', metric='euclidean')

In [None]:
dend = dendrogram(linkage_data)

In [None]:
clusters = fcluster(linkage_data, 10, criterion='distance')

In [None]:
sms_data['sms_cluster'] = clusters

In [None]:
len(clusters), pd.Series(clusters).nunique()

In [None]:
sms_data[['sms_body','clean_smsbody','sms_cluster']].to_excel('./sms_clusters.xlsx')