In [4]:
from datetime import date, timedelta
import pandas as pd
import numpy as np
import re
import json
import string

In [7]:
# Load the SMS dataset from an Excel workbook and preview the first rows.
# NOTE(review): reading .xlsx needs the optional 'openpyxl' package; the recorded
# run of this cell failed with ImportError because it was not installed.
# NOTE(review): '/content/sms.xlsx' is an absolute path (presumably a Colab
# upload location) — confirm before running elsewhere.
sms_data=pd.read_excel('/content/sms.xlsx')
sms_data.head()

ImportError: Missing optional dependency 'openpyxl'.  Use pip or conda to install openpyxl.

In [None]:
# Remove the sender column; only the message text is used from here on.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: running it a second time no longer raises KeyError because
# 'sms_sender' has already been dropped.
sms_data = sms_data.drop(columns=['sms_sender'], errors='ignore')
sms_data.head(2)

SMS Cleaning Process

In [None]:
def clean_date(sms):
    """Mask date-like and time-like substrings in an SMS body.

    A date-like token (1-4 digits, optional separators, 1-3 alphanumerics,
    optional separators, 1-4 digits) is replaced by the placeholder
    ``DDMMYYYY``; a clock time such as ``10:30:45`` is replaced by
    ``HH:MM:SSS``.

    Token boundaries are checked with ``^`` / ``$`` and a lookahead instead of
    consuming whitespace on both sides, so a date at the very start or end of
    the message, and the second of two dates separated by one space, are
    masked as well.

    Args:
        sms: The message text (``str``).

    Returns:
        The text with date and time substrings replaced by placeholders.
    """
    # Leading whitespace is consumed and re-emitted as a single space. The
    # trailing boundary is only peeked at, so the following token can still
    # use that whitespace as its own leading boundary.
    # NOTE(review): the date pattern is loose and also matches other
    # digit/short-word/digit sequences (e.g. "100 or 200") — confirm intended.
    cleaned = re.sub(
        r'(?:^|\s+)([0-9]{1,4}[-.\s]*[A-Za-z0-9]{1,3}[-.\s]*[0-9]{1,4})(?=\s|$)',
        ' DDMMYYYY',
        sms,
    )
    # Times: two digits, optional colon(s), two digits, a colon, 1-3 digits.
    cleaned = re.sub(r'([0-9]{2}[:]*[0-9]{2}:[0-9]{1,3})', 'HH:MM:SSS', cleaned)
    return cleaned

In [None]:
def clean_punc(sms):
    """Strip or neutralise punctuation in an SMS body.

    The characters ``? ! ' " # |`` and ``( ) /`` are deleted, then every
    ``-`` and ``:`` is turned into a single space. A backslash is not part of
    any of the character classes and is therefore left in place.

    Args:
        sms: The message text (``str``).

    Returns:
        The text after the three substitutions have been applied in order.
    """
    # (pattern, replacement) pairs, applied top to bottom.
    substitutions = (
        (r'[?|!|\'|"|#]', r''),
        (r'[)|(|\|/]', r''),
        (r'[-:]', r' '),
    )
    for pattern, replacement in substitutions:
        sms = re.sub(pattern, replacement, sms)
    return sms

In [None]:
def clean_numeric_or_name(sms):
    """Replace numbers and currency amounts with the placeholder ``XXXXXX``.

    Three substitutions are applied in order:

    1. Any whitespace-delimited token made only of digits, ``x``/``X``, ``.``
       and ``,`` becomes ``XXXXXX``.
    2. An amount introduced by ``Rs``, ``Rs.``, ``INR`` or ``I@NR`` becomes
       ``XXXXXX`` (the currency marker is removed together with the amount).
    3. Every remaining ``,``, ``|`` and ``.`` becomes a space.

    Step 1 checks the token boundaries with ``^`` / ``$`` and a lookahead
    instead of consuming whitespace on both sides, so that runs of numbers
    ("1 2 3") and numbers at the very start or end of the message are all
    masked with the same placeholder rather than only every other one.

    Args:
        sms: The message text (``str``).

    Returns:
        The text with numeric content masked and ``, | .`` replaced by spaces.
    """
    # Leading whitespace is consumed and re-emitted as one space; the trailing
    # boundary is only peeked at so the next token can reuse it.
    cleaned = re.sub(r'(?:^|\s+)[0-9xX.,]+(?=\s|$)', ' XXXXXX', sms)
    # NOTE(review): the bare '.' characters inside this amount pattern are
    # unescaped and therefore match any character — confirm this is intended.
    cleaned = re.sub(
        r'(?:Rs\.?|INR|I@NR)(?:\s*\.*)([-+]?(?:(?:.[0-9\s,，]+(?:.[0-9]+)?)|(?:.[0-9.]+)))',
        r' XXXXXX ',
        cleaned,
    )
    # Inside a character class '|' is a literal pipe, so pipes are replaced too.
    cleaned = re.sub(r'[,|.]', ' ', cleaned)
    return cleaned

In [None]:
def extra_cleaning(sms):
    """Mask any remaining digits and e-mail-like tokens.

    Every digit becomes ``X``; afterwards each ``something@something`` token
    (word characters, dots and hyphens on both sides of ``@``) collapses to a
    single ``X``.

    Args:
        sms: The message text (``str``).

    Returns:
        The masked text.
    """
    digits_masked = re.compile(r'[0-9]').sub('X', sms)
    return re.compile(r'([\w.-]+[@][\w.-]+)').sub('X', digits_masked)

In [None]:
def clean_sms(sms):
    """Run the complete SMS cleaning pipeline on one message.

    The stages run in a fixed order: ``clean_date``, ``clean_punc``,
    ``clean_numeric_or_name`` and finally ``extra_cleaning``; each stage
    receives the output of the previous one.

    NOTE(review): by the time ``extra_cleaning`` looks for e-mail addresses,
    ``clean_punc`` has already replaced ``-`` and ``clean_numeric_or_name``
    has replaced ``.`` with spaces, so only the fragment around ``@`` is
    masked — confirm whether that is intended.

    Args:
        sms: The raw message text (``str``).

    Returns:
        The fully cleaned message text.
    """
    pipeline = (clean_date, clean_punc, clean_numeric_or_name, extra_cleaning)
    cleaned = sms
    for step in pipeline:
        cleaned = step(cleaned)
    return cleaned

In [None]:
# Force every message body to str so the regex-based cleaners can process it.
# NOTE(review): astype(str) turns missing values into the literal text 'nan',
# which would then flow into the vocabulary — confirm the column has no NaNs.
sms_data['sms_body']=sms_data['sms_body'].astype(str)

In [None]:
# Clean every message; the function is passed directly, no lambda wrapper needed.
sms_data['clean_smsbody'] = sms_data['sms_body'].apply(clean_sms)

In [None]:
# Preview the first two cleaned messages as a sanity check.
sms_data['clean_smsbody'].head(2)

Bag of Words for feature extraction

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer, created with its default settings.
count_vect = CountVectorizer()

In [None]:
# Fit the vectorizer on the cleaned messages to build the vocabulary.
bow = count_vect.fit(sms_data['clean_smsbody'])

In [None]:
# Encode the same cleaned messages as a document-term count matrix.
bow_vector = bow.transform(sms_data['clean_smsbody'])

In [None]:
# int32 copy of the count matrix. In the cells visible here it is only used for
# the dense preview that follows; clustering is run on bow_vector itself.
bow_vector_int32 = bow_vector.astype(np.int32)

In [None]:
# Inspect the counts as a dense array.
# NOTE(review): this materialises the whole matrix in memory just for display;
# consider slicing a few rows first if the corpus is large.
bow_vector_int32.toarray()

In [None]:
# Number of distinct tokens in the fitted vocabulary.
len(bow.vocabulary_)

In [None]:
# Peek at a few vocabulary entries.
# NOTE(review): the slice [1:10] skips the first entry and shows nine names —
# confirm that [:10] was not intended.
bow.get_feature_names_out()[1:10]

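K-means clustering with k-means++ initialization for a better starting point (note: the intended "use all processes" setting, n_jobs = -1, is not actually passed to KMeans in the code below)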

In [None]:
from sklearn.cluster import KMeans
from tqdm import tqdm

In [None]:
# Ten clusters, k-means++ seeding, fixed seed so the labels are reproducible.
model = KMeans(n_clusters = 10, init = 'k-means++', random_state=99)

# model.fit() is a single blocking call, so the bar has exactly one step.
# The previous total (the number of non-zero matrix entries) could never be
# reached by the single update(1), leaving the bar stuck near 0%.
with tqdm(total=1) as pbar:
    model.fit(bow_vector)
    pbar.update(1)

In [None]:
# The model was already fitted in the previous cell on the same matrix with a
# fixed random_state, so fitting again would only repeat that work. Display the
# fitted estimator instead (the same object that fit() would return).
model

In [None]:
# Cluster index assigned by KMeans to each row of bow_vector.
labels = model.labels_

In [None]:
# Attach the KMeans cluster index to each message.
sms_data['labels'] = model.labels_

In [None]:
# Save the frame (including the 'labels' column added above) to the working directory.
sms_data.to_excel('./sms_data.xlsx')

In [None]:
from sklearn import metrics
# Silhouette score of the KMeans labelling on the bag-of-words matrix,
# using Euclidean distance.
# NOTE(review): the '_tf' suffix suggests TF-IDF features, but the input here is
# the raw count matrix — confirm the name.
silhouetter_score_tf = metrics.silhouette_score(bow_vector, labels, metric='euclidean')
silhouetter_score_tf

Choose number of clusters

In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster

In [None]:
# Hierarchical (Ward, Euclidean) linkage on the densified count matrix.
# NOTE(review): toarray() builds the full dense matrix in memory.
linkage_data = linkage(bow_vector.toarray(), method='ward', metric='euclidean')

In [None]:
# Draw the dendrogram for the linkage computed above; the return value is kept in dend.
dend = dendrogram(linkage_data)

In [None]:
# Cut the hierarchy into flat clusters.
# NOTE(review): with criterion='distance' the value 10 acts as a distance
# threshold, not a cluster count ('maxclust' would request a fixed number of
# clusters) — confirm which was intended given the 10 clusters used for KMeans.
clusters = fcluster(linkage_data, 10, criterion='distance')

In [None]:
# Attach the hierarchical cluster id to each message.
sms_data['sms_cluster'] = clusters

In [None]:
# (number of labelled messages, number of distinct hierarchical clusters)
len(clusters), pd.Series(clusters).nunique()

In [None]:
# Export raw text, cleaned text and hierarchical cluster id to the working directory.
sms_data[['sms_body','clean_smsbody','sms_cluster']].to_excel('./sms_clusters.xlsx')