In [1]:
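### Uncomment to install each dependency ###
# %pip install pandas
# %pip install openpyxl  # needed by pandas to read/write .xlsx files
# %pip install DateTime  # NOTE: not required for `from datetime import datetime` (standard library)
# %pip install --user -U nltk
# %pip install -U scikit-learn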

In [None]:
import pandas as pd
from datetime import datetime

import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.cluster import DBSCAN
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

**data_processor** This function processes several columns of the dataframe to be used in the main function.<br>
  
operations:  
- date processing: converts all date columns to datetime objects so they can be compared<br>
- invoice description: tokenizes each invoice description, removes English stop words, and stems the remaining tokens to allow similarity checks <br>
- cluster creation: creates a new `cluster` column that indexes the clusters grouping similar invoices <br>
  
<!-- authors: *Michael Ye* -->

In [2]:
# Load the invoice workbook and discard the 'RISK RATING' column in one
# expression, so `data` is bound exactly once to the frame used downstream.
data = pd.read_excel('Final Dataset v12.xlsx').drop(columns='RISK RATING')

def data_processor():
    """Prepare the module-level ``data`` frame and export it to Excel.

    Steps, in order:

    1. Convert the 'Payment Date', 'Invoice Date', 'BOS Date', 'Void Date'
       and 'PO Date' columns with ``pd.to_datetime``.
    2. Cast 'Invoice Description' to ``str`` and store a tokenized,
       stop-word-free, Porter-stemmed version of it in
       'Processed Invoice Description'.
    3. Build a TF-IDF matrix from the processed descriptions and cluster it
       with DBSCAN (cosine metric, ``eps=0.2``, ``min_samples=2``); the
       labels are written to the 'cluster' column (DBSCAN uses -1 for
       samples it treats as noise).
    4. Write the resulting frame to 'Processed_Data.xlsx' without the index.

    Side effects:
        Mutates the global ``data`` DataFrame in place, writes
        'Processed_Data.xlsx' to the working directory, and may download
        the NLTK stop-word / tokenizer resources if they are not installed.

    Returns:
        None
    """
    # Every date column gets the same conversion so later comparisons work
    # on datetime values rather than on raw cell contents.
    date_columns = ['Payment Date', 'Invoice Date', 'BOS Date', 'Void Date', 'PO Date']
    for column in date_columns:
        data[column] = pd.to_datetime(data[column])

    # NOTE(review): on classic object-dtype columns astype(str) turns missing
    # values into the literal text 'nan', so rows without a description would
    # look identical to each other during clustering -- confirm this is intended.
    data['Invoice Description'] = data['Invoice Description'].astype(str)

    # creating clusters
    stemmer = PorterStemmer()

    # The NLTK corpora are not shipped with the package. Fetch them on first
    # use so a fresh kernel/environment does not fail with LookupError.
    try:
        stop_words = set(stopwords.words('english'))
    except LookupError:
        nltk.download('stopwords', quiet=True)
        stop_words = set(stopwords.words('english'))

    try:
        word_tokenize('probe')
    except LookupError:
        # The tokenizer models are published as 'punkt' or 'punkt_tab'
        # depending on the installed NLTK release; request both.
        for resource in ('punkt', 'punkt_tab'):
            nltk.download(resource, quiet=True)

    def string_processor(sentence):
        """Return `sentence` tokenized, stripped of stop words, stemmed and re-joined."""
        tokens = word_tokenize(sentence)
        # Stop-word membership is tested case-insensitively.
        stemmed_tokens = [stemmer.stem(token) for token in tokens if token.lower() not in stop_words]
        return ' '.join(stemmed_tokens)

    data['Processed Invoice Description'] = data['Invoice Description'].apply(string_processor)

    # Compute TF-IDF matrix for all descriptions
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(data['Processed Invoice Description'])

    # Cluster the descriptions using DBSCAN
    dbscan = DBSCAN(metric='cosine', eps=0.2, min_samples=2)
    clusters = dbscan.fit_predict(tfidf_matrix)
    data['cluster'] = clusters
    data.to_excel('Processed_Data.xlsx', index=False)
    
# Run the preprocessing when this cell executes. data_processor() takes no
# arguments and returns nothing: it works on the module-level `data` frame
# and writes 'Processed_Data.xlsx'.
data_processor()