In [None]:
# Importing necessary libraries for data manipulation, NLP, and visualization
import pandas as pd  # Importing pandas for data manipulation
import numpy as np  # Importing numpy for numerical operations
import re  # Importing re for regular expressions
import dask  # Importing dask for parallel computing
dask.config.set({'dataframe.query-planning': True})  # Configuring dask for query planning
import dask.dataframe as dd  # Importing dask dataframe for big data processing

# NLP Libraries
from nltk.corpus import stopwords  # Importing NLTK's stopwords
from nltk.stem import WordNetLemmatizer  # Importing WordNetLemmatizer for lemmatization
import spacy  # Importing spaCy for advanced NLP tasks
from nltk.tokenize import RegexpTokenizer  # Importing RegexpTokenizer for tokenization
from gensim.models import Phrases  # Importing Phrases for phrase modeling
from gensim.corpora import Dictionary  # Importing Dictionary for building corpora
from gensim.models.ldamulticore import LdaMulticore  # Importing LdaMulticore for topic modeling
from nltk.probability import FreqDist  # Importing FreqDist for frequency distribution
from gensim.parsing.preprocessing import remove_stopwords  # Importing remove_stopwords for preprocessing
from nltk.sentiment.vader import SentimentIntensityAnalyzer  # Importing SentimentIntensityAnalyzer for sentiment analysis

# Visualisation Libraries
import matplotlib.pyplot as plt  # Importing matplotlib for basic visualization
from plotly.subplots import make_subplots  # Importing make_subplots for subplots
import plotly.graph_objects as go  # Importing graph_objects for plotly visualizations
import seaborn as sns  # Importing seaborn for advanced visualization
import plotly.offline as pyo  # Importing plotly for interactive plots
import pyLDAvis  # Importing pyLDAvis for topic visualization
import pyLDAvis.gensim


# Select matplotlib's 'ggplot' style sheet for the figures drawn by later cells
plt.style.use('ggplot')

# Initializing plotly for offline use; called without arguments, so plotly's
# own defaults apply
pyo.init_notebook_mode()

# Apply seaborn's 'paper' plotting context.
# NOTE(review): this runs after plt.style.use above; both calls presumably
# adjust matplotlib defaults, so keep this order unless confirmed independent.
sns.set_context('paper')

In [None]:
import requests

In [None]:
# Where the CSV files live on GitHub: owning account, repository, and the
# folder inside the repository (e.g., "path/to/folder").
username, repo_name, folder_path = (
    "GreyScaling",
    "UOB-Financial-Loan-Analysis",
    "csvs",
)

In [None]:
def get_filenames(username, repo_name, folder_path, branch="main", timeout=10):
    '''
    List the raw-download URLs of the csv files inside a GitHub repository folder.

    The folder is listed through the GitHub "contents" REST endpoint. Every entry
    that is a file whose name ends with ".csv" is turned into a
    raw.githubusercontent.com URL on the same branch that was listed.

    Parameters:
        username (str): GitHub Username
        repo_name (str): User's Repository Name
        folder_path (str): The folder path containing the csv files
        branch (str): Branch that is listed and used in the raw URLs (default "main")
        timeout (float): Seconds to wait for the GitHub API before the request
            is abandoned (requests raises its timeout exception in that case)

    Returns:
        filenames (list): Raw URLs of the csv files. Empty when the API does not
            answer with HTTP 200, when folder_path is not a folder, or when the
            folder holds no csv files.
    '''

    url = f"https://api.github.com/repos/{username}/{repo_name}/contents/{folder_path}"
    csv_path = f"https://raw.githubusercontent.com/{username}/{repo_name}/{branch}/{folder_path}/"

    # `ref` makes the API list the same branch the raw URLs point at; without a
    # timeout an unresponsive server would block the notebook indefinitely.
    response = requests.get(url, params={"ref": branch}, timeout=timeout)

    # Anything other than 200 (missing folder, rate limit, ...) is reported and
    # yields an empty result, as before.
    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        return []

    data = response.json()

    # The endpoint answers with a list for a folder but with a single object
    # when the path is a file; only a list can be scanned for csv entries.
    if not isinstance(data, list):
        print(f"Error: {folder_path} is not a folder")
        return []

    # Keep plain files with a ".csv" extension and prefix them with the raw URL base.
    return [
        csv_path + item["name"]
        for item in data
        if item["type"] == "file" and item["name"].endswith(".csv")
    ]

In [None]:
# Collect the raw URLs of every csv file in the configured repository folder
csvs = get_filenames(
    username=username,
    repo_name=repo_name,
    folder_path=folder_path,
)

In [None]:
# Read all csv URLs through dask, evaluate the result with .compute(), and
# swap the index that comes out of the load for a fresh default one.
df = (
    dd.read_csv(csvs)
    .compute()
    .reset_index(drop=True)
)

In [None]:
# Display the rows that are exact copies of another row (all columns compared);
# keep=False flags every member of a duplicate group, not only the repeats.
df.loc[df.duplicated(keep=False)]

In [None]:
# Remove exact duplicate rows from df itself, keeping the first occurrence of each
df.drop_duplicates(keep='first', inplace=True)

In [None]:
# Words removed in addition to NLTK's English stop-word list.
_EXTRA_STOP_WORDS = ('from', 'singapore', 'malaysia', 'said', ',')

# Patterns are compiled once when this cell runs and reused for every row.
# Raw strings keep \b, \w, \s and \d as regex escapes: in a normal string
# literal "\b" is a backspace character, not a word boundary.
_MARKUP_OR_URL_RE = re.compile(r"<.*?>+|https?://\S+|www\.\S+")
_PUNCTUATION_RE = re.compile(r"[^\w\s]")
_DIGITS_RE = re.compile(r"\d+")
_SINGLE_LETTER_RE = re.compile(r"\b[a-zA-Z]\b")

# NOTE(review): the last range (U+24C2..U+1F251) is far wider than emoji and
# also covers e.g. CJK characters; kept unchanged to preserve existing results.
_EMOJI_RE = re.compile(
    "["
    u"\U0001F600-\U0001F64F"  # emoji
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    u"\U00002702-\U000027B0"
    u"\U000024C2-\U0001F251"
    "]+",
    flags=re.UNICODE,
)


def _clean_text(text, stop_words, lemmatizer):
    '''Clean one string: strip markup, symbols and stop words, then lemmatize.'''
    text = text.lower()

    # Links and tags must go first: they can only be recognised while their
    # punctuation ("://", ".", "<", ">") is still present.
    text = _MARKUP_OR_URL_RE.sub(' ', text)
    text = _EMOJI_RE.sub(' ', text)
    text = _PUNCTUATION_RE.sub(' ', text)
    text = _DIGITS_RE.sub('', text)
    text = _SINGLE_LETTER_RE.sub(' ', text)

    # split() without arguments drops leading, trailing and repeated whitespace,
    # so joining with single spaces also normalises the spacing.
    words = [lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words]
    return ' '.join(words)


def pre_process(dataframe, column='Content', extra_stop_words=_EXTRA_STOP_WORDS):
    '''
    Pre-process the text column of the dataframe.

    The column is overwritten in place on the dataframe that is passed in.

    args:
        dataframe: The dataframe to be preprocessed
        column: Name of the text column to clean (default 'Content')
        extra_stop_words: Words removed on top of NLTK's English stop words

    Functions:

        the following are the steps taken to preprocess the data

        1) Lower case transformation (missing values become empty strings)
        2) Removing HTML tags and hyperlinks
        3) Removing Emojis & other image related symbols
        4) Removing Punctuations
        5) Removing Numbers
        6) Removing single alphabets
        7) Removing Stopwords
        8) Lemmatizing the words
        9) Removing extra WhiteSpace

    NOTE(review): the NLTK stop-word list and WordNetLemmatizer are expected to
    need their NLTK data to be available locally; confirm for the target
    environment.
    '''

    # A set gives constant-time membership tests for the per-word filter.
    stop_words = set(stopwords.words('english'))
    stop_words.update(extra_stop_words)

    lemmatizer = WordNetLemmatizer()

    # Missing values would break the string operations, so they are treated as
    # empty text; anything else is coerced to str.
    texts = dataframe[column].fillna('').astype(str)

    dataframe[column] = texts.apply(lambda text: _clean_text(text, stop_words, lemmatizer))


