In [1]:
import importlib
import os
import subprocess

import numpy as np
import pandas as pd

# Pycaret
import pycaret.nlp as pycnlp
import pycaret.classification as pyclass

#mlflow
import mlflow
from mlflow.tracking import MlflowClient

import helper_pipeline as helper
#importlib.reload(helper)

#### Dataset import

In [2]:
# Location of the scraped-posts CSV. Set the AI_CRITIC_DATA_FILE environment
# variable to point at another copy of the file; when it is unset the original
# local path is used, so existing behaviour is unchanged.
data_filepath = os.environ.get(
    "AI_CRITIC_DATA_FILE",
    "C:\\Users\\xtanl\\OneDrive\\Desktop\\data_file_20230730.csv",
)
data = pd.read_csv(data_filepath)

In [3]:
# Show the first raw record to check which columns were loaded.
data.head(n=1)

Unnamed: 0.1,Unnamed: 0,name,shares,reactions,reaction_count,comments,content,posted_on,video,image,post_url
0,pfbid0TRxXKVM1CWB2WLiWP429LvudvVVxDuAouBiabnYK...,Nicholas Goh Organisation,0,"{'likes': 5, 'loves': 1, 'wow': 0, 'cares': 0,...",6,1,Follow us at NGO’s BKK FastTrack 2022! In cele...,2022-07-23T00:00:00,['blob:https://www.facebook.com/30566a36-94fb-...,['https://scontent-xsp1-1.xx.fbcdn.net/v/t15.5...,https://www.facebook.com/NicholasGohOrganisati...


#### Data Preprocessing

In [4]:
# Deduplicate on the post text. The explicit .copy() yields an independent frame,
# so the column assignments below neither touch `data` nor trigger pandas'
# SettingWithCopyWarning (the previous `data.copy()` was overwritten immediately).
data_df = data.drop_duplicates(subset=['content']).copy()

### Features addition ###

# Fill nulls in the content column with an empty string
data_df['content'] = data_df['content'].fillna('')
# Apply spaces behind the hashtags to identify hashes
data_df['content'] = data_df['content'].apply(helper.add_space_hashes)
# Extract all hashtags
data_df['hashtags'] = data_df['content'].apply(helper.extract_hashtags)
# Extract all mentions
data_df['mentions'] = data_df['content'].apply(helper.extract_mentions)
# Extract all emojis
data_df['emojis'] = data_df['content'].apply(helper.extract_emojis)
# Translate emojis to text
data_df['emojis_text'] = data_df['emojis'].apply(helper.translate_emojis)

# Check if there are words to be flagged - breach class
data_df['breach_flagwords'] = data_df['content'].apply(helper.contains_flagged_words)
# Check if there are words to be flagged in the hashes - breach class
data_df['breach_hashes'] = data_df['hashtags'].apply(helper.contains_flagged_hashes)

# Create label: 1 when either breach check is True, otherwise 0
data_df['incompliant'] = np.where(
    data_df['breach_flagwords'].eq(True) | data_df['breach_hashes'].eq(True),
    1,
    0,
)

In [5]:
# Expose the post identifier as `id` and keep only the columns used for modelling.
# Note: this cell reassigns `data_df`, so it can only be run once per kernel
# after the preprocessing cell (a second run no longer finds 'Unnamed: 0').
data_df = (
    data_df
    .rename(columns={'Unnamed: 0': 'id'})
    [[
        'id', 'name', 'content', 'hashtags', 'mentions', 'emojis', 'emojis_text',
        'breach_flagwords', 'breach_hashes', 'incompliant',
    ]]
)

In [6]:
# Show the first engineered record to check the derived columns.
data_df.head(n=1)

Unnamed: 0,id,name,content,hashtags,mentions,emojis,emojis_text,breach_flagwords,breach_hashes,incompliant
0,pfbid0TRxXKVM1CWB2WLiWP429LvudvVVxDuAouBiabnYK...,Nicholas Goh Organisation,Follow us at NGO’s BKK FastTrack 2022! In cele...,[],[],,,False,False,0


### Pipeline

In [None]:
# Start the MLflow UI as a background process. `!mlflow ui` runs the server in
# the foreground and blocks the cell until interrupted, so the setup call below
# would never execute. The process handle is kept so that re-running the cell
# does not spawn a second server; stop it with `mlflow_ui_proc.terminate()`.
if 'mlflow_ui_proc' not in globals() or mlflow_ui_proc.poll() is not None:
    mlflow_ui_proc = subprocess.Popen(['mlflow', 'ui'])

experiment_id = helper.setup_mlflow()

In [None]:
def topic_modelling(data_df, text_col, save_dir, target_col=None):
    """Fit an LDA topic model on a text column and return the topic-annotated data.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Frame containing ``text_col`` (and ``target_col`` when one is given).
    text_col : str
        Name of the text column, passed to ``pycnlp.setup`` as ``target``.
    save_dir : str
        Directory whose contents are logged to the active MLflow run under
        ``topic_model``; it is also forwarded to ``plot_model(save=...)``.
    target_col : str, optional
        When given, the LDA model is obtained from ``pycnlp.tune_model`` with
        this column as ``supervised_target``; otherwise ``pycnlp.create_model``
        is used.

    Returns
    -------
    pandas.DataFrame
        Result of ``pycnlp.assign_model`` with its index reset.
    """
    pycnlp.setup(data=data_df, target=text_col, session_id=42)

    # Both branches bind `lda`, so the plotting step below works in either mode
    # (previously the supervised branch left `lda` undefined). The extra
    # unsupervised tune_model call was dropped: its result was never used and it
    # passed a `filepath` argument with a hard-coded local path.
    if target_col:
        lda = pycnlp.tune_model(model='lda',
                                multi_core=True,
                                supervised_target=target_col)
    else:
        lda = pycnlp.create_model(model='lda', multi_core=True)
    lda_data = pycnlp.assign_model(lda)

    # Log artifacts
    # NOTE(review): confirm that this PyCaret version writes the plot into
    # `save_dir` when `save` is a path; if it only accepts a bool the file is
    # written elsewhere and nothing new is picked up by log_artifacts.
    pycnlp.plot_model(lda, plot='topic_distribution', save=save_dir)
    mlflow.log_artifacts(save_dir, artifact_path='topic_model')

    return lda_data.reset_index(drop=True)

In [None]:
# Run topic modelling and model comparison inside a single MLflow run.
with mlflow.start_run(experiment_id=experiment_id, run_name='text_classification') as run:

    # List file paths
    run_id = run.info.run_id
    artifact_path = f"./ai_critic/{run_id}/artifacts/"
    # os.path.join keeps the path valid on every OS; the trailing '' preserves
    # the trailing separator the previous string concatenation produced.
    pp_artifact_path = os.path.join(
        os.getcwd(), 'ai_critic', run_id, 'artifacts', 'preprocessing', ''
    )
    exai_artifact_path = artifact_path + 'explainable_ai/'

    # Create the artifact directories if they do not exist yet
    os.makedirs(pp_artifact_path, exist_ok=True)
    os.makedirs(exai_artifact_path, exist_ok=True)

    # Topic Modelling
    topic_df = topic_modelling(data_df, 'content', exai_artifact_path)

    # Text Classification
    # Build the training frame from the topic-annotated data produced above
    # (previously `train` was referenced before any cell defined it). The label
    # created during preprocessing is `incompliant`; it is exposed under the
    # `non_compliant` name used as the classification target.
    train = (
        topic_df
        .drop(columns=['id', 'name'])
        .rename(columns={'incompliant': 'non_compliant'})
    )
    # NOTE(review): train_size=0.2 fits on 20% of the rows - confirm this is
    # intended rather than 0.8.
    # NOTE(review): the label is derived from breach_flagwords / breach_hashes,
    # which remain in `train` as features - confirm this is intended.
    classfication_exp = pyclass.setup(data=train,
                                      target='non_compliant',
                                      train_size=0.2,
                                      high_cardinality_features=['content'],
                                      session_id=42)

    # Keep the winning model so later steps can tune / evaluate it
    best = pyclass.compare_models()

#### PyCaret Topic Modelling

In [7]:
# Initialise the PyCaret NLP experiment on the post text with a fixed session id.
nlp_exp = pycnlp.setup(
    data=data_df,
    target='content',
    session_id=42,
)

Description,Value
session_id,42
Documents,288
Vocab Size,1534
Custom Stopwords,False


#### Text Classification

In [None]:
# Fit an LDA model on the NLP experiment initialised by `pycnlp.setup` and attach
# the topic columns to the data. `lda_data` was previously never defined at
# notebook level (it only existed inside `topic_modelling`).
lda = pycnlp.create_model(model='lda', multi_core=True)
lda_data = pycnlp.assign_model(lda).reset_index(drop=True)

# The label created during preprocessing is `incompliant`; expose it under the
# `non_compliant` name that the classification setup uses as its target.
# Assumes the model yields four topics (Topic_0..Topic_3) - TODO confirm if the
# number of topics is ever changed.
train = (
    lda_data
    .rename(columns={'incompliant': 'non_compliant'})
    [[
        'content', 'hashtags', 'mentions', 'emojis', 'emojis_text',
        'breach_flagwords', 'breach_hashes',
        'Topic_0', 'Topic_1', 'Topic_2', 'Topic_3',
        'Dominant_Topic', 'Perc_Dominant_Topic', 'non_compliant',
    ]]
)

In [None]:
# Configure the PyCaret classification experiment on the training frame.
# NOTE(review): train_size=0.2 fits on 20% of the rows - confirm this is
# intended rather than 0.8.
classf_exp = pyclass.setup(
    data=train,
    target='non_compliant',
    train_size=0.2,
    high_cardinality_features=['content'],
    session_id=42,
)

In [None]:
%time pyclass.compare_models()

In [None]:
# Tune the model held in `best`.
class_model = pyclass.tune_model(estimator=best)

In [None]:
%time pyclass.evaluate_model(class_model)