## Data Scraping and Preprocessing Pipeline

### Import Libraries

In [None]:
import os
import shutil
import pickle
import pandas as pd
import numpy as np

# Facebook Scrapper
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary

# Custom Libraries
import src.helpers_scrapper as scrap
import src.helpers_preprocess as pp
import src.helpers_mlflow as mlf
import src.breach_words as breach
import src.config as config

# import importlib
# importlib.reload(config)

### Set Configurations

In [None]:
# Ensure the output and raw-data locations exist (created if missing) before any cell writes to them
for directory in (config.output_path, config.raw_data_path):
    config.create_path(directory)

In [None]:
# Controls how the breach word list is obtained in the "Initialise the potential breach list" cell:
#   True  - synthesize a fresh list from src/breach_words.py and write it to config.breachlist_datapath
#   False - load the newest breach_list*.pkl found under config.raw_data_path
REFRESH_BREACH_LIST = True

### Scrape data from Facebook

##### Set configurations for Mozilla Browser

In [None]:
# Launch a Mozilla Firefox browser for scraping Facebook.
# NOTE(review): the Firefox binary and geckodriver paths below are specific to one
# Windows machine; adjust them when running elsewhere.

# Download-related Firefox preferences, applied in the order listed
firefox_prefs = {
    "browser.download.folderList": 2,
    "browser.download.manager.showWhenStarting": False,
    "browser.download.dir": "/Data",
    "browser.helperApps.neverAsk.saveToDisk": "application/octet-stream,application/vnd.ms-excel",
}

options = Options()
options.binary = FirefoxBinary(r"C:\Program Files\Mozilla Firefox\firefox.exe")
for pref_name, pref_value in firefox_prefs.items():
    options.set_preference(pref_name, pref_value)

# NOTE(review): `driver` is not referenced by the later cells shown in this notebook
# (scrap.extract_json is given the browser name as a string) — confirm it is still needed.
driver = webdriver.Firefox(
    executable_path=r"C:\Users\xtanl\miniconda3\Lib\site-packages\selenium\webdriver\geckodriver.exe",
    options=options,
)

# Uncomment to bring up the browser on a test page:
# driver.get('http://google.com/')

In [None]:
# Settings for the Facebook scraper; all four values are passed unchanged to
# scrap.extract_json for every page scraped in the next cell.
posts_count = 100  # NOTE(review): presumably the number of posts requested per page — confirm in helpers_scrapper
browser = "firefox"
timeout = 60  # seconds
headless = True  # NOTE(review): presumably runs the browser without a visible window — confirm

In [None]:
# Scrape each Facebook page with the shared scraper settings; results are
# unpacked in the same order as the page names are listed.
df1, df2, df3, df4 = [
    scrap.extract_json(page_name, posts_count, browser, timeout, headless)
    for page_name in (
        "NicholasGohOrganisation",
        "QualityLifeWithQiLe",
        "AinWallofTrust",
        "JayceeOngFC",
    )
]

In [None]:
# Combine the per-page Facebook datasets into one frame (pd.concat with its
# default settings) and print the resulting shape as a quick sanity check.
fb_df = pd.concat([df1, df2, df3, df4])
print(f"fb_df output: {fb_df.shape}")

In [None]:
# Export the combined Facebook dataset to the path configured as config.fb_media_datapath
config.export_file_csv(fb_df, config.fb_media_datapath)

### Scrape data from Instagram

In [None]:
# Log in to Instagram with the credentials held in src/config.py; the returned
# client is handed to scrap.instagram_scrapper when the user list is scraped.
insta_client = scrap.login_instgram( config.insta_username, config.insta_pwd )

In [None]:
# Instagram accounts to extract from
instagram_users = [
    'phiyphiy',
    'angelicaasimm',
    'augustineseah',
    'abrialpang',
    'yul_dewi',
    'jocelynkau',
    'headlights_',
    'jasperseahassociates',
    'yugitoh',
    'ato.par',
    'agent_e',
    'teamey_',
    'danny_c',
    'ryankoh',
    'alanyey',
    'geralds',
    '33lespe',
    'xie.xie',
    'sandrao',
]

# The scraper returns a pair, unpacked here as insta_user_df and insta_df
insta_user_df, insta_df = scrap.instagram_scrapper(insta_client, instagram_users)

In [None]:
# Export the scraped Instagram results to their configured file paths:
#   insta_user_df -> config.insta_user_datapath
#   insta_df      -> config.insta_media_datapath
config.export_file_csv(insta_user_df, config.insta_user_datapath)
config.export_file_csv(insta_df, config.insta_media_datapath)

In [None]:
# Check Session
# scrap.check_session(insta_client, config.insta_username, config.insta_pwd)

### Combine datasets

In [None]:
# Merge the Facebook and Instagram post datasets into the single frame used by
# all preprocessing steps below.
# NOTE(review): how the two sources' columns are aligned is decided inside
# pp.combine_datasets — confirm there.
data_df = pp.combine_datasets(fb_df, insta_df)

#### Clean the content

In [None]:
# Store the cleaned version of the 'content' column in a new 'cleaned_text'
# column (the cleaning rules themselves live in pp.clean_text).
data_df['cleaned_text'] = pp.clean_text(data_df, 'content')

#### Feature Creation

##### Initialise the potential breach list

In [None]:
# Prepare the two inputs the feature-creation cell needs for breach detection:
#   text_breach               - lower-cased breach words (designations + promos)
#   potential_breach_hashtags - hashtag-style variants of the designation words
#
# The hashtag variants are not stored in the pickle, so they are derived on
# every run. Previously they were only built on the refresh path, which left
# `potential_breach_hashtags` undefined when REFRESH_BREACH_LIST was False.
# NOTE(review): assumes breach.synthesize_words gives the same words on each
# call, so the hashtags stay in step with a cached word list — confirm.
potential_breach_desig = list(breach.synthesize_words(breach.given_list_of_designations))
potential_breach_hashtags = [x.lower().replace(' ', '_') for x in potential_breach_desig]

if REFRESH_BREACH_LIST:
    print("Synthesizing new list of potential breach words...")
    # Synthesize potential breach words from the lists specified in src/breach_words.py
    potential_breach_promos = list(breach.synthesize_words(breach.given_list_of_promo))
    text_breach = [x.lower() for x in potential_breach_desig + potential_breach_promos]
    # Replace the breach wordlist
    with open(config.breachlist_datapath, 'wb') as fp:
        pickle.dump(text_breach, fp)
    print(f"Potential Breach Words Updated into {config.breachlist_datapath}")
else:
    all_breachlist_files = [
        os.path.join(config.raw_data_path, x)
        for x in os.listdir(config.raw_data_path)
        if x.startswith("breach_list") and x.endswith(".pkl")
    ]
    if not all_breachlist_files:
        # Stop here with an actionable message instead of failing later with a
        # NameError on `text_breach` in the feature-creation cell.
        raise FileNotFoundError("Breach Wordlist does not exists. Set REFRESH_BREACH_LIST = True.")
    # Pick the newest breach wordlist according to os.path.getctime
    curr_breachlist_filepath = max(all_breachlist_files, key=os.path.getctime)
    # NOTE: unpickling can execute arbitrary code; only load files written by this pipeline.
    with open(curr_breachlist_filepath, 'rb') as fp:
        text_breach = pickle.load(fp)
    # Report the file that was actually read, not the configured write path
    print(f"Potential Breach Words Loaded from {curr_breachlist_filepath}")

In [None]:
# # Download spacy english package
# !python -m spacy download en_core_web_sm

In [None]:
# Features to create.
# Currently disabled: 'hashtags', 'mentions', 'emojis'
output_features = [
    'data_source',
    'id',
    'username',
    'posted_on',
    'content',
    'cleaned_text',
    'breach_flagwords',
    'breach_hashes',
    'has_nonpru_email',
    'has_hyperlinks',
    'has_disclaimer',
]

# Build the selected features from the raw 'content' column, using the breach
# word list and the breach hashtag list
data_df = pp.create_features(
    data_df,
    'content',
    output_features,
    text_breach,
    potential_breach_hashtags,
)

# Add NER features - contains_monetary
data_df = pp.get_ner_features(data_df)

# Rows whose cleaned text is null get the literal string 'None' instead
data_df.loc[data_df['cleaned_text'].isna(), 'cleaned_text'] = 'None'

### Export dataset

In [None]:
# Write the final feature dataset to the path configured as config.feature_data,
# then list the exported columns.
# NOTE(review): mode='w+' is forwarded to the export helper and presumably
# overwrites an existing file — confirm in config.export_file_csv.
config.export_file_csv(data_df, config.feature_data, mode='w+')
print(f"Dataset with {list(data_df.columns)}")