# Preprocessing YouTube Titles

In [1]:
import pandas as pd
import re
import emoji

## 1. Load data

In [2]:
# Read the raw table; later cells use its 'title' column as the text to clean.
# NOTE(review): the saved output at the end of the notebook shows an
# 'Unnamed: 0' column, which suggests the CSV carries a written-out index —
# confirm whether index_col=0 is wanted here.
data_original = pd.read_csv("./pretest_data.csv")

## 2. Process special characters
 - replace look-alike characters with a common equivalent
 - remove punctuation and other characters that carry no meaning
 - convert emoji to `:name:` text tokens

In [3]:
# Typographic / look-alike symbols and the plain text they are rewritten to.
# clean_punc applies this table first. Targets that are themselves listed in
# punct1 (' " - / . |) are therefore blanked out again by the removal step.
punct_mapping = {"‘": "'", "₹": "e", "´": "'", "°": "",
                 "€": "e", "™": "tm", "√": " sqrt ", "×": "x", "²": "2",
                 "—": "-", "–": "-", "’": "'", "_": "-", "`": "'", '“': '"',
                 '”': '"', "£": "e", '∞': 'infinity', 'θ': 'theta',
                 '÷': '/', 'α': 'alpha', '•': '.', 'à': 'a', '−': '-',
                 'β': 'beta', '∅': '', '³': '3', 'π': 'pi', '│':'|', 'ㅣ':'|'} #replace

# Multi-character / invisible sequences with their replacement text.
specials = {'\u200b': ' ', '…': ' ... ', '\ufeff': '', 'करना': '', 'है': ''}

# Characters that clean_punc replaces with a single space.
# The backslash is written once, as '\\'; an unescaped '\' in front of a
# non-escape character is an invalid escape sequence in a normal string literal.
punct1 = "/-'.,#$()*+:;<=>@[\\]^_`{|}~%" + '"“”’' + '∞θ÷α•à−β∅³π‘₹´°£€×™√²—–&' #remove

# Characters that clean_punc keeps but pads with a space on each side.
# NOTE(review): '|' is also in punct1, which clean_punc applies first, so it is
# already gone by the time punct2 is processed — confirm which one is intended.
punct2 = "!?|" #add space

# Korean jamo whose repeated runs are normalised to a fixed length.
punct3 = 'ㅋㅠㅎㅜ' #regulate ㅋㅋㅋㅋ, ㅎㅎㅎㅎ,, etc

def clean_punc(text):
    """Normalise punctuation, special characters and emoji in one title.

    Steps, applied in this order:
      1. rewrite each key of ``punct_mapping`` to its mapped text;
      2. replace every character of ``punct1`` with a space;
      3. pad every character of ``punct2`` with a space on both sides;
      4. apply the ``specials`` replacements;
      5. turn every emoji into its own space-separated ``:name:`` token;
      6. rewrite each run of a ``punct3`` character as exactly four of that
         character, surrounded by spaces.

    Parameters
    ----------
    text : str
        The raw title.

    Returns
    -------
    str
        The cleaned title with leading/trailing whitespace stripped. Runs of
        spaces inside the string are not collapsed here.
    """
    for src, dst in punct_mapping.items():  # replace
        text = text.replace(src, dst)
    for ch in punct1:  # remove
        text = text.replace(ch, ' ')
    for ch in punct2:  # add space
        text = text.replace(ch, f' {ch} ')

    # NOTE(review): this runs after punct1 has already blanked '.', so the
    # ' ... ' inserted for '…' survives while a typed '...' does not — confirm
    # that asymmetry is intended.
    for src, dst in specials.items():
        text = text.replace(src, dst)

    # Change emoji to :name: tokens. ':' is part of punct1, so no colon is left
    # in the text at this point and every delimiter below belongs to an emoji.
    # Emitting the padding as part of the delimiters separates each emoji on
    # its own; a greedy regex over the result would match from the first colon
    # to the last and leave adjacent emoji fused (":fire::fire:").
    text = emoji.demojize(text, delimiters=(' :', ': '))

    # regulate ㅋㅋㅋㅋ, ㅎㅎㅎㅎ,, etc: any run length becomes exactly four
    for ch in punct3:
        text = re.sub(f'[{ch}]+', f' {ch * 4} ', text)

    return text.strip()

In [4]:
# Run the punctuation/emoji cleaner over every raw title, preserving row order.
data_clean_punc = [clean_punc(raw_title) for raw_title in data_original['title']]

## 3. Lowercase and remove extra whitespace

In [5]:
def clean_text(texts):
    """Lower-case every title and normalise its whitespace.

    Parameters
    ----------
    texts : iterable
        Titles to clean. Each item is passed through ``str()`` first, so
        non-string values are cleaned in their string form.

    Returns
    -------
    list of str
        One cleaned string per input item, in input order: lower-cased, every
        run of whitespace collapsed to a single space, and no leading or
        trailing space.
    """
    # Iterating directly (instead of indexing with range(len(texts))) also
    # accepts generators and pandas Series whose index is not 0..n-1.
    # Once every whitespace run is a single ' ', stripping ' ' from both ends
    # is all that is left to do.
    return [
        re.sub(r'\s+', ' ', str(item).lower()).strip(' ')
        for item in texts
    ]

In [6]:
# Second pass over the punctuation-cleaned titles: lower-case them and
# normalise their whitespace.
data_clean_text = clean_text(data_clean_punc)

In [7]:
# Build the output table as a new frame. assign() returns a copy carrying the
# extra column, whereas pd.DataFrame(data_original) does not copy by default,
# so adding a column through it can (depending on the pandas version) also
# alter data_original.
df = data_original.assign(preprocessed_title=data_clean_text)
# Save with a header row and without the row index.
df.to_csv("titles_preprocessed.csv", header=True, index=False)
df

Unnamed: 0,title,video_id,channel_name,channel_id,publish_time,views,preprocessed_title
0,만 19세 '연봉 1억' 친구와 레스토랑 브이로그,8e7GcsDEKEE,희준,UCVa3PvcBT187XSV6pFAZQwg,2021-04-01T11:47:06Z,30038.0,만 19세 연봉 1억 친구와 레스토랑 브이로그
1,BBQ 신메뉴 '체고치' 순살 먹어봄,FYoDak3VfF4,희준,UCVa3PvcBT187XSV6pFAZQwg,2021-03-29T10:41:55Z,29102.0,bbq 신메뉴 체고치 순살 먹어봄
2,구찌 라이톤이랑 시계 후기,FuTTOzwWo58,희준,UCVa3PvcBT187XSV6pFAZQwg,2021-03-26T06:54:42Z,25022.0,구찌 라이톤이랑 시계 후기
3,엽기 로제떡볶이랑 허니 콤보 혼내줌,-48XAC_GDjk,희준,UCVa3PvcBT187XSV6pFAZQwg,2021-03-23T11:58:49Z,49551.0,엽기 로제떡볶이랑 허니 콤보 혼내줌
4,어림도 없지 바로 마라탕 먹어버림,mhehORE9WNA,희준,UCVa3PvcBT187XSV6pFAZQwg,2021-03-20T11:30:18Z,74348.0,어림도 없지 바로 마라탕 먹어버림
...,...,...,...,...,...,...,...
101159,'현실남매' 먹방하러 갔다가 진짜 싸웠습니다.,K-sH6E9YeTw,최은경tv,UCQV67-DsnpfkjPTjM_ITtSA,2019-06-21T12:14:50Z,137922.0,현실남매 먹방하러 갔다가 진짜 싸웠습니다
101160,[직업체험] DJ예나 일일 매니저 (월디페/World DJ Festival),ucEs84amVm8,최은경tv,UCQV67-DsnpfkjPTjM_ITtSA,2019-06-14T17:10:40Z,153005.0,직업체험 dj예나 일일 매니저 월디페 world dj festival
101161,저의 슈퍼모델 친구들을 소개합니다.,jSK3H-dFfE0,최은경tv,UCQV67-DsnpfkjPTjM_ITtSA,2019-06-10T09:08:41Z,151071.0,저의 슈퍼모델 친구들을 소개합니다
101162,'현실남매' 우리만 이런건가요? 남들이 이해 못하는 현실남매,OEWk-sZG5j0,최은경tv,UCQV67-DsnpfkjPTjM_ITtSA,2019-06-02T09:05:01Z,177515.0,현실남매 우리만 이런건가요 ? 남들이 이해 못하는 현실남매
