## <center>DataLab Cup 1: Text Feature Engineering</center>

#### - To import the libraries

In [1]:
import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings("ignore")

#### 1. To load the datasets

In [2]:
# Read the training and test CSV files (paths are relative to the notebook).
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('../dataset/train.csv', '../dataset/test.csv')
)


In [3]:
train_data.head()

Unnamed: 0,Id,Popularity,Page content
0,0,-1,"<html><head><div class=""article-info""> <span c..."
1,1,1,"<html><head><div class=""article-info""><span cl..."
2,2,1,"<html><head><div class=""article-info""><span cl..."
3,3,-1,"<html><head><div class=""article-info""><span cl..."
4,4,-1,"<html><head><div class=""article-info""><span cl..."


#### 2. To extract the features from the dataset

將一些我們想要用到的feature從dataset中提取出來。以下為提取的特徵:

- title
- time(year/month/day/hour/minute/second)
- number of images (num_img)
- number of videos (num_video)
- author name
- topic
- channel
- length of content 

In [14]:
from bs4 import BeautifulSoup
from datetime import datetime
# to get the attribute of the 'title', 'year/month/day/hour/minute/second', 'num_img', 'num_video', 'author name', 'topic', 'channel', 'content length'

def preprocessor(text):
    """Extract the hand-picked features from one article's raw HTML.

    Parameters
    ----------
    text : str
        HTML markup of a single page (one value of the 'Page content' column).

    Returns
    -------
    tuple
        ``(title, author_name, channel, topic, year, month, day, hour,
        minute, second, num_img, num_video, len_content)``.
        ``title``, ``author_name`` and ``channel`` are lower-cased strings,
        ``topic`` keeps its original casing, and the remaining items are ints.
        ``author_name`` and ``topic`` fall back to ``'not found'`` when the
        corresponding markup is absent.

    Raises
    ------
    AttributeError, TypeError, KeyError
        If the ``<h1>``, ``<time datetime=...>``, ``div.article-info``,
        ``<article data-channel=...>`` or ``section.article-content`` element
        is missing from the page.
    ValueError
        If the timestamp does not match ``"%a, %d %b %Y %H:%M:%S %z"``.
    """
    soup = BeautifulSoup(text, 'html.parser')

    # 1. title (body > h1)
    heading = soup.find('h1')
    heading_text = heading.string
    if heading_text is None:
        # `.string` is None when the heading has more than one child node
        # (e.g. nested markup); fall back to the concatenated text.
        heading_text = heading.get_text()
    title = heading_text.strip().lower()

    # 2. publication time (body > div > span > time)
    date_string = soup.find('time')['datetime'].strip().lower()
    date = datetime.strptime(date_string, "%a, %d %b %Y %H:%M:%S %z")
    # The fields are read in the timestamp's own UTC offset; no conversion
    # to a common timezone is performed.
    year, month, day = date.year, date.month, date.day
    hour, minute, second = date.hour, date.minute, date.second

    # 3. number of images
    num_img = len(soup.find_all('img'))

    # 4. number of videos (counted as <iframe> elements)
    num_video = len(soup.find_all('iframe'))

    # 5. author name: prefer the 'author_name' span, then 'byline basic'
    article_info = soup.find('div', class_='article-info')
    author = (article_info.find('span', class_='author_name')
              or article_info.find('span', class_='byline basic'))
    if author is not None:
        # When the name is wrapped in a link, use only the link text.
        author_link = author.find('a')
        name_node = author_link if author_link is not None else author
        author_name = name_node.get_text().lower()
    else:
        author_name = 'not found'

    # 6. article topics: everything after the first ': ' in the footer text
    footer = soup.find('footer', class_='article-topics')
    if footer is None:
        topic = 'not found'
    else:
        # partition() splits on the first ': ' only, so a topic that itself
        # contains ': ' is no longer cut off, and a footer without the
        # separator no longer raises IndexError.
        _, separator, topic_text = footer.get_text().partition(': ')
        topic = topic_text if separator else 'not found'

    # 7. channel
    channel = soup.find('article')['data-channel'].strip().lower()

    # 8. content length (number of characters of the article body text)
    content = soup.body.find('section', class_='article-content').get_text()
    len_content = len(content)

    return (title, author_name, channel, topic,
            year, month, day, hour, minute, second,
            num_img, num_video, len_content)

preprocessor(train_data['Page content'][0])

("nasa's grand challenge: stop asteroids from destroying earth",
 'clara moskowitz',
 'world',
 'Asteroid, Asteroids, challenge, Earth, Space, U.S., World ',
 2013,
 6,
 19,
 15,
 4,
 30,
 1,
 0,
 3591)

In [15]:
# Run the extractor over every training page: one feature tuple per article.
feature_list = [preprocessor(page_html) for page_html in train_data['Page content']]

# Column names, in the same order as the tuple returned by `preprocessor`.
feature_columns = [
    'title', 'author_name', 'channel', 'topic',
    'year', 'month', 'day', 'hour', 'minute', 'second',
    'num_img', 'num_video', 'len_content',
]
df_all = pd.DataFrame(feature_list, columns=feature_columns)

### 3-1 . Preprocessing - tokenization

To split the text corpora into individual elements

In [6]:
import re

def tokenizer(text):
    """Split ``text`` into tokens on runs of whitespace.

    Leading and trailing whitespace is stripped first. An empty (or
    whitespace-only) input yields ``['']``.
    """
    # Raw string: '\s' inside a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    return re.split(r'\s+', text.strip())

### 3-2 . Preprocessing - Word Stemming

There are two ways of word stemming

1. PorterStemmer (stemming): strips word endings in a rule-based way, which can lead to the problem of overstemming

2. WordNetLemmatizer (lemmatization): stems the words with better performance, but is more time-consuming


In [16]:
import nltk
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')
nltk.download('omw-1.4')

def word_stemming(text):
    """Split ``text`` on whitespace and lemmatize each token.

    Parameters
    ----------
    text : str or numpy.ndarray
        The text to process. When an array is given, its first element is
        used as the text (this is how a single-column selection arrives when
        the function is used as a tokenizer inside a ColumnTransformer).

    Returns
    -------
    list of str
        The lemmatized tokens; empty tokens are dropped.
    """
    if isinstance(text, np.ndarray):
        text = text[0]
    lm = WordNetLemmatizer()
    # Split on *runs* of whitespace (raw string avoids the invalid '\s'
    # escape). Splitting on single whitespace characters produced
    # empty-string tokens whenever two whitespace characters were adjacent.
    words = re.split(r'\s+', text.strip())
    return [lm.lemmatize(word) for word in words if word]

print(word_stemming('university, universal, universities'))

['university,', 'universal,', 'university']


[nltk_data] Downloading package wordnet to /home/mfhsieh/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/mfhsieh/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [8]:
from nltk.stem.porter import PorterStemmer

def tokenizer_stem(text):
    """Split ``text`` on runs of whitespace and Porter-stem each token.

    Returns a list with one stemmed string per token.
    """
    porter = PorterStemmer()
    # Raw string: '\s' inside a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    return [porter.stem(word) for word in re.split(r'\s+', text.strip())]

print(tokenizer_stem('university, universal, universities'))

['university,', 'universal,', 'univers']


### 3-3 Preprocessing - Stop-Word Removal

儘管刪除停用詞在某些情況下（例如 BoW 和特徵哈希）可以有益於簡化表示，並可能提高文字分析的準確性，但並不總是必要，特別是在使用 TF-IDF 時。是否刪除停用詞應基於文本分析任務的具體要求以及資料集的特性來進行決策

In [9]:
def stop_word_removal(text):
    """Placeholder for stop-word removal; not implemented yet.

    The argument is ignored and ``None`` is returned.
    TODO: implement, or remove if stop-word removal is not needed.
    """
    return None

#### 3-4 Create TF-IDF feature representation ([ref](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html))

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer

def _make_text_vectorizer():
    """Build a fresh unigram TF-IDF vectorizer that tokenizes with `word_stemming`."""
    return TfidfVectorizer(tokenizer=word_stemming, ngram_range=(1, 1), lowercase=False)


# Column 0 (title) and column 1 (author name) each get their own TF-IDF
# vectorizer; every other column is passed through unchanged.
# 'channel' (column 2) and 'topic' (column 3) are not vectorized at the moment.
text_transformer = ColumnTransformer(
    transformers=[
        ('title preprocess', _make_text_vectorizer(), [0]),
        ('author name process', _make_text_vectorizer(), [1]),
    ],
    remainder='passthrough',
    n_jobs=-1,
)


In [19]:
# Inspect the TF-IDF vocabulary learned from the article titles.
tfidf = TfidfVectorizer(tokenizer=word_stemming, ngram_range=(1,1), lowercase=False)
tfidf.fit(df_all['title'])
top = 10

# Look the vocabulary up once instead of rebuilding it on every iteration.
vocabulary = tfidf.get_feature_names_out()

# get idf score of vocabularies
idf = tfidf.idf_
print('[vocabularies with smallest idf scores]')
sorted_idx = idf.argsort()

# Slicing (rather than range(top)) also copes with vocabularies smaller than `top`.
for i in sorted_idx[:top]:
    print('%s: %.2f' % (vocabulary[i], idf[i]))

# Keep the document-term matrix sparse: converting it to a dense array only
# to sum its columns allocates n_documents x n_terms floats for no benefit.
doc_tfidf = tfidf.transform(df_all['title'])
tfidf_sum = np.asarray(doc_tfidf.sum(axis=0)).ravel()
print("\n[vocabularies with highest tf-idf scores]")
for i in tfidf_sum.argsort()[::-1][:top]:
    print('{}: {}'.format(vocabulary[i], tfidf_sum[i]))


[vocabularies with smallest idf scores]
to: 2.53
the: 2.56
in: 2.96
a: 3.04
of: 3.06
for: 3.11
and: 3.44
is: 3.50
on: 3.53
your: 3.62

[vocabularies with highest tf-idf scores]
the: 815.311294761899
to: 779.2825126570024
a: 560.246222897993
in: 554.0700384430594
of: 530.6444224763436
for: 516.0574390632206
and: 393.9149074133082
is: 386.79499423625896
your: 381.58405078926234
you: 380.68545839235753


### 4. Model training

- XGBoost

- LightGBM

- CatBoost

#### - To split the dataset 

In [29]:
# Features fed to the model. Not selected at the moment: 'channel', 'topic',
# 'year', 'month', 'day', 'hour', 'minute', 'second', 'num_img', 'num_video',
# 'len_content'.
selected_columns = ['title', 'author_name']
df = df_all.loc[:, selected_columns]
df.head()

Unnamed: 0,title,author_name
0,nasa's grand challenge: stop asteroids from de...,clara moskowitz
1,google's new open source patent pledge: we won...,christina warren
2,ballin': 2014 nfl draft picks get to choose th...,sam laird
3,cameraperson fails deliver slapstick laughs,sam laird
4,nfl star helps young fan prove friendship with...,connor finnegan


In [30]:
from sklearn.model_selection import train_test_split

X_train = df.values

# Map the labels from {-1, 1} to {0, 1} in a NEW array. Assigning in place
# into the array returned by `.values` writes through to `train_data` when
# that array is a view, and fails on a read-only array when pandas
# Copy-on-Write is enabled.
popularity = train_data['Popularity'].to_numpy()
y_train = np.where(popularity == -1, 0, popularity)

print(X_train.shape)
print(type(X_train))
print(X_train)
print(y_train.shape)
print(type(y_train))
print(y_train)

# Hold out 30% of the rows for validation; the fixed seed keeps the split reproducible.
X_train_split, X_valid_split, y_train_split, y_valid_split = train_test_split(X_train, y_train, test_size=0.3, random_state=0)

(27643, 2)
<class 'numpy.ndarray'>
[["nasa's grand challenge: stop asteroids from destroying earth"
  'clara moskowitz']
 ["google's new open source patent pledge: we won't sue unless attacked first"
  'christina warren']
 ["ballin': 2014 nfl draft picks get to choose their own walk-out music"
  'sam laird']
 ...
 ['14 dogs that frankly cannot take the heat' 'christine erickson']
 ['yahoo earnings beat estimates, but core problems remain'
  'seth fiegerman']
 ['the winners of our #curiocity contest tour austin’s tech scene'
  'megan ranney']]
(27643,)
<class 'numpy.ndarray'>
[0 1 1 ... 0 0 1]


#### - To construct the training function

In [31]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import roc_auc_score

def training(clf):
    """Cross-validate ``clf``, then fit it on the training split and score it.

    Uses the notebook-level arrays ``X_train``/``y_train`` for the
    cross-validation and ``X_train_split``/``y_train_split`` plus
    ``X_valid_split``/``y_valid_split`` for the hold-out evaluation.

    Prints the mean and standard deviation of the cross-validated ROC AUC
    (train and validation folds), followed by the ROC AUC on the training
    and validation splits after fitting on the training split.

    Returns the estimator ``clf``, fitted on the training split.
    """
    cv_results = cross_validate(
        clf,
        X_train,
        y_train,
        scoring='roc_auc',
        return_train_score=True,
        return_estimator=True,
    )
    cv_report = (
        ('train score: {:.5f} (+/-{:.5f})', cv_results['train_score']),
        ('valid score: {:.5f} (+/-{:.5f})', cv_results['test_score']),
    )
    for template, fold_scores in cv_report:
        print(template.format(np.mean(fold_scores), np.std(fold_scores)))

    clf.fit(X_train_split, y_train_split)
    holdout_report = (
        ('train score: {:.5f}', X_train_split, y_train_split),
        ('valid score: {:.5f}', X_valid_split, y_valid_split),
    )
    for template, features, labels in holdout_report:
        # Column 1 of predict_proba is scored against the 0/1 labels.
        class_one_proba = clf.predict_proba(features)[:, 1]
        print(template.format(roc_auc_score(labels, class_one_proba)))

    return clf

### 4-1. XGBOOST

In [32]:
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier

# Vectorize the text columns, then classify with gradient-boosted trees.
pipeline_steps = [
    ('vect', text_transformer),
    ('clf', XGBClassifier(n_estimators=100, max_depth=6, learning_rate=0.3)),
]
xgboost = Pipeline(steps=pipeline_steps)

training(xgboost)
# clf_cv = cross_validate(xgboost, X_train, y_train, scoring='roc_auc', return_train_score=True, return_estimator=True)

#xgboost = training(xgboost)



### 4-2. 