In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import precision_score, accuracy_score, recall_score, roc_auc_score
from sklearn.decomposition import TruncatedSVD, PCA
from nltk.tokenize import RegexpTokenizer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('vader_lexicon')

import re
from bs4 import BeautifulSoup
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize, pos_tag

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/chku/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
# 3. feature engineering

In [3]:
# load data
df_train = pd.read_csv('../chk_output/train_feature.csv')
df_test = pd.read_csv('../chk_output/test_feature.csv')

display(df_train.head(3))
display(df_test.head(3))

Unnamed: 0,Id,topic,channel,weekday,author,img count,title,media count,pub_date,num_words,Page Content,Popularity,day_of_month,month,day_of_week,hour,ymd
0,0,Asteroid Asteroids challenge Earth Space U.S. ...,world,Wed,,1,NASA's Grand Challenge: Stop Asteroids From De...,0,2013-06-19 15:04:30,536,"<html><head><div class=""article-info""> <span c...",-1,19,6,2,15,2013-06-19
1,1,Apps and Software Google open source opn pledg...,tech,Thu,Christina Warren,2,Google's New Open Source Patent Pledge: We Won...,0,2013-03-28 17:40:55,305,"<html><head><div class=""article-info""><span cl...",1,28,3,3,17,2013-03-28
2,2,Entertainment NFL NFL Draft Sports Television,entertainment,Wed,Sam Laird,2,Ballin': 2014 NFL Draft Picks Get to Choose Th...,25,2014-05-07 19:15:20,1011,"<html><head><div class=""article-info""><span cl...",1,7,5,2,19,2014-05-07


Unnamed: 0,Id,topic,channel,weekday,author,img count,title,media count,pub_date,num_words,Page Content,day_of_month,month,day_of_week,hour,ymd
0,27643,Entertainment Music One Direction soccer Sports,entertainment,Mon,Sam Laird,1,Soccer Star Gets Twitter Death Threats After T...,7,2013-09-09 19:47:02,475,"<html><head><div class=""article-info""><span cl...",9,9,0,19,2013-09-09
1,27644,Gadgets glass Google Google Glass Google Glass...,tech,Thu,Stan Schroeder,3,Google Glass Gets an Accessory Store,0,2013-10-31 09:25:02,142,"<html><head><div class=""article-info""><span cl...",31,10,3,9,2013-10-31
2,27645,amazon amazon kindle Business Gaming,business,Tue,Todd Wasserman,2,OUYA Gaming Console Already Sold Out on Amazon,0,2013-06-25 12:54:54,164,"<html><head><div class=""article-info""><span cl...",25,6,1,12,2013-06-25


In [4]:
# create new features by observing EDA 

In [5]:
# Time Variables

In [6]:
## articles published at weekends seems more popular
df_train['is_weekend'] = df_train['day_of_week'].apply(lambda x: 1 if x==5 or x==6 else 0)
df_test['is_weekend'] = df_test['day_of_week'].apply(lambda x: 1 if x==5 or x==6 else 0)

In [7]:
df_train.corr()['is_weekend']['Popularity']

0.09756133419798459

In [8]:
# month
def pop_month(x):
    if x == 3:
        return 1
    elif x == 10:
        return -1 # -1 means not popular
    else:
        return 0

df_train['popular_month'] = df_train['month'].apply(pop_month)
df_test['popular_month'] = df_test['month'].apply(pop_month)

In [9]:
df_train.corr()['popular_month']['Popularity']
# I should normalize 'Popularity' to have correct corr()

0.06495173808218499

In [10]:
# hours
def pop_hour(x):
    if x == 13 or x == 21:
        return -1
    elif x == 5:
        return 1
    else:
        return 0

df_train['popular_hour'] = df_train['hour'].apply(pop_hour)
df_test['popular_hour'] = df_test['hour'].apply(pop_hour)

In [11]:
df_train.corr()['popular_hour']['Popularity']

0.014911374378094084

In [12]:
# channel 
# just watch the EDA and assign weights
def pop_channel(x):
    if x == 'social-media' or x == 'tech':
        return 2
    elif x == 'marketing' or x == 'lifestyle':
        return 1
    elif x == 'world' or x == 'entertainment':
        return -2
    elif x == 'business':
        return -1
    else:
        return 0

df_train['popular_channel'] = df_train['channel'].apply(pop_channel)
df_test['popular_channel'] = df_test['hour'].apply(pop_channel)
    

In [13]:
# check
print(df_train['popular_channel'].nunique())
display(df_train.head(5))

5


Unnamed: 0,Id,topic,channel,weekday,author,img count,title,media count,pub_date,num_words,...,Popularity,day_of_month,month,day_of_week,hour,ymd,is_weekend,popular_month,popular_hour,popular_channel
0,0,Asteroid Asteroids challenge Earth Space U.S. ...,world,Wed,,1,NASA's Grand Challenge: Stop Asteroids From De...,0,2013-06-19 15:04:30,536,...,-1,19,6,2,15,2013-06-19,0,0,0,-2
1,1,Apps and Software Google open source opn pledg...,tech,Thu,Christina Warren,2,Google's New Open Source Patent Pledge: We Won...,0,2013-03-28 17:40:55,305,...,1,28,3,3,17,2013-03-28,0,1,0,2
2,2,Entertainment NFL NFL Draft Sports Television,entertainment,Wed,Sam Laird,2,Ballin': 2014 NFL Draft Picks Get to Choose Th...,25,2014-05-07 19:15:20,1011,...,1,7,5,2,19,2014-05-07,0,0,0,-2
3,3,Sports Video Videos Watercooler,watercooler,Fri,Sam Laird,1,Cameraperson Fails Deliver Slapstick Laughs,21,2013-10-11 02:26:50,187,...,-1,11,10,4,2,2013-10-11,0,-1,0,0
4,4,Entertainment instagram instagram video NFL Sp...,entertainment,Thu,Connor Finnegan,52,NFL Star Helps Young Fan Prove Friendship With...,1,2014-04-17 03:31:43,182,...,-1,17,4,3,3,2014-04-17,0,0,0,-2


In [14]:
# create ordinal features for author
print(df_train['author'].isna().sum())
print(df_train['channel'].isna().sum())
df_train['author'].fillna('N/A', inplace=True) # fill nan author

2668
0


In [15]:
def map_popularity_author(col):
    df = df_train.groupby(f'{col}').mean().reset_index().sort_values(by='Popularity', ascending=False) \
              [[f'{col}', 'Popularity']]
    df.columns=[f'{col}', 'avg_popularity']
    
    '''
    pop_5 = df[df['avg_popularity'] >= 0.7][f'{col}'].values
    pop_4 = df[(df['avg_popularity'] < 0.7) & (df['avg_popularity'] >= 0.6)][f'{col}'].values
    pop_3 = df[(df['avg_popularity'] < 0.6) & (df['avg_popularity'] >= 0.5)][f'{col}'].values
    pop_2 = df[(df['avg_popularity'] < 0.5) & (df['avg_popularity'] >= 0.4)][f'{col}'].values
    pop_1 = df[(df['avg_popularity'] < 0.4) & (df['avg_popularity'] >= 0.3)][f'{col}'].values
    pop_0 = df[df['avg_popularity'] < 0.3][f'{col}'].values
    '''
    pop_5 = df[df['avg_popularity'] >= 0.5][f'{col}'].values
    pop_2 = df[(df['avg_popularity'] >= 0.2) & (df['avg_popularity'] < 0.5)][f'{col}'].values
    unpop_2 = df[(df['avg_popularity'] <= -0.2) & (df['avg_popularity'] >= -0.5)][f'{col}'].values
    unpop_5 = df[df['avg_popularity'] < -0.5][f'{col}'].values
    
    def lambda_fxn(x):
        '''
        if x in pop_5:
            return 5
        elif x in pop_4:
            return 4
        elif x in pop_3:
            return 3
        elif x in pop_2:
            return 2
        elif x in pop_1:
            return 1
        elif x in pop_0:
            return -1
            
        # To catch news desks/sections/subsections/material in test but not in train
        else:
            return 0
        '''
        
        if x in pop_5:
            return 5
        elif x in pop_2:
            return 2
        elif x in unpop_5:
            return -5
        elif x in unpop_2:
            return -2
        else:
            return 0
        
    
    df_train[f'popular_{col}'] = df_train[f'{col}'].apply(lambda_fxn)
    df_test[f'popular_{col}'] = df_test[f'{col}'].apply(lambda_fxn)

In [16]:
map_popularity_author('author')
#print(df_train['popular_author'].nunique())
display(df_train.head(5))

Unnamed: 0,Id,topic,channel,weekday,author,img count,title,media count,pub_date,num_words,...,day_of_month,month,day_of_week,hour,ymd,is_weekend,popular_month,popular_hour,popular_channel,popular_author
0,0,Asteroid Asteroids challenge Earth Space U.S. ...,world,Wed,,1,NASA's Grand Challenge: Stop Asteroids From De...,0,2013-06-19 15:04:30,536,...,19,6,2,15,2013-06-19,0,0,0,-2,0
1,1,Apps and Software Google open source opn pledg...,tech,Thu,Christina Warren,2,Google's New Open Source Patent Pledge: We Won...,0,2013-03-28 17:40:55,305,...,28,3,3,17,2013-03-28,0,1,0,2,0
2,2,Entertainment NFL NFL Draft Sports Television,entertainment,Wed,Sam Laird,2,Ballin': 2014 NFL Draft Picks Get to Choose Th...,25,2014-05-07 19:15:20,1011,...,7,5,2,19,2014-05-07,0,0,0,-2,0
3,3,Sports Video Videos Watercooler,watercooler,Fri,Sam Laird,1,Cameraperson Fails Deliver Slapstick Laughs,21,2013-10-11 02:26:50,187,...,11,10,4,2,2013-10-11,0,-1,0,0,0
4,4,Entertainment instagram instagram video NFL Sp...,entertainment,Thu,Connor Finnegan,52,NFL Star Helps Young Fan Prove Friendship With...,1,2014-04-17 03:31:43,182,...,17,4,3,3,2014-04-17,0,0,0,-2,0


In [17]:
# setiment on headline
#(ignore now)

In [18]:
# combine topic and title
df_train['combi_text'] = df_train['topic'] + '. ' + df_train['title']
df_train['combi_text2'] = df_train['combi_text'].str.replace(r'[\!?.]+[\.]+','.', regex=True) # remove extra punctuation in headline

df_test['combi_text'] = df_test['topic'] + '. ' + df_test['title']
df_test['combi_text2'] = df_test['combi_text'].str.replace(r'[\!?.]+[\.]+','.', regex=True)

#display(df_train.head(5)['combi_text'])
#display(df_train.head(5)['combi_text2'])

In [19]:
# sentiment on combi_text
sia = SIA()
print(sia.polarity_scores(df_train['combi_text'][0]))
print(df_train['Popularity'][0]) 

{'neg': 0.271, 'neu': 0.467, 'pos': 0.262, 'compound': -0.296}
-1


In [20]:
from tqdm.notebook import tqdm
tqdm.pandas()

def get_sentiment(row):
    sentiment_dict = sia.polarity_scores(row['combi_text'])
    row['sentiment_pos'] = sentiment_dict['pos'] # positive
    row['sentiment_neu'] = sentiment_dict['neu']
    row['sentiment_neg'] = sentiment_dict['neg']
    row['sentiment_compound'] = sentiment_dict['compound']
    return row

In [21]:
## check if there's NaN in ['combi_text'], or SIA() will report error
print(df_train['combi_text'].isna().sum())
print(df_train['combi_text2'].isna().sum())
df_train['combi_text'].fillna('N/A', inplace=True) # fill nan author
df_train['combi_text2'].fillna('N/A', inplace=True) # fill nan author

36
36


In [22]:
## double check
print(df_train['combi_text'].isna().sum())
print(df_train['combi_text2'].isna().sum())

0
0


In [23]:
df_train = df_train.progress_apply(get_sentiment, axis=1)

  0%|          | 0/27643 [00:00<?, ?it/s]

In [24]:
## check if there's NaN in ['combi_text'], or SIA() will report error
print(df_test['combi_text'].isna().sum())
print(df_test['combi_text2'].isna().sum())
df_test['combi_text'].fillna('N/A', inplace=True) # fill nan author
df_test['combi_text2'].fillna('N/A', inplace=True) # fill nan author

17
17


In [25]:
## double check
print(df_train['combi_text'].isna().sum())
print(df_train['combi_text2'].isna().sum())

0
0


In [26]:
# test data
df_test = df_test.progress_apply(get_sentiment, axis=1)

  0%|          | 0/11847 [00:00<?, ?it/s]

In [27]:
## sort positive sentiment in ascending order
df_train.loc[df_train['sentiment_pos'].sort_values(ascending=False).index]

Unnamed: 0,Id,topic,channel,weekday,author,img count,title,media count,pub_date,num_words,...,popular_month,popular_hour,popular_channel,popular_author,combi_text,combi_text2,sentiment_pos,sentiment_neu,sentiment_neg,sentiment_compound
15271,15271,happy holidays Social Media Vine Challenge,social-media,Fri,Megan Ranney,2,Your jolliest rule-bending holiday greeting cards,6,2014-12-19 21:09:01,120,...,0,-1,2,0,happy holidays Social Media Vine Challenge. Yo...,happy holidays Social Media Vine Challenge. Yo...,0.737,0.263,0.000,0.9413
26164,26164,Gift guides holiday party Lifestyle Work & Play,lifestyle,Mon,James O'Brien,11,9 gifts for your favorite holiday hosts,0,2014-12-15 19:03:17,552,...,0,0,1,5,Gift guides holiday party Lifestyle Work & Pla...,Gift guides holiday party Lifestyle Work & Pla...,0.701,0.299,0.000,0.9371
26118,26118,Cute funny Sex & Love Photography Watercooler,watercooler,Sun,Laura Vitto,23,20 Perfect GIFs To Express Your Love,0,2013-07-28 18:30:58,119,...,0,0,0,0,Cute funny Sex & Love Photography Watercooler....,Cute funny Sex & Love Photography Watercooler....,0.692,0.308,0.000,0.9584
13838,13838,nobel peace prize Twitter U.S. World,world,Fri,Samantha Murphy,2,Nobel Peace Prize Winner Notified Via Twitter,0,2013-10-11 14:31:19,375,...,-1,0,-2,0,nobel peace prize Twitter U.S. World. Nobel Pe...,nobel peace prize Twitter U.S. World. Nobel Pe...,0.685,0.315,0.000,0.9545
13708,13708,adventure Video Videos Watercooler YouTube,watercooler,Thu,Charlie White,1,'Humans Are Awesome' Video Celebrates Daring A...,12,2013-06-27 20:58:42,288,...,0,0,0,2,adventure Video Videos Watercooler YouTube. 'H...,adventure Video Videos Watercooler YouTube. 'H...,0.680,0.320,0.000,0.9313
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17678,17678,Mobile Tech tweetdeck Twitter World,tech,Wed,Samantha Murphy,2,Teen Says He Exposed TweetDeck Vulnerability b...,0,2014-06-11 20:40:07,309,...,0,0,2,0,Mobile Tech tweetdeck Twitter World. Teen Says...,Mobile Tech tweetdeck Twitter World. Teen Says...,0.000,0.775,0.225,-0.2960
1847,1847,accessories Android The Future of Travel iPhon...,lifestyle,Thu,Dani Fankhauser,7,6 Bags That Charge Your Devices While You Travel,3,2013-09-19 14:50:10,829,...,0,0,1,0,accessories Android The Future of Travel iPhon...,accessories Android The Future of Travel iPhon...,0.000,1.000,0.000,0.0000
1846,1846,Apps and Software art Dev & Design Fashion Gad...,tech,Thu,Adario Strange,4,Camera-Covered Jacket Is the Ultimate in Sarto...,1,2014-10-23 03:30:05,220,...,-1,0,2,0,Apps and Software art Dev & Design Fashion Gad...,Apps and Software art Dev & Design Fashion Gad...,0.000,1.000,0.000,0.0000
17681,17681,Advertising Business Evian Evian Babies Market...,advertising,Mon,Todd Wasserman,1,"The Evian Babies Are Back, Sans Rollerskates",2,2013-04-22 13:56:46,236,...,0,-1,0,0,Advertising Business Evian Evian Babies Market...,Advertising Business Evian Evian Babies Market...,0.000,1.000,0.000,0.0000


In [28]:
# tf-idf on 'combi_text2'('topic' + 'title')

In [29]:
## define preprocessor and tokenizer

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

nltk.download('stopwords')
stop = stopwords.words('english')

# define extra stopwords
extra_stopwords = ["ain't", "amn't", "aren't", "can't", "could've", "couldn't",
                    "daresn't", "didn't", "doesn't", "don't", "gonna", "gotta", 
                    "hadn't", "hasn't", "haven't", "he'd", "he'll", "he's", "how'd",
                    "how'll", "how's", "I'd", "I'll", "I'm", "I've", "isn't", "it'd",
                    "it'll", "it's", "let's", "mayn't", "may've", "mightn't", 
                    "might've", "mustn't", "must've", "needn't", "o'clock", "ol'",
                    "oughtn't", "shan't", "she'd", "she'll", "she's", "should've",
                    "shouldn't", "somebody's", "someone's", "something's", "that'll",
                    "that're", "that's", "that'd", "there'd", "there're", "there's", 
                    "these're", "they'd", "they'll", "they're", "they've", "this's",
                    "those're", "tis", "twas", "twasn't", "wasn't", "we'd", "we'd've",
                    "we'll", "we're", "we've", "weren't", "what'd", "what'll", 
                    "what're", "what's", "what've", "when's", "where'd", "where're",
                    "where's", "where've", "which's", "who'd", "who'd've", "who'll",
                    "who're", "who's", "who've", "why'd", "why're", "why's", "won't",
                    "would've", "wouldn't", "y'all", "you'd", "you'll", "you're", 
                    "you've", "'s", "'d", "'m", "abov", "afterward", "ai", "alon", "alreadi", "alway", "ani", 
                     "anoth", "anyon", "anyth", "anywher", "becam", "becaus", "becom", "befor", 
                     "besid", "ca", "cri", "dare", "describ", "did", "doe", "dure", "els", 
                     "elsewher", "empti", "everi", "everyon", "everyth", "everywher", "fifti", 
                     "forti", "gon", "got", "henc", "hereaft", "herebi", "howev", "hundr", "inde", 
                     "let", "ll", "mani", "meanwhil", "moreov", "n't", "na", "need", "nobodi", "noon", 
                     "noth", "nowher", "ol", "onc", "onli", "otherwis", "ought", "ourselv", "perhap", 
                     "pleas", "sever", "sha", "sinc", "sincer", "sixti", "somebodi", "someon", "someth", 
                     "sometim", "somewher", "ta", "themselv", "thenc", "thereaft", "therebi", "therefor", 
                     "togeth", "twelv", "twenti", "ve", "veri", "whatev", "whenc", "whenev", 
                    "wherea", "whereaft", "wherebi", "wherev", "whi", "wo", "anywh", "el", "elsewh", "everywh", 
                    "ind", "otherwi", "plea", "somewh", "yourselv"]

stop = stop + extra_stopwords

def preprocessor(text):
    # remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()
    
    # regex for matching emoticons, keep emoticons, ex: :), :-P, :-D
    r = '(?::|;|=|X)(?:-)?(?:\)|\(|D|P)'
    emoticons = re.findall(r, text)
    text = re.sub(r, '', text)
    
    # convert to lowercase and append all emoticons behind (with space in between)
    # replace('-','') removes nose of emoticons
    text = re.sub('[\W]+', ' ', text.lower()) + ' ' + ' '.join(emoticons).replace('-','')
    return text

def tokenizer_stem_nostop(text):
    porter = PorterStemmer()
    return [porter.stem(w) for w in re.split('\s+', text.strip()) \
            if w not in stop and re.match('[a-zA-Z]+', w)]



[nltk_data] Downloading package stopwords to /Users/chku/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [30]:
## check tokenize, stemmization and non_stopwords
tokenizer_stem_nostop(preprocessor(df_train['combi_text2'][0]) )

['asteroid',
 'asteroid',
 'challeng',
 'earth',
 'space',
 'u',
 'world',
 'nasa',
 'grand',
 'challeng',
 'stop',
 'asteroid',
 'destroy',
 'earth']

In [31]:
## define tf-idf vector
tfidf_vectorizer = TfidfVectorizer(preprocessor=preprocessor,  tokenizer=tokenizer_stem_nostop,
                                  ngram_range=(1,1), 
                                  sublinear_tf = False,
                                  dtype = np.float32)

In [32]:
## from lab code
tfidf_vectorizer.fit(df_train['combi_text2'])

top = 10
# get idf score of vocabularies
idf = tfidf_vectorizer.idf_
print('[vocabularies with smallest idf scores]')
sorted_idx = idf.argsort()

for i in range(top):
    print('%s: %.2f' %(tfidf_vectorizer.get_feature_names()[sorted_idx[i]], idf[sorted_idx[i]]))

doc_tfidf = tfidf_vectorizer.transform(df_train['combi_text2']).toarray()
tfidf_sum = np.sum(doc_tfidf, axis=0)
print("\n[vocabularies with highest tf-idf scores]")
for tok, v in zip(tfidf_vectorizer.inverse_transform(np.ones(tfidf_sum.shape[0]).reshape(1, -1))[0][tfidf_sum.argsort()[::-1]][:top], \
                        np.sort(tfidf_sum)[::-1][:top]):
    print('{}: {}'.format(tok, v))



[vocabularies with smallest idf scores]
world: 2.34
tech: 2.57
entertain: 2.68
watercool: 2.70
busi: 2.78
u: 2.79
video: 2.87
app: 2.99
softwar: 3.07
mobil: 3.07





[vocabularies with highest tf-idf scores]
video: 1070.1407470703125
world: 702.65478515625
app: 695.5721435546875
tech: 608.2487182617188
busi: 598.9265747070312
watercool: 545.3905639648438
u: 519.1368408203125
entertain: 511.7611389160156
mobil: 472.13372802734375
media: 462.6010437011719


In [33]:
## apply tf-idf on combi_text2 of training 
#all_topic = df_train['topic'].values.astype('U').tolist() + df_test['topic'].values.astype('U').tolist()
train_tfidf_matrix = tfidf_vectorizer.fit_transform(df_train['combi_text2'].values.astype('U').tolist())

#tfidf_matrix = tfidf_vectorizer.fit_transform(pd.concat(df_train['combi_text2'], df_test['combi_text2']), axis = 0)

In [34]:
## check shape
print(train_tfidf_matrix.shape)

(27643, 15724)


In [35]:
# apply hashing vector on combi_text2 of training and testing

from sklearn.feature_extraction.text import HashingVectorizer

hashvec = HashingVectorizer(n_features=1024,
                            preprocessor=preprocessor,
                            tokenizer=tokenizer_stem_nostop)


In [36]:
train_hash_matrix = hashvec.fit_transform(df_train['combi_text2'].values.astype('U').tolist())
test_hash_matrix = hashvec.fit_transform(df_test['combi_text2'].values.astype('U').tolist())



In [37]:
## check shape
print(train_hash_matrix.shape)
print(test_hash_matrix.shape)

(27643, 1024)
(11847, 1024)


In [38]:
##( or use another method ?)

In [39]:
# one-hot-encode on ['media count'], ['img count']
# 先不管這兩個features，可能要用EDA找特性
##(ignore now)

In [40]:
# 先試試不要one-hot-encode已經處理過的資料，看效果如何
##(ignore now)
'''

from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
# one-hot-encode on 'weekday', 'author', 'hour', ...
OHE = OneHotEncoder(handle_unknown='ignore')
train_ohe_channel = OHE.fit_transform(df_train['weekday'].values.reshape(-1,1)).toarray()

# OHE = OneHotEncoder(handle_unknown='ignore')
test_ohe_channel = OHE.transform(df_test['channel'].values.reshape(-1,1)).toarray()
print(train_ohe_channel.shape)
print(test_ohe_channel.shape)
'''

"\n\nfrom sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler\n# one-hot-encode on 'weekday', 'author', 'hour', ...\nOHE = OneHotEncoder(handle_unknown='ignore')\ntrain_ohe_channel = OHE.fit_transform(df_train['weekday'].values.reshape(-1,1)).toarray()\n\n# OHE = OneHotEncoder(handle_unknown='ignore')\ntest_ohe_channel = OHE.transform(df_test['channel'].values.reshape(-1,1)).toarray()\nprint(train_ohe_channel.shape)\nprint(test_ohe_channel.shape)\n"

# so far, we process 'pub_date', 'channel' -> by observing EDA,  'author' -> by giving different weight by average popularity, ('topic', 'title') -> by sentimental detect and tf-idf
# (Note: we haven't process 'Page Content' -> may turn into abstract, ('img count', 'media-count') -> may use EDA to exploring some properties

In [41]:
df_train.columns

Index(['Id', 'topic', 'channel', 'weekday', 'author', 'img count', 'title',
       'media count', 'pub_date', 'num_words', 'Page Content', 'Popularity',
       'day_of_month', 'month', 'day_of_week', 'hour', 'ymd', 'is_weekend',
       'popular_month', 'popular_hour', 'popular_channel', 'popular_author',
       'combi_text', 'combi_text2', 'sentiment_pos', 'sentiment_neu',
       'sentiment_neg', 'sentiment_compound'],
      dtype='object')

In [42]:
# drop already processed "numerical" features + 一些還沒process的numerical features
## (as ._get_numeric_data() will ignore categorical features, \
##  and all categorical features expect 'Page Content' are already processed)

df_train_dropped = df_train.drop(['weekday',  'hour', 'month', 'day_of_week', 'day_of_month',
                # not processed numerical features
               'img count', 'media count', 'num_words', 'Id', ], axis = 1)

In [43]:
df_test_dropped = df_test.drop(['weekday',  'hour', 'month', 'day_of_week', 'day_of_month',
                # not processed numerical features
               'img count', 'media count', 'num_words', 'Id', ], axis = 1)

In [44]:
df_train_dropped.columns

Index(['topic', 'channel', 'author', 'title', 'pub_date', 'Page Content',
       'Popularity', 'ymd', 'is_weekend', 'popular_month', 'popular_hour',
       'popular_channel', 'popular_author', 'combi_text', 'combi_text2',
       'sentiment_pos', 'sentiment_neu', 'sentiment_neg',
       'sentiment_compound'],
      dtype='object')

In [45]:
df_test_dropped.columns

Index(['topic', 'channel', 'author', 'title', 'pub_date', 'Page Content',
       'ymd', 'is_weekend', 'popular_month', 'popular_hour', 'popular_channel',
       'popular_author', 'combi_text', 'combi_text2', 'sentiment_pos',
       'sentiment_neu', 'sentiment_neg', 'sentiment_compound'],
      dtype='object')

In [46]:
# concat tf-idf matrix to df_train_dropped

## error: cannot concat sparse matrix
## solved: https://stackoverflow.com/questions/40570282/combine-sklearn-tfidf-with-additional-data

## for tfidf 但太大了，在算corr()會卡很久
#df_train_dropped_tfidf = pd.concat([df_train_dropped, pd.DataFrame(train_tfidf_matrix.toarray())], axis=1)

## for hashvec
df_train_dropped_hash = pd.concat([df_train_dropped, pd.DataFrame(train_hash_matrix.toarray())], axis=1)
df_test_dropped_hash = pd.concat([df_test_dropped, pd.DataFrame(test_hash_matrix.toarray())], axis=1)
display(df_train_dropped_hash.head(3))
display(df_test_dropped_hash.head(3))

Unnamed: 0,topic,channel,author,title,pub_date,Page Content,Popularity,ymd,is_weekend,popular_month,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,Asteroid Asteroids challenge Earth Space U.S. ...,world,,NASA's Grand Challenge: Stop Asteroids From De...,2013-06-19 15:04:30,"<html><head><div class=""article-info""> <span c...",-1,2013-06-19,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Apps and Software Google open source opn pledg...,tech,Christina Warren,Google's New Open Source Patent Pledge: We Won...,2013-03-28 17:40:55,"<html><head><div class=""article-info""><span cl...",1,2013-03-28,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Entertainment NFL NFL Draft Sports Television,entertainment,Sam Laird,Ballin': 2014 NFL Draft Picks Get to Choose Th...,2014-05-07 19:15:20,"<html><head><div class=""article-info""><span cl...",1,2014-05-07,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,topic,channel,author,title,pub_date,Page Content,ymd,is_weekend,popular_month,popular_hour,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,Entertainment Music One Direction soccer Sports,entertainment,Sam Laird,Soccer Star Gets Twitter Death Threats After T...,2013-09-09 19:47:02,"<html><head><div class=""article-info""><span cl...",2013-09-09,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Gadgets glass Google Google Glass Google Glass...,tech,Stan Schroeder,Google Glass Gets an Accessory Store,2013-10-31 09:25:02,"<html><head><div class=""article-info""><span cl...",2013-10-31,0,-1,0,...,0.0,0.0,0.0,0.0,0.164399,0.0,0.0,0.0,0.0,0.0
2,amazon amazon kindle Business Gaming,business,Todd Wasserman,OUYA Gaming Console Already Sold Out on Amazon,2013-06-25 12:54:54,"<html><head><div class=""article-info""><span cl...",2013-06-25,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [47]:
# ignore categorical data and find top features

## ._get_numeric_data() -> get numeric column

#df_train_dropped_tfidf._get_numeric_data().head(5)
display(df_train_dropped_hash._get_numeric_data().head(5))
display(df_test_dropped_hash._get_numeric_data().head(5))

Unnamed: 0,Popularity,is_weekend,popular_month,popular_hour,popular_channel,popular_author,sentiment_pos,sentiment_neu,sentiment_neg,sentiment_compound,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,-1,0,0,0,-2,0,0.262,0.467,0.271,-0.296,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0,1,0,2,0,0.084,0.851,0.065,0.1481,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,0,0,0,-2,0,0.141,0.859,0.0,0.4215,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-1,0,-1,0,0,0,0.246,0.538,0.215,0.1027,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-1,0,0,0,-2,0,0.556,0.444,0.0,0.9153,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,is_weekend,popular_month,popular_hour,popular_channel,popular_author,sentiment_pos,sentiment_neu,sentiment_neg,sentiment_compound,0,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,0,0,0,0,0,0.119,0.596,0.285,-0.5994,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,-1,0,0,0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.164399,0.0,0.0,0.0,0.0,0.0
2,0,0,0,0,0,0.338,0.662,0.0,0.4767,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,0,0,0,0,0.0,0.842,0.158,-0.4588,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,-1,0,0,0,0.304,0.696,0.0,0.8074,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [50]:
## find features according to corr() of each column to 'Popularity'

## for tfidf

#top_feats = abs(df_train_dropped_tfidf._get_numeric_data().corr()['Popularity']).sort_values(ascending=False).index[0:42]
#print(abs(df_train_dropped_tfidf._get_numeric_data().corr()['Popularity']).sort_values(ascending=False))

## for hashvec

feats = abs(df_train_dropped_hash._get_numeric_data().corr()['Popularity']).sort_values(ascending=False)
print(feats)

Popularity         1.000000
popular_author     0.136841
is_weekend         0.097561
popular_month      0.064952
popular_channel    0.044723
                     ...   
356                0.000048
694                0.000044
830                0.000041
313                0.000037
427                0.000012
Name: Popularity, Length: 1034, dtype: float64


In [63]:
# choose top correlated features
num_top_feats = 42
top_feats = feats.index[0:num_top_feats]

In [64]:
# some features has dropped 
#print([col for col in df_train_dropped_hash._get_numeric_data().columns if col not in top_feats])

In [65]:
# drop processed features and concat tf-idf
df_train_processed = df_train_dropped_hash[top_feats]
df_test_processed = df_test_dropped_hash[top_feats[1:]] # should not contain 'Popularity'
display(df_train.head(3))
display(df_train_processed.head(3))
display(df_test_processed.head(3))

Unnamed: 0,Id,topic,channel,weekday,author,img count,title,media count,pub_date,num_words,...,popular_month,popular_hour,popular_channel,popular_author,combi_text,combi_text2,sentiment_pos,sentiment_neu,sentiment_neg,sentiment_compound
0,0,Asteroid Asteroids challenge Earth Space U.S. ...,world,Wed,,1,NASA's Grand Challenge: Stop Asteroids From De...,0,2013-06-19 15:04:30,536,...,0,0,-2,0,Asteroid Asteroids challenge Earth Space U.S. ...,Asteroid Asteroids challenge Earth Space U.S. ...,0.262,0.467,0.271,-0.296
1,1,Apps and Software Google open source opn pledg...,tech,Thu,Christina Warren,2,Google's New Open Source Patent Pledge: We Won...,0,2013-03-28 17:40:55,305,...,1,0,2,0,Apps and Software Google open source opn pledg...,Apps and Software Google open source opn pledg...,0.084,0.851,0.065,0.1481
2,2,Entertainment NFL NFL Draft Sports Television,entertainment,Wed,Sam Laird,2,Ballin': 2014 NFL Draft Picks Get to Choose Th...,25,2014-05-07 19:15:20,1011,...,0,0,-2,0,Entertainment NFL NFL Draft Sports Television....,Entertainment NFL NFL Draft Sports Television....,0.141,0.859,0.0,0.4215


Unnamed: 0,Popularity,popular_author,is_weekend,popular_month,popular_channel,310,500,820,736,279,...,765,259,52,354,239,482,459,816,sentiment_compound,719
0,-1,0,0,0,-2,0.0,0.0,0.0,0.204124,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.296,0.0
1,1,0,0,1,2,0.0,0.0,0.0,0.158114,0.0,...,0.0,0.0,0.0,-0.316228,0.0,0.0,0.0,0.0,0.1481,0.0
2,1,0,0,0,-2,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.4215,0.0


Unnamed: 0,popular_author,is_weekend,popular_month,popular_channel,310,500,820,736,279,507,...,765,259,52,354,239,482,459,816,sentiment_compound,719
0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.5994,0.0
1,0,0,-1,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,-0.657596,0.0,0.0,0.0,0.0,0.0,0.0
2,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.4767,0.0


In [66]:
# save output
df_train_processed.to_csv('../chk_output/train_processed.csv', index=False)
df_test_processed.to_csv('../chk_output/test_processed.csv', index=False)