In [1]:
%load_ext autoreload
%autoreload 2

In [26]:
import pandas as pd
import numpy as np
import warnings
from sklearn import linear_model, preprocessing
from sklearn.model_selection import train_test_split
warnings.filterwarnings('ignore')

In [3]:
articles = pd.read_csv('../Datasets/articles_filter_en.csv', sep='\t', encoding='utf-8', index_col=[0])

In [4]:
for l in articles.site_categories.unique():
    if 'tech' in l:
        print(l)

['tech']
['computer_certification', 'tech']
['news', 'tech']
['law_government_and_politics', 'home_and_garden', 'tech', 'hacking', 'environmental_safety', 'us_government_resources']
['shopping', 'media', 'cell_phones', 'tech']
['windows', 'tech']
['cell_phones', 'tech']
['pc_support', 'tech']
['media', 'tech']
['tech', 'email']
['tech', 'law_government_and_politics', 'us_government_resources']
['databases', 'tech', 'law_government_and_politics', 'us_government_resources']
['biotech_biomedical', 'business']
['javascript', 'tech']
['media', 'cell_phones', 'tech', 'computer_reviews']
['windows', 'hobbies_and_interests', 'tech', 'video_and_computer_games']
['search_engine', 'media', 'tech']
['media', 'hacking', 'tech']


In [6]:
articles.columns

Index(['site', 'site_type', 'site_section', 'site_categories', 'domain_rank',
       'country', 'author', 'published', 'title', 'text', 'highlightText',
       'highlightTitle', 'language', 'rating', 'locations_pos',
       'locations_neu', 'locations_neg', 'organizations_pos',
       'organizations_neu', 'organizations_neg', 'persons_pos', 'persons_neu',
       'persons_neg', 'rank', 'website', 'category', 'change',
       'avg_visit_duration', 'pages/visit', 'bounce_rate', 'subcategory'],
      dtype='object')

In [47]:
X = articles[['site_type', 'site_categories', 'domain_rank', 'country', 'avg_visit_duration', 'pages/visit', 'bounce_rate', 'rating']]
y = articles[['category', 'subcategory', 'site_categories', 'site_section']]

X['has_author'] = np.where(articles.author.isnull() , False, True)

X['has_person_pos'] = (not articles.persons_pos.empty)
X['has_person_neu'] = (not articles.persons_neu.empty)
X['has_person_neg'] = (not articles.persons_neg.empty)
X['has_organizations_pos'] = (not articles.organizations_pos.empty)
X['has_organizations_neu'] = (not articles.organizations_neu.empty)
X['has_organizations_neg'] = (not articles.organizations_neg.empty)

def get_sec(time_str):
    h, m, s = str(time_str).split(':')
    return int(h) * 3600 + int(m) * 60 + int(s)

X.avg_visit_duration = X.avg_visit_duration.map(get_sec)

y['class'] = np.where((y.category == 'News and Media') & ('tech' not in y.site_categories) , 'News', 'Science')

In [9]:
y.loc[(y.category == 'Health')].groupby(['category', 'subcategory','site_categories']).size()

category  subcategory               site_categories                                                        
Health    Child Health              ['pregnancy', 'family_and_parenting']                                      100
          Conditions and Diseases   ['senor_health', 'mens_health', 'health', 'arthritis', 'womens_health']     96
                                    []                                                                           2
          Education and Resources   ['social']                                                                 100
          Healthcare Industry       ['health']                                                                   1
          Medicine                  ['orthopedics', 'health']                                                   20
                                    ['uncategorized']                                                            5
                                    []                                                 

In [23]:
y.groupby(['class']).count()

Unnamed: 0_level_0,category,subcategory,site_categories,site_section
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
News,5318,4161,5318,4641
Science,4138,2657,4138,3257


In [12]:
y.loc[(y.category == 'News and Media') & (y.subcategory == 'Technology News')]

Unnamed: 0,category,subcategory,site_categories,site_section,class
0,News and Media,Technology News,"['hobbies_and_interests', 'video_and_computer_...",http://forum.telecharger.01net.com/forum/high-...,News
1,News and Media,Technology News,"['hobbies_and_interests', 'video_and_computer_...",http://forum.telecharger.01net.com/forum/high-...,News
2,News and Media,Technology News,"['hobbies_and_interests', 'video_and_computer_...",http://forum.telecharger.01net.com/forum/high-...,News
3,News and Media,Technology News,"['hobbies_and_interests', 'video_and_computer_...",http://forum.telecharger.01net.com/forum/high-...,News
4,News and Media,Technology News,"['hobbies_and_interests', 'video_and_computer_...",http://forum.telecharger.01net.com/forum/high-...,News
5,News and Media,Technology News,"['hobbies_and_interests', 'video_and_computer_...",http://forum.telecharger.01net.com/forum/high-...,News
6,News and Media,Technology News,"['hobbies_and_interests', 'video_and_computer_...",http://forum.telecharger.01net.com/forum/high-...,News
7,News and Media,Technology News,"['hobbies_and_interests', 'video_and_computer_...",http://forum.telecharger.01net.com/forum/high-...,News
8,News and Media,Technology News,"['hobbies_and_interests', 'video_and_computer_...",http://forum.telecharger.01net.com/forum/high-...,News
9,News and Media,Technology News,"['hobbies_and_interests', 'video_and_computer_...",http://forum.telecharger.01net.com/forum/high-...,News


# Training

In [48]:
le = preprocessing.LabelEncoder()
X_2 = X.apply(le.fit_transform)

In [49]:
X_train, X_test, y_train, y_test = train_test_split(X_2, y['class'], test_size=0.4, random_state=0)

logreg = linear_model.LogisticRegression(C=1e5)

clf = logreg.fit(X_2, y['class'])

# Validation

In [50]:
clf.score(X_test, y_test)                           

0.79011366640232616