# 1- Import key Modules

In [26]:
# support both Python 2 and Python 3 with minimal overhead.
from __future__ import absolute_import, division, print_function

# I am an engineer. I care only about error not warning. So, let's be maverick and ignore warnings.
import warnings
warnings.filterwarnings('ignore')

In [27]:
import re    # for regular expressions 
import nltk  # for text manipulation 
import string 
import numpy as np 
import pandas as pd 

# 2-Loading data

In [29]:
filename1="newsCorpora.csv"
header_list = ["id", "title", "url","publisher","category","story","hostname","timestamp"]
df=pd.read_csv(filename1,sep='\t', names=header_list)
df.shape

(422419, 8)

In [30]:
df.head(2)

Unnamed: 0,id,title,url,publisher,category,story,hostname,timestamp
0,1,"Fed official says weak data caused by weather,...",http://www.latimes.com/business/money/la-fi-mo...,Los Angeles Times,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.latimes.com,1394470370698
1,2,Fed's Charles Plosser sees high bar for change...,http://www.livemint.com/Politics/H2EvwJSK2VE6O...,Livemint,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.livemint.com,1394470371207


In [31]:
# use only concerned features
df=df[["title","publisher","category"]]

In [32]:
df=df.fillna("other") 

In [33]:
df.isnull().sum()

title        0
publisher    0
category     0
dtype: int64

In [34]:
features=df[["title","publisher"]]
labels = df[['category']]
print(features.shape)
print(labels.shape)

(422419, 2)
(422419, 1)


In [35]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.20, random_state=0)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(337935, 2)
(84484, 2)
(337935, 1)
(84484, 1)


In [38]:
y_train.category.value_counts(normalize=True)

e    0.361516
b    0.274017
t    0.256659
m    0.107808
Name: category, dtype: float64

In [39]:
y_test.category.value_counts(normalize=True)

e    0.358648
b    0.276585
t    0.255788
m    0.108979
Name: category, dtype: float64

Both have same distribution. So, it is good enough.

In [40]:
X_train.head(2)

Unnamed: 0,title,publisher
30517,Toyota reaches settlement in U.S. criminal inv...,Examiner.com
16533,Sally Beauty releases info on data breach,Tribune-Review


I will use train set like this but, I will change test set for features. I shall keep only title.

In [41]:
X_test=X_test[["title"]]
X_test.shape

(84484, 1)

In [42]:
X_test.head(2)

Unnamed: 0,title
153245,iPhone 6 Release Date Pushed Back Due to Issue...
308611,Samsung Galaxy S4 vs Galaxy S3: Budget-Friendl...


### save data

In [53]:
X_test.to_csv('test_data.csv')
y_test.to_csv('test_label.csv',index=False)

# 3-Making train set

In [45]:
X_train['news']=X_train.title.str.cat(X_train.publisher)

In [46]:
X_train.head(2)

Unnamed: 0,title,publisher,news
30517,Toyota reaches settlement in U.S. criminal inv...,Examiner.com,Toyota reaches settlement in U.S. criminal inv...
16533,Sally Beauty releases info on data breach,Tribune-Review,Sally Beauty releases info on data breachTribu...


In [47]:
X_train.news[0]

'Fed official says weak data caused by weather, should not slow taperLos Angeles Times'

In [48]:
df_train=X_train[["news"]]
df_train["category"]=y_train

In [49]:
df_train.shape

(337935, 2)

In [50]:
df_train.isnull().sum()

news        0
category    0
dtype: int64

In [51]:
df_train.head(2)

Unnamed: 0,news,category
30517,Toyota reaches settlement in U.S. criminal inv...,t
16533,Sally Beauty releases info on data breachTribu...,b


In [52]:
df_train.to_csv('train_data.csv')

We can also use a smaller random sample i.e 10K

In [55]:
sample_data=df_train.sample(10000)
sample_data.shape

(10000, 2)

In [56]:
sample_data.head()

Unnamed: 0,news,category
22513,Top 5 Reasons Why 'Divergent' Star Kate Winsle...,e
283690,Vessyl Bottle Tracks Your Drink And Its Health...,t
120019,Conjoined twins to leave Dallas hospital month...,m
126710,Did Lorde get married in Vegas? Royals singer ...,e
325748,Free HIV testing available Thursday afternoon ...,m


In [57]:
sample_data.category.value_counts(normalize=True)

e    0.3507
b    0.2746
t    0.2583
m    0.1164
Name: category, dtype: float64

again consistent with original data

In [58]:
sample_data.to_csv('train_data_10000.csv')