In [1]:
import pandas as pd
import numpy as np
import pickle
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the pickled news DataFrame from the working directory.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only run this cell on a pickle file whose origin is trusted.
with open("News_dataset.pickle", 'rb') as data:
    df = pickle.load(data)

In [3]:
# Preview the first rows of the loaded dataset
df.head()

Unnamed: 0,File_Name,Content,Category,Complete_Filename,id,News_length
0,001.txt,Ad sales boost Time Warner profit\r\n\r\nQuart...,business,001.txt-business,1,2569
1,002.txt,Dollar gains on Greenspan speech\r\n\r\nThe do...,business,002.txt-business,1,2257
2,003.txt,Yukos unit buyer faces loan claim\r\n\r\nThe o...,business,003.txt-business,1,1557
3,004.txt,High fuel prices hit BA's profits\r\n\r\nBriti...,business,004.txt-business,1,2421
4,005.txt,Pernod takeover talk lifts Domecq\r\n\r\nShare...,business,005.txt-business,1,1575


In [4]:
# Replace carriage returns, newlines and four-space runs with a single space,
# then drop double-quote characters.
df['Content_Parsed_1'] = (
    df['Content']
    .str.replace("\r", " ")
    .str.replace("\n", " ")
    .str.replace("    ", " ")
    .str.replace('"', '')
)

In [5]:
# Lowercase the text
df['Content_Parsed_2'] = df['Content_Parsed_1'].str.lower()

# Remove punctuation signs.
# regex=False forces literal matching: several of these signs ("?", ".") are
# regex metacharacters, and the default value of `regex` in Series.str.replace
# differs between pandas versions.
punctuation_signs = list("?:!.,;")
df['Content_Parsed_3'] = df['Content_Parsed_2']

for punct_sign in punctuation_signs:
    df['Content_Parsed_3'] = df['Content_Parsed_3'].str.replace(punct_sign, '', regex=False)

# Remove every occurrence of the two-character sequence "'s" (literal match)
df['Content_Parsed_4'] = df['Content_Parsed_3'].str.replace("'s", "", regex=False)

In [6]:
# Show the sixth article (position 5) after punctuation cleanup
df['Content_Parsed_4'].iloc[5]

'japan narrowly escapes recession japan economy teetered on the brink of a technical recession in the three months to september figures show revised figures indicated growth of just 01% - and a similar-sized contraction in the previous quarter on an annual basis the data suggests annual growth of just 02% suggesting a much more hesitant recovery than had previously been thought a common technical definition of a recession is two successive quarters of negative growth the government was keen to play down the worrying implications of the data i maintain the view that japan economy remains in a minor adjustment phase in an upward climb and we will monitor developments carefully said economy minister heizo takenaka but in the face of the strengthening yen making exports less competitive and indications of weakening economic conditions ahead observers were less sanguine it painting a picture of a recovery much patchier than previously thought said paul sheard economist at lehman brothers in

# WordNet Lemmatization

In [7]:
# Download the NLTK data packages this notebook needs. 'stopwords' is included
# because stopwords.words('english') is called further down and raises a
# LookupError when that corpus has not been downloaded.
for nltk_resource in ('punkt', 'wordnet', 'averaged_perceptron_tagger', 'stopwords'):
    nltk.download(nltk_resource)

from nltk.corpus import wordnet

[nltk_data] Downloading package punkt to C:\Users\Mitesh Manoj
[nltk_data]     Adake\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Mitesh Manoj
[nltk_data]     Adake\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Mitesh Manoj
[nltk_data]     Adake\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [8]:
# Single lemmatizer instance, used by lemmatize_sentence below
lemmatizer = WordNetLemmatizer()

In [9]:
# Convert nltk tag to wordnet tag
def nltk_tag_to_wordnet_tag(nltk_tag):
    """Map an NLTK POS tag to the matching WordNet POS constant.

    The decision is made on the tag's leading letter: 'J' -> wordnet.ADJ,
    'V' -> wordnet.VERB, 'N' -> wordnet.NOUN, 'R' -> wordnet.ADV.

    Returns None when the tag starts with none of these letters.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if nltk_tag.startswith(prefix):
            return wordnet_pos
    return None

In [10]:
def lemmatize_sentence(sentence):
    """Return `sentence` with each token replaced by its lemma.

    The text is tokenized with nltk.word_tokenize and tagged with
    nltk.pos_tag. Tokens whose tag maps to a WordNet POS (see
    nltk_tag_to_wordnet_tag) are lemmatized with that POS; all other tokens
    are kept unchanged. The resulting tokens are joined with single spaces.
    """
    tokens = nltk.word_tokenize(sentence)
    lemmas = []
    for token, pos_tag in nltk.pos_tag(tokens):
        wordnet_pos = nltk_tag_to_wordnet_tag(pos_tag)
        if wordnet_pos is not None:
            # A usable WordNet POS exists: lemmatize with it
            lemmas.append(lemmatizer.lemmatize(token, wordnet_pos))
        else:
            # No WordNet equivalent for this tag: keep the token as is
            lemmas.append(token)
    return " ".join(lemmas)

In [11]:
# Lemmatize every article.
# Iterating over the column's values (instead of looking rows up with
# df.loc[0], df.loc[1], ...) keeps this correct when the index labels are not
# exactly 0..n-1, and avoids materializing a full row Series per article.
nrows = len(df)
lemmatized_text_list = [lemmatize_sentence(text) for text in df['Content_Parsed_4']]

# The list is in row order, so it is assigned positionally
df['Content_Parsed_5'] = lemmatized_text_list

In [12]:
# Show the sixth article (position 5) after lemmatization
df['Content_Parsed_5'].iloc[5]

'japan narrowly escape recession japan economy teeter on the brink of a technical recession in the three month to september figure show revised figure indicate growth of just 01 % - and a similar-sized contraction in the previous quarter on an annual basis the data suggest annual growth of just 02 % suggest a much more hesitant recovery than have previously be think a common technical definition of a recession be two successive quarter of negative growth the government be keen to play down the worrying implication of the data i maintain the view that japan economy remain in a minor adjustment phase in an upward climb and we will monitor development carefully say economy minister heizo takenaka but in the face of the strengthen yen make export less competitive and indication of weaken economic condition ahead observer be less sanguine it paint a picture of a recovery much patchy than previously think say paul sheard economist at lehman brother in tokyo improvement in the job market appa

# Stop Words Removal

In [13]:
# English stop-word list from NLTK, consumed by the removal loop below
stop_words = list(stopwords.words('english'))

In [14]:
df['Content_Parsed_6'] = df['Content_Parsed_5']

for stop_word in stop_words:
    # \b...\b anchors the pattern on word boundaries so only whole words match
    regex_stopword = r"\b" + stop_word + r"\b"
    # regex=True must be explicit: with pandas >= 2.0 the default is a literal
    # match, so the "\b" anchors would be searched for verbatim and no stop
    # word would ever be removed.
    df['Content_Parsed_6'] = df['Content_Parsed_6'].str.replace(regex_stopword, '', regex=True)

In [15]:
# Show the sixth article (position 5) after stop-word removal.
# Positional .iloc is used, like the earlier preview cells, so the same article
# is displayed regardless of the index labels.
df['Content_Parsed_6'].iloc[5]

'japan narrowly escape recession japan economy teeter   brink   technical recession   three month  september figure show revised figure indicate growth   01 % -   similar-sized contraction   previous quarter   annual basis  data suggest annual growth   02 % suggest  much  hesitant recovery   previously  think  common technical definition   recession  two successive quarter  negative growth  government  keen  play   worrying implication   data  maintain  view  japan economy remain   minor adjustment phase   upward climb    monitor development carefully say economy minister heizo takenaka    face   strengthen yen make export less competitive  indication  weaken economic condition ahead observer  less sanguine  paint  picture   recovery much patchy  previously think say paul sheard economist  lehman brother  tokyo improvement   job market apparently  yet  fee   domestic demand  private consumption   02 %   third quarter'

In [16]:
# Keep only the listed columns (every other column, including the intermediate
# Content_Parsed_1..5 ones, is dropped) and expose the final cleaned text
# under the name 'Content_Parsed'.
list_columns = ["File_Name", "Category", "Complete_Filename", "Content", "Content_Parsed_6"]
df = df[list_columns].rename(columns={'Content_Parsed_6': 'Content_Parsed'})

# Label Encoding

In [17]:
# Integer code assigned to each news category
category_codes = dict(
    business=0,
    entertainment=1,
    politics=2,
    sport=3,
    tech=4,
)

# Category mapping: copy the category names into a new column, then swap each
# name in that column for its integer code.
df['Category_Code'] = df['Category']
df = df.replace(to_replace={'Category_Code': category_codes})

In [18]:
# Preview the frame with the cleaned text and the numeric category code
df.head()

Unnamed: 0,File_Name,Category,Complete_Filename,Content,Content_Parsed,Category_Code
0,001.txt,business,001.txt-business,Ad sales boost Time Warner profit\r\n\r\nQuart...,ad sale boost time warner profit quarterly pro...,0
1,002.txt,business,002.txt-business,Dollar gains on Greenspan speech\r\n\r\nThe do...,dollar gain greenspan speech dollar hit hi...,0
2,003.txt,business,003.txt-business,Yukos unit buyer faces loan claim\r\n\r\nThe o...,yukos unit buyer face loan claim owner embat...,0
3,004.txt,business,004.txt-business,High fuel prices hit BA's profits\r\n\r\nBriti...,high fuel price hit ba profit british airway ...,0
4,005.txt,business,005.txt-business,Pernod takeover talk lifts Domecq\r\n\r\nShare...,pernod takeover talk lift domecq share uk dri...,0


In [19]:
# Create a TF-IDF vectorizer with default settings, fit it on the cleaned text
# and keep the transformed matrix for the cells below.
tfidf = TfidfVectorizer()
tfidf_vectorized = tfidf.fit_transform(df['Content_Parsed'])

In [20]:
# Column labels for the TF-IDF matrix.
# get_feature_names() was removed in scikit-learn 1.2; its replacement
# get_feature_names_out() exists since 1.0. Prefer the new method and fall
# back to the old one so the cell runs on both old and new releases.
if hasattr(tfidf, 'get_feature_names_out'):
    tfidf_feature_names = tfidf.get_feature_names_out()
else:
    tfidf_feature_names = tfidf.get_feature_names()

# Dense view of the matrix: one row per document, one column per vocabulary term
df_tfidf_vectorized = pd.DataFrame(tfidf_vectorized.toarray(), columns=tfidf_feature_names)
df_tfidf_vectorized.head()

Unnamed: 0,000,0001,00051,001,002,003,004secs,007,01,0100,...,zoom,zooropa,zornotza,zorro,zubair,zuluaga,zurich,zutons,zvonareva,zvyagintsev
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
# Non-zero elements of the first document's TF-IDF vector
[weight for weight in df_tfidf_vectorized.loc[0].tolist() if weight]

[0.07049552217154262,
 0.07049552217154262,
 0.06243614727234318,
 0.03269809289193229,
 0.051657226357623265,
 0.026365272052807164,
 0.07677120150944808,
 0.07049552217154262,
 0.052773135309825754,
 0.07049552217154262,
 0.07049552217154262,
 0.07049552217154262,
 0.05167224379413272,
 0.05947665439113074,
 0.07049552217154262,
 0.04940446853513591,
 0.055501150291405066,
 0.06469243973819575,
 0.046108795146465965,
 0.04940446853513591,
 0.04697463982251519,
 0.09335290710711319,
 0.04863914669648686,
 0.02446102199888857,
 0.027518851293778932,
 0.03312723422312383,
 0.02754809306008016,
 0.3654383383191459,
 0.02599457381062731,
 0.045320597735662146,
 0.019435572062414957,
 0.03331776122221313,
 0.07049552217154262,
 0.021237474689931368,
 0.032818583167046604,
 0.06242989220745722,
 0.03448309004101827,
 0.03885481346518116,
 0.05726618000277693,
 0.05633946551607402,
 0.026106979845709588,
 0.07049552217154262,
 0.030212731990061165,
 0.02903259982548572,
 0.024747084692480933