In [1]:
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
#from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
text= """US President Joe Biden says his administration is looking into what happened at a food distribution site in Gaza — where local health officials say more than 100 people were killed and hundreds more injured — and he admitted the incident is going to complicate negotiations in the region.

“We’re checking that out right now; there are two competing versions of what happened. I don’t have an answer yet,” the president told CNN’s Arlette Saenz at the White House on Thursday.

Asked by Saenz if he worried the deaths would complicate negotiations, he responded: “Oh, I know it will.”

But Biden still expressed optimism that a deal on the hostages and a potential ceasefire could be reached soon. """

In [10]:
text="""President-elect Joe Biden and his transition team are preparing for an early, all-out push to pass an ambitious new stimulus bill, while also drawing up plans for a flurry of executive actions aimed at delivering on campaign promises and undoing the Trump administration's efforts to undermine key government agencies.
Biden will be inaugurated in January with a pressing mandate to confront simultaneous and interwoven public health, economic and racial crises. At the same time, his team will take over the work of spearheading one of the most complicated, politically fraught mass vaccination campaigns in American history.
Biden's agenda for his first 100 days in office will, according to both those close to him and outside groups in contact with his top aides, center on two key avenues of action: the passage of a broad economic aid package and, where legislation is not necessary, a series of executive actions aimed at advancing his priorities. Containing the Covid-19 pandemic, launching an economic recovery and tackling racial inequality are his most urgent priorities, transition officials say."""

In [8]:
# this performs word tokenization -> this is used in simple models where each word is a feature
from nltk.tokenize import word_tokenize
nltk.download('punkt_tab', download_dir='/Users/goncalojardim/Desktop/Personal/IH/ironhack-v4-data-lessons/nlp_env/nltk_data')

tokens = word_tokenize(text)
tokens[-30:]

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/goncalojardim/Desktop/Personal/IH/ironhack-v4-
[nltk_data]     data-lessons/nlp_env/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


['actions',
 'aimed',
 'at',
 'advancing',
 'his',
 'priorities',
 '.',
 'Containing',
 'the',
 'Covid-19',
 'pandemic',
 ',',
 'launching',
 'an',
 'economic',
 'recovery',
 'and',
 'tackling',
 'racial',
 'inequality',
 'are',
 'his',
 'most',
 'urgent',
 'priorities',
 ',',
 'transition',
 'officials',
 'say',
 '.']

In [11]:
# we often remove punctuation after tokenization since punctuation is unlikely to be a good predictive feature
tokens = [word for word in tokens if word.isalnum()]
tokens[:15]

['Joe',
 'Biden',
 'and',
 'his',
 'transition',
 'team',
 'are',
 'preparing',
 'for',
 'an',
 'early',
 'push',
 'to',
 'pass',
 'an']

In [12]:
# Alternative
# this performs sentence tokenizations -> can be used if you want to treat each sentence as a "feature"
from nltk.tokenize import sent_tokenize
sent_tokenize(text)

["President-elect Joe Biden and his transition team are preparing for an early, all-out push to pass an ambitious new stimulus bill, while also drawing up plans for a flurry of executive actions aimed at delivering on campaign promises and undoing the Trump administration's efforts to undermine key government agencies.",
 'Biden will be inaugurated in January with a pressing mandate to confront simultaneous and interwoven public health, economic and racial crises.',
 'At the same time, his team will take over the work of spearheading one of the most complicated, politically fraught mass vaccination campaigns in American history.',
 "Biden's agenda for his first 100 days in office will, according to both those close to him and outside groups in contact with his top aides, center on two key avenues of action: the passage of a broad economic aid package and, where legislation is not necessary, a series of executive actions aimed at advancing his priorities.",
 'Containing the Covid-19 pan

In [18]:
# Part of speech can be a useful feature in itself, but is also heavily used in making lemmatization and stemming more effective
nltk.download('averaged_perceptron_tagger_eng')
nltk.pos_tag(tokens,lang='eng')[:30]
#explanation of all these codes can be found here: https://medium.com/@gianpaul.r/tokenization-and-parts-of-speech-pos-tagging-in-pythons-nltk-library-2d30f70af13b

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/goncalojardim/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


[('Joe', 'NNP'),
 ('Biden', 'NNP'),
 ('and', 'CC'),
 ('his', 'PRP$'),
 ('transition', 'NN'),
 ('team', 'NN'),
 ('are', 'VBP'),
 ('preparing', 'VBG'),
 ('for', 'IN'),
 ('an', 'DT'),
 ('early', 'JJ'),
 ('push', 'NN'),
 ('to', 'TO'),
 ('pass', 'VB'),
 ('an', 'DT'),
 ('ambitious', 'JJ'),
 ('new', 'JJ'),
 ('stimulus', 'NN'),
 ('bill', 'NN'),
 ('while', 'IN'),
 ('also', 'RB'),
 ('drawing', 'VBG'),
 ('up', 'RP'),
 ('plans', 'NNS'),
 ('for', 'IN'),
 ('a', 'DT'),
 ('flurry', 'NN'),
 ('of', 'IN'),
 ('executive', 'JJ'),
 ('actions', 'NNS')]

In [None]:
# stemming can be done as cleaning technique -> treats prefixes and suffixes.
from nltk.stem import PorterStemmer

ps = PorterStemmer()
stemmed = [ps.stem(w) for w in tokens]
stemmed[:15]

In [17]:
# lemmatization is a more context aware version of stemming, where we take the actual roots of individual words
# the problem is that such a dictionary may not exist for all languages and that it does not know what to do with new words
nltk.download('wordnet') # wordnet is the most well known lemmatizer for english
nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet


lemmatizer = WordNetLemmatizer()
lemmatized = [lemmatizer.lemmatize(word) for word in tokens]
lemmatized[:30]

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/goncalojardim/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/goncalojardim/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


['Joe',
 'Biden',
 'and',
 'his',
 'transition',
 'team',
 'are',
 'preparing',
 'for',
 'an',
 'early',
 'push',
 'to',
 'pas',
 'an',
 'ambitious',
 'new',
 'stimulus',
 'bill',
 'while',
 'also',
 'drawing',
 'up',
 'plan',
 'for',
 'a',
 'flurry',
 'of',
 'executive',
 'action']

In [19]:
# lemmatization may still be a bit weak, mostly because the lemmatizer would like a bit more information about context to make decisions
display(lemmatizer.lemmatize("was"))
display(lemmatizer.lemmatize("was",wordnet.VERB))
display(lemmatizer.lemmatize("better"))
display(lemmatizer.lemmatize("better",wordnet.ADJ))
display(lemmatizer.lemmatize("canning"))
display(lemmatizer.lemmatize("canning",wordnet.NOUN))
display(lemmatizer.lemmatize("canning",wordnet.VERB))

'wa'

'be'

'better'

'good'

'canning'

'canning'

'can'

In [20]:
# let's apply this to the all the newsfeed
nltk.download('averaged_perceptron_tagger')

# unfortunately pos_tag and lemmatize use different codes for parts of speech
def get_wordnet_pos(word):
    tag = nltk.pos_tag([word])[0][1][0].upper() # gets first letter of POS categorization
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV
            }
    return tag_dict.get(tag, wordnet.NOUN) # get returns second argument if first key does not exist

lemmatizer = WordNetLemmatizer()
lemmatized = [lemmatizer.lemmatize(word,get_wordnet_pos(word)) for word in tokens]
lemmatized[:15]


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/goncalojardim/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


['Joe',
 'Biden',
 'and',
 'his',
 'transition',
 'team',
 'be',
 'prepare',
 'for',
 'an',
 'early',
 'push',
 'to',
 'pas',
 'an']

In [21]:
#removal of stopwords allows us to reduce the noise in the data to focus on the signal
from nltk.corpus import stopwords
nltk.download('stopwords')

without_sw = [word for word in lemmatized if not word in stopwords.words()]
without_sw[:15]

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/goncalojardim/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['Joe',
 'Biden',
 'transition',
 'team',
 'prepare',
 'early',
 'push',
 'ambitious',
 'stimulus',
 'bill',
 'draw',
 'plan',
 'flurry',
 'executive',
 'action']

In [22]:
" ".join(without_sw)

'Joe Biden transition team prepare early push ambitious stimulus bill draw plan flurry executive action aim deliver campaign promise undo Trump administration effort undermine key government agency Biden inaugurate January press mandate confront simultaneous interwoven public health economic racial crisis At time team work spearhead complicate politically fraught mass vaccination campaign American history Biden agenda 100 day office accord close group contact top aide center key avenue action passage broad economic aid package legislation series executive action aim advance priority Containing pandemic launch economic recovery tackle racial inequality urgent priority transition official'

In [23]:
text.split('.')[1]

'\nBiden will be inaugurated in January with a pressing mandate to confront simultaneous and interwoven public health, economic and racial crises'

In [None]:
without_sw

In [24]:
from sklearn.feature_extraction.text import CountVectorizer
bow_vect = CountVectorizer()
# fit creates one entry for each different word seen
bow_vect.fit([" ".join(without_sw)])

In [25]:
set(without_sw)

{'100',
 'American',
 'At',
 'Biden',
 'Containing',
 'January',
 'Joe',
 'Trump',
 'accord',
 'action',
 'administration',
 'advance',
 'agency',
 'agenda',
 'aid',
 'aide',
 'aim',
 'ambitious',
 'avenue',
 'bill',
 'broad',
 'campaign',
 'center',
 'close',
 'complicate',
 'confront',
 'contact',
 'crisis',
 'day',
 'deliver',
 'draw',
 'early',
 'economic',
 'effort',
 'executive',
 'flurry',
 'fraught',
 'government',
 'group',
 'health',
 'history',
 'inaugurate',
 'inequality',
 'interwoven',
 'key',
 'launch',
 'legislation',
 'mandate',
 'mass',
 'office',
 'official',
 'package',
 'pandemic',
 'passage',
 'plan',
 'politically',
 'prepare',
 'press',
 'priority',
 'promise',
 'public',
 'push',
 'racial',
 'recovery',
 'series',
 'simultaneous',
 'spearhead',
 'stimulus',
 'tackle',
 'team',
 'time',
 'top',
 'transition',
 'undermine',
 'undo',
 'urgent',
 'vaccination',
 'work'}

In [26]:
bow_vect.transform(['Joe Biden transition team prepare early push ambitious stimulus bill draw plan flurry executive action aim deliver campaign promise undo Trump administration effort undermine key government agency']).toarray()

array([[0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
        1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0]])

In [27]:
bow_vect.transform(['economic economic']).toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [28]:
bow_vect.transform(['Joe work ambitious ambitious ambitoud economic rabbit']).toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]])

In [29]:
bow_vect.transform(['100']).toarray()

array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [30]:
bow_vect.transform(['goncalo']).toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [31]:
#transform only considers the words that have been seen in fit
bow_vect.transform(['accord stimulus bill bill goncalo']).toarray()

array([[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

# News clustering example

In [34]:
# corpus of 120k news headlines, here shortened to 10k
url = "https://raw.githubusercontent.com/GoncaloJardim/ironhack-v4-data-lessons/main/data/news.csv"


all_news = pd.read_csv(url)
all_news.head(10)

Unnamed: 0,news
0,SAN FRANCISCO (CBS.MW) -- Dell Inc. said Thurs...
1,American Phil Mickelson registered a 59 to win...
2,French President Jacques Chirac and British Pr...
3,"As many as 15,000 New Zealanders will be force..."
4,The group led by al Qaeda ally Abu Musab al-Z...
5,The number of US call centers is expected to d...
6,"PITCHING: Braves left-hander Mike Hampton, al..."
7,Bank of America Corp. has said it #39;s bringi...
8,It appears Dario Franchitti of Scotland is get...
9,"Two days ago, before he would add another chap..."


In [35]:
all_news.shape

(10000, 1)

In [37]:
all_news.iloc[1]['news']

'American Phil Mickelson registered a 59 to win the Grand Slam of Golf in Kauai, Hawaii. The Masters champion had an eagle putt on the 18th for a record 58 but missed and tapped in for a birdie and a 59, equalling the lowest score in stroke-play history.'

In [38]:
# same process as before, but for all lines
#tokenize, lowercase, remove punctuation

def tokenizer_and_remove_punctuation(row):
  tokens = word_tokenize(row['news'])
  return [word.lower() for word in tokens if word.isalpha()]

all_news['tokenized'] = all_news.apply(tokenizer_and_remove_punctuation,axis=1)
all_news.head()

Unnamed: 0,news,tokenized
0,SAN FRANCISCO (CBS.MW) -- Dell Inc. said Thurs...,"[san, francisco, dell, said, thursday, its, pr..."
1,American Phil Mickelson registered a 59 to win...,"[american, phil, mickelson, registered, a, to,..."
2,French President Jacques Chirac and British Pr...,"[french, president, jacques, chirac, and, brit..."
3,"As many as 15,000 New Zealanders will be force...","[as, many, as, new, zealanders, will, be, forc..."
4,The group led by al Qaeda ally Abu Musab al-Z...,"[the, group, led, by, al, qaeda, ally, abu, mu..."


In [39]:
# lemmatize with part of speech helpers

lemmatizer = WordNetLemmatizer()

def lemmatizer_with_pos(row):
  return [lemmatizer.lemmatize(word,get_wordnet_pos(word)) for word in row['tokenized']]

all_news['lemmatized'] = all_news.apply(lemmatizer_with_pos,axis=1)
all_news.head()

Unnamed: 0,news,tokenized,lemmatized
0,SAN FRANCISCO (CBS.MW) -- Dell Inc. said Thurs...,"[san, francisco, dell, said, thursday, its, pr...","[san, francisco, dell, say, thursday, it, prof..."
1,American Phil Mickelson registered a 59 to win...,"[american, phil, mickelson, registered, a, to,...","[american, phil, mickelson, register, a, to, w..."
2,French President Jacques Chirac and British Pr...,"[french, president, jacques, chirac, and, brit...","[french, president, jacques, chirac, and, brit..."
3,"As many as 15,000 New Zealanders will be force...","[as, many, as, new, zealanders, will, be, forc...","[a, many, a, new, zealander, will, be, force, ..."
4,The group led by al Qaeda ally Abu Musab al-Z...,"[the, group, led, by, al, qaeda, ally, abu, mu...","[the, group, lead, by, al, qaeda, ally, abu, m..."


In [40]:
# remove stopwords

def remove_sw(row):
  return list(set(row['lemmatized']).difference(stopwords.words()))

all_news['no_stopwords'] = all_news.apply(remove_sw,axis=1)
all_news.head()

Unnamed: 0,news,tokenized,lemmatized,no_stopwords
0,SAN FRANCISCO (CBS.MW) -- Dell Inc. said Thurs...,"[san, francisco, dell, said, thursday, its, pr...","[san, francisco, dell, say, thursday, it, prof...","[profit, laptop, percent, rise, san, earlier, ..."
1,American Phil Mickelson registered a 59 to win...,"[american, phil, mickelson, registered, a, to,...","[american, phil, mickelson, register, a, to, w...","[hawaii, low, record, grand, tapped, kauai, ma..."
2,French President Jacques Chirac and British Pr...,"[french, president, jacques, chirac, and, brit...","[french, president, jacques, chirac, and, brit...","[number, tony, french, chirac, international, ..."
3,"As many as 15,000 New Zealanders will be force...","[as, many, as, new, zealanders, will, be, forc...","[a, many, a, new, zealander, will, be, force, ...","[pain, found, attack, alternative, zealander, ..."
4,The group led by al Qaeda ally Abu Musab al-Z...,"[the, group, led, by, al, qaeda, ally, abu, mu...","[the, group, lead, by, al, qaeda, ally, abu, m...","[behead, found, musab, lead, islamist, stateme..."


In [41]:
# put all this cleaning together

def re_blob(row):
  return " ".join(row['no_stopwords'])

all_news['clean_blob'] = all_news.apply(re_blob,axis=1)
all_news.head()

Unnamed: 0,news,tokenized,lemmatized,no_stopwords,clean_blob
0,SAN FRANCISCO (CBS.MW) -- Dell Inc. said Thurs...,"[san, francisco, dell, said, thursday, its, pr...","[san, francisco, dell, say, thursday, it, prof...","[profit, laptop, percent, rise, san, earlier, ...",profit laptop percent rise san earlier gear bo...
1,American Phil Mickelson registered a 59 to win...,"[american, phil, mickelson, registered, a, to,...","[american, phil, mickelson, register, a, to, w...","[hawaii, low, record, grand, tapped, kauai, ma...",hawaii low record grand tapped kauai master wi...
2,French President Jacques Chirac and British Pr...,"[french, president, jacques, chirac, and, brit...","[french, president, jacques, chirac, and, brit...","[number, tony, french, chirac, international, ...",number tony french chirac international relati...
3,"As many as 15,000 New Zealanders will be force...","[as, many, as, new, zealanders, will, be, forc...","[a, many, a, new, zealander, will, be, force, ...","[pain, found, attack, alternative, zealander, ...",pain found attack alternative zealander double...
4,The group led by al Qaeda ally Abu Musab al-Z...,"[the, group, led, by, al, qaeda, ally, abu, mu...","[the, group, lead, by, al, qaeda, ally, abu, m...","[behead, found, musab, lead, islamist, stateme...",behead found musab lead islamist statement mos...


In [42]:
#let's take only the most common 1000 words
bow_vect = CountVectorizer(max_features=1000)
# fit creates one entry for each different word seen
X = bow_vect.fit_transform(all_news['clean_blob']).toarray()

In [43]:
all_news['clean_blob'].iloc[0]

'profit laptop percent rise san earlier gear boost thursday pc year francisco maker'

In [44]:
as_df = pd.DataFrame(X,columns=bow_vect.get_feature_names_out())
as_df.head()

Unnamed: 0,abu,abuse,access,accord,account,accounting,accuse,acquire,acquisition,act,...,worth,wound,yahoo,yankee,yard,yasser,year,yesterday,york,young
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [45]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=6,random_state=100)
kmeans.fit(X)
pred = kmeans.predict(X)

In [47]:
predict_df = pd.concat([all_news['news'],pd.DataFrame(pred,columns=['class'])],axis=1)
predict_df.head()

Unnamed: 0,news,class
0,SAN FRANCISCO (CBS.MW) -- Dell Inc. said Thurs...,5
1,American Phil Mickelson registered a 59 to win...,2
2,French President Jacques Chirac and British Pr...,1
3,"As many as 15,000 New Zealanders will be force...",4
4,The group led by al Qaeda ally Abu Musab al-Z...,4


In [48]:
pd.set_option('display.max_colwidth', None)

In [49]:
# War in Israel
predict_df[predict_df['class']==0]

Unnamed: 0,news,class
467,"Israeli Prime Minister Ariel Sharon is not known for a kindly disposition toward the Palestinians, nor for a gentle approach to the issue of a Palestinian state.",0
526,"An Israeli helicopter fired three missiles\at an unknown target in a Gaza refugee camp on Wednesday,\Palestinian witnesses said, one day after militants killed 16\Israelis in bus bombings in southern Israel.",0
550,"TWO Palestinian women, who planned to blow themselves up in a double suicide bombing in Tel Aviv, gave themselves up to Israeli troops at a West Bank checkpoint, Israeli military sources said today.",0
814,Israeli troops mistook three Egyptian police officers for Palestinian militants and shot them dead yesterday along the Gaza Strip #39;s border,0
822,Israel's High Court said on Thursday it was upholding a decision by the country's attorney-general to drop a bribery case against Prime Minister Ariel Sharon.,0
...,...,...
9771,"Palestinian leader Mahmoud Abbas called\Israel ""the Zionist enemy"" Tuesday, unprecedented language for\the relative moderate who is expected to succeed Yasser Arafat.",0
9824,JERUSALEM : The Israeli cabinet was set to approve a package of compensation for settlers who are to be uprooted from their homes as part of Prime Minister Ariel Sharon #39;s Gaza pullout plan.,0
9919,"JERUSALEM : Beleaguered Israeli Prime Minister Ariel Sharon persisted with plans to form a new government including the main opposition Labour party, bolstered by Washington #39;s backing for the expansion of West Bank settlements.",0
9948,Israel #39;s cabinet will today decide on a bill that would set out how to implement prime minister Ariel Sharon #39;s plan to withdraw Israeli settlers from the Gaza Strip.,0


In [50]:
# Politics news/ Overall news
predict_df[predict_df['class']==1]

Unnamed: 0,news,class
2,"French President Jacques Chirac and British Prime Minister Tony Blair maintained Thursday that relations between their countries were not strained by their disagreements over the Iraq war, as evidenced by their cooperation on a number of international",1
18,"French Finance Minister Nicolas Sarkozy is resigning when he meets today with Prime Minister Jean- Pierre Raffarin, freeing him to use his new position as head of the ruling party to prepare for the 2007 presidential elections.",1
20,Texas authorities are investigating claims that a US mother left her seven adopted children in Nigeria and went to work in Iraq.,1
23,"Indonesian police on Saturday released security camera images of a truck bombing outside the Australian Embassy, and investigators found traces of explosives in a room rented by two Malaysian militants wanted in the blast. Also Saturday, around 1,000 members of a hardline Muslim group rallied in downtown Jakarta against Thursday's attack, which killed nine people, two of them suspected suicide bombers...",1
25,"A special committee of independent directors at building products manufacturer Royal Group Technologies Ltd. (Woodbridge, ON) has fired president and CEO Douglas Dunsmuir and senior vice-president and CFO",1
...,...,...
9946,": Suspected Muslim insurgents attacked an army unit protecting Buddhist monks at a monastery early Friday, killing one of the soldiers as sectarian violence continued in southern Thailand, police said.",1
9949,Several workers are believed to have been killed and others injured after a contruction site collapsed at Dubai airport. The workers were trapped under rubble at the site of a \$4.,1
9952,At least two people have been killed and 18 others injured in an explosion at a concert by an Indian Bollywood star in Colombo. Police said a hand-grenade ripped through the front stands as Shahrukh Khan ended his performance on Saturday.,1
9958,"Their name means ""It's Time."" Their slogan shows a ticking clock, and their ability to bring thousands of students to the streets is spooking the government as the country prepares for Sunday's pivotal presidential election.",1


In [51]:

predict_df[predict_df['class']==3]

Unnamed: 0,news,class
22,"I.B.M. said that it was in talks to settle a lawsuit contending that a pension plan adopted by the company discriminated against 140,000 older workers.",3
26,"The federal government releases a list Wednesday lauding Fortune 500 firms that make a major effort to ease commutes for their employees, which helps ease traffic congestion and air pollution. Intel is No. 1 on the list of 69 companies to be designated as Best Workplaces for Commuters by the EPA.",3
40,"research) is being sued by several California cities and counties, which accuse the company of charging inflated prices due to its alleged monopoly control of the PC operating system market.",3
48,"In an era of widespread media\consolidation, Internet media company Yahoo Inc. \believes television networks, movie studios and music companies\should look to it as a partner rather than a merger candidate,\Yahoo Chief Executive Terry Semel said on Tuesday.",3
63,"Avon Products Inc., the world #39;s largest direct seller of cosmetics, reported its first US sales decline in five years, sending the company #39;s shares down the most since 2000.",3
...,...,...
9960,"Australian shopping center giant Westfield is teaming up with property group Multiplex and Aldersgate Investments to make a cash offer for Duelguide, the company which owns British developer Chelsfield.",3
9974,The company that makes a popular battery operated vacuum cleaner has announced a voluntary recall of the product. Procter and Gamble says their Sweep-Vac by Swiffer could overheat and possibly catch fire.,3
9983,"TiVo Inc., maker of digital\television video recorders, will next year add ways for viewers\to see advertising and corporate logos even as they try to skip\commercials, the company said on Wednesday.",3
9986,"U.S. regulators on Thursday said big local phone companies will not have to share new fiber-optic lines built for high-speed Internet and other services, spurring one carrier to speed roll-out plans.",3


In [52]:
# Economy 
predict_df[predict_df['class']==5]

Unnamed: 0,news,class
0,"SAN FRANCISCO (CBS.MW) -- Dell Inc. said Thursday its third-quarter profit rose 25 percent from a year earlier as the No. 1 personal-computer maker boosted sales of its PCs, laptops and other gear by 18 percent.",5
11,Stocks moved higher Friday as a stronger than expected retail sales report showed that higher oil prices aren't scaring consumers away from spending. Federal Reserve Chairman Alan Greenspan's positive comments on oil prices also encouraged investors...,5
28,"Even though we knew it was coming, journalists seem to have plenty to write about now that underdogs Sprint and Nextel have announced their \$35 billion nuptial plans, including a rumor that one of the industry's big dogs would try to spoil the party. &lt;FONT face=""verdana,MS Sans Serif,arial,helvetica"" size=""-2"" color=""#666666""&gt;&lt;B&gt;-washingtonpost.com&lt;/B&gt;&lt;/FONT&gt;",5
47,"Aerospace giant Boeing Co. reported a 78 percent jump in third-quarter earnings, buoyed by a strong defense business, and raised its estimate for full-year profits due to a favorable tax outlook.",5
50,"Investment by businesses in foreign markets fell 18 percent in 2003 to \$560 billion as the global economy continued to struggle, but should improve this year as growth speeds up, the United Nations said Wednesday.",5
...,...,...
9928,"OPEC oil producers and Chancellor\Schroeder of Germany, which is a prominent consumer, agree that\the remorseless rise in the price of crude has so far had\little impact on global economic growth.",5
9934,"Shipments of personal computers in the third quarter rose 12 percent, driven by strong corporate demand and ongoing strength in Europe, the Middle East and Africa, a market research firm said on Monday.",5
9936,"Oil futures prices surpassed \$54 a barrel Tuesday on supply concerns in Nigeria and the Gulf of Mexico, where hurricane damage to pipelines and production platforms could curb output for months.",5
9962,"Hong Kong shares rose slightly on Wednesday, tracking gains on Wall Street as the US Federal Reserve announced a rate hike. The blue-chip Hang Seng Index climbed 35.02 points, or 0.25 percent, to 14,078.54.",5
