# Feature engineering

In [1]:
import pickle
from os.path import join
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
from sklearn.model_selection import train_test_split

In [2]:
# Folder that holds the pickled dataset, and the full path to the pickle file.
# NOTE(review): this is an absolute, machine-specific Windows path; the cell
# only runs on a machine with exactly this directory layout.
df_path = "D:\\hse\\3 course\\course_work\\CourseWork2020\\02. News Classifier\\01. Data exploration"
df_path2 = join(df_path, 'News_dataset.pickle')
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load pickle files that come from a trusted source.
with open(df_path2, 'rb') as data:
    df = pickle.load(data)

In [3]:
# Preview the first rows of the freshly loaded dataset
df.head()

Unnamed: 0,File_Name,Content,Category,Complete_Filename,id,News_length
0,001.txt,Ad sales boost Time Warner profit\r\n\r\nQuart...,business,001.txt-business,1,2569
1,002.txt,Dollar gains on Greenspan speech\r\n\r\nThe do...,business,002.txt-business,1,2257
2,003.txt,Yukos unit buyer faces loan claim\r\n\r\nThe o...,business,003.txt-business,1,1557
3,004.txt,High fuel prices hit BA's profits\r\n\r\nBriti...,business,004.txt-business,1,2421
4,005.txt,Pernod takeover talk lifts Domecq\r\n\r\nShare...,business,005.txt-business,1,1575


In [4]:
# Show the raw text of one article (row label 3) before any cleaning
df.loc[3, 'Content']

'High fuel prices hit BA\'s profits\r\n\r\nBritish Airways has blamed high fuel prices for a 40% drop in profits.\r\n\r\nReporting its results for the three months to 31 December 2004, the airline made a pre-tax profit of Â£75m ($141m) compared with Â£125m a year earlier. Rod Eddington, BA\'s chief executive, said the results were "respectable" in a third quarter when fuel costs rose by Â£106m or 47.3%. BA\'s profits were still better than market expectation of Â£59m, and it expects a rise in full-year revenues.\r\n\r\nTo help offset the increased price of aviation fuel, BA last year introduced a fuel surcharge for passengers.\r\n\r\nIn October, it increased this from Â£6 to Â£10 one-way for all long-haul flights, while the short-haul surcharge was raised from Â£2.50 to Â£4 a leg. Yet aviation analyst Mike Powell of Dresdner Kleinwort Wasserstein says BA\'s estimated annual surcharge revenues - Â£160m - will still be way short of its additional fuel costs - a predicted extra Â£250m. Tu

## 1. Text preparation

In [5]:
# Build the cleaned text from the raw 'Content' column; 'Content' itself is
# left untouched and 'Content_parsed' is assigned once at the end, so the cell
# gives the same result when it is re-run.
#
# Every pattern in this cell is a literal string, so regex=False is passed
# explicitly: the default value of `regex` in Series.str.replace differs
# between pandas versions, and characters such as '?' and '.' must never be
# interpreted as regular-expression syntax.
content_parsed = (
    df['Content']
    # replacing carriage returns and line feeds with a space
    .str.replace('\r', ' ', regex=False)
    .str.replace('\n', ' ', regex=False)
    # replacing each run of four spaces with a single space
    .str.replace('    ', ' ', regex=False)
    # deleting double quotes
    .str.replace('"', '', regex=False)
    # to lower
    .str.lower()
)
# deleting punctuation (note: this also removes the decimal point inside numbers)
punctuation_signs = list("?:!.,;")
for punct_sign in punctuation_signs:
    content_parsed = content_parsed.str.replace(punct_sign, '', regex=False)
# deleting the possessive suffix 's
content_parsed = content_parsed.str.replace("'s", "", regex=False)

df['Content_parsed'] = content_parsed

In [6]:
# Preview the frame again to check the new 'Content_parsed' column
df.head()

Unnamed: 0,File_Name,Content,Category,Complete_Filename,id,News_length,Content_parsed
0,001.txt,Ad sales boost Time Warner profit\r\n\r\nQuart...,business,001.txt-business,1,2569,ad sales boost time warner profit quarterly pr...
1,002.txt,Dollar gains on Greenspan speech\r\n\r\nThe do...,business,002.txt-business,1,2257,dollar gains on greenspan speech the dollar ha...
2,003.txt,Yukos unit buyer faces loan claim\r\n\r\nThe o...,business,003.txt-business,1,1557,yukos unit buyer faces loan claim the owners o...
3,004.txt,High fuel prices hit BA's profits\r\n\r\nBriti...,business,004.txt-business,1,2421,high fuel prices hit ba profits british airway...
4,005.txt,Pernod takeover talk lifts Domecq\r\n\r\nShare...,business,005.txt-business,1,1575,pernod takeover talk lifts domecq shares in uk...


### Lemmatization

In [7]:
# Download the NLTK resources 'punkt' and 'wordnet' (a download is skipped
# when the package is already up to date locally).
# 'wordnet' backs the WordNetLemmatizer used for lemmatization.
# NOTE(review): nothing in the visible cells uses 'punkt' (the text is split
# on spaces, not tokenized) - confirm whether this download is still needed.
nltk.download('punkt')
print("------------------------------------------------------------")
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Егор\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Егор\AppData\Roaming\nltk_data...


------------------------------------------------------------


[nltk_data]   Package wordnet is already up-to-date!


True

In [8]:
# Create a single WordNetLemmatizer instance that is reused for every word
# in the lemmatization loop.
wordnet_lemmatizer = WordNetLemmatizer()

In [9]:
nrows = len(df)

# Lemmatize every document word by word, treating each word as a verb
# (pos="v"), and rebuild the text with single spaces between the lemmas.
#
# The column is iterated directly (positionally) instead of looking rows up
# with df.loc[row] for row in range(nrows): that label-based lookup only works
# when the index is exactly 0..n-1 and builds a whole row Series per document.
# Positional order also matches how the resulting list is assigned back to
# the DataFrame as a column.
lemmatized_text_list = [
    " ".join(
        wordnet_lemmatizer.lemmatize(word, pos="v")
        for word in text.split(" ")
    )
    for text in df['Content_parsed']
]

# One lemmatized text per row, in row order.
assert len(lemmatized_text_list) == nrows

In [10]:
# Overwrite the cleaned text with its lemmatized version; the list is assigned
# positionally, one entry per row.
df['Content_parsed'] = lemmatized_text_list

### Stop words

In [11]:
# Download the NLTK stop-word corpus (skipped when it is already up to date
# locally); it is read with stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Егор\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:
# English stop words from the NLTK corpus, as a plain list of strings
stop_words = stopwords.words('english')

In [13]:
# Peek at the first eleven stop words
stop_words[:11]

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've"]

In [14]:
# Remove every stop word wherever it occurs as a whole word.
# regex=True must be passed explicitly: the pattern relies on the \b word
# boundary, and with regex=False (the default in pandas >= 2.0) it would be
# searched as literal text, so no stop word would ever be removed.
# re.escape keeps any special character inside a stop word literal.
for stop_word in stop_words:
    regex_stopword = r"\b" + re.escape(stop_word) + r"\b"
    df['Content_parsed'] = df['Content_parsed'].str.replace(regex_stopword, '', regex=True)

In [15]:
# Raw text of the article at row label 5, for comparison with its parsed form
df.loc[5, 'Content']

'Japan narrowly escapes recession\r\n\r\nJapan\'s economy teetered on the brink of a technical recession in the three months to September, figures show.\r\n\r\nRevised figures indicated growth of just 0.1% - and a similar-sized contraction in the previous quarter. On an annual basis, the data suggests annual growth of just 0.2%, suggesting a much more hesitant recovery than had previously been thought. A common technical definition of a recession is two successive quarters of negative growth.\r\n\r\nThe government was keen to play down the worrying implications of the data. "I maintain the view that Japan\'s economy remains in a minor adjustment phase in an upward climb, and we will monitor developments carefully," said economy minister Heizo Takenaka. But in the face of the strengthening yen making exports less competitive and indications of weakening economic conditions ahead, observers were less sanguine. "It\'s painting a picture of a recovery... much patchier than previously thoug

In [16]:
# Same article after cleaning, lemmatization and stop-word removal
df.loc[5, 'Content_parsed']

'japan narrowly escape recession japan economy teeter   brink   technical recession   three months  september figure show revise figure indicate growth   01% -   similar-sized contraction   previous quarter   annual basis  data suggest annual growth   02% suggest  much  hesitant recovery   previously  think  common technical definition   recession  two successive quarter  negative growth  government  keen  play   worry implications   data  maintain  view  japan economy remain   minor adjustment phase   upward climb    monitor developments carefully say economy minister heizo takenaka    face   strengthen yen make export less competitive  indications  weaken economic condition ahead observers  less sanguine  paint  picture   recovery much patchier  previously think say paul sheard economist  lehman brothers  tokyo improvements   job market apparently  yet  fee   domestic demand  private consumption   02%   third quarter'

## 2. Label coding

In [18]:
# Integer label assigned to each news category
category_codes = dict(
    business=0,
    entertainment=1,
    politics=2,
    sport=3,
    tech=4,
)

In [19]:
# Category mapping
# Series.map looks every category name up in category_codes. This replaces
# DataFrame.replace on an object column, which relied on pandas implicitly
# downcasting the result to integers (deprecated in recent pandas versions)
# and silently left any category missing from category_codes as a string.
category_code = df['Category'].map(category_codes)

# Fail loudly on categories that have no code instead of producing a column
# that mixes labels and codes.
unknown_categories = df.loc[category_code.isna(), 'Category'].unique()
if len(unknown_categories) > 0:
    raise ValueError(
        "Categories without a code in category_codes: {}".format(list(unknown_categories))
    )

# Store the codes with an explicit integer dtype.
df['Category_Code'] = category_code.astype('int64')

In [20]:
# Preview the frame to check the new 'Category_Code' column
df.head()

Unnamed: 0,File_Name,Content,Category,Complete_Filename,id,News_length,Content_parsed,Category_Code
0,001.txt,Ad sales boost Time Warner profit\r\n\r\nQuart...,business,001.txt-business,1,2569,ad sales boost time warner profit quarterly pr...,0
1,002.txt,Dollar gains on Greenspan speech\r\n\r\nThe do...,business,002.txt-business,1,2257,dollar gain greenspan speech dollar hit hi...,0
2,003.txt,Yukos unit buyer faces loan claim\r\n\r\nThe o...,business,003.txt-business,1,1557,yukos unit buyer face loan claim owners emba...,0
3,004.txt,High fuel prices hit BA's profits\r\n\r\nBriti...,business,004.txt-business,1,2421,high fuel price hit ba profit british airways ...,0
4,005.txt,Pernod takeover talk lifts Domecq\r\n\r\nShare...,business,005.txt-business,1,1575,pernod takeover talk lift domecq share uk dri...,0


## 3. Train-test split

In [21]:
# Hold out 15% of the documents as the test set; the fixed random_state makes
# the shuffle, and therefore the split, reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df['Content_parsed'],
    df['Category_Code'],
    test_size=0.15,
    random_state=8,
)

## 2. 