# Feature Engineering
---
1. **Reading raw data**: load the training set from CSV.
2. **Label encoding**: map topics to integers.
3. **Text cleaning**: remove special characters.
4. **Text representation**: use TF-IDF scores to represent text.

In [7]:
import os
import joblib
from pprint import pprint

import numpy as np
import pandas as pd
import sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

# 1. Reading Raw Data

In [8]:
# Load the raw training set; the relative path is resolved against the
# notebook's working directory.
df_train = pd.read_csv("training.csv")
# Preview the first rows, then report (rows, columns).
# NOTE(review): a bare expression that is not the last line of a cell is only
# rendered when IPython's `ast_node_interactivity` is configured to show it —
# confirm the kernel setting if the preview is expected to appear.
df_train.head()
df_train.shape

Unnamed: 0,article_number,article_words,topic
0,1,"open,absent,cent,cent,cent,stock,inflow,rate,k...",FOREX MARKETS
1,2,"morn,stead,end,end,day,day,day,patch,patch,pat...",MONEY MARKETS
2,3,"socc,socc,world,world,recent,law,fifa,fifa,fif...",SPORTS
3,4,"open,forint,forint,forint,forint,cent,cent,ste...",FOREX MARKETS
4,5,"morn,complet,weekend,minut,minut,minut,arrow,d...",IRRELEVANT


(9500, 3)

# 2. Label Encoding

In [9]:
# Fit the encoder on the training topics, then turn every topic into its
# integer code. The mapping printed below shows which code each topic received.
label_train = df_train["topic"]
label_encoder = LabelEncoder().fit(label_train)
y_train = label_encoder.transform(label_train)

# Human-readable lookup: topic name -> integer code.
topic_to_label = dict(
    zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))
)
print("topic: label")
pprint(topic_to_label)

# Attach the encoded labels to the training frame (adds a "label" column in place).
df_train["label"] = y_train
df_train.head()

topic: label
{'ARTS CULTURE ENTERTAINMENT': 0,
 'BIOGRAPHIES PERSONALITIES PEOPLE': 1,
 'DEFENCE': 2,
 'DOMESTIC MARKETS': 3,
 'FOREX MARKETS': 4,
 'HEALTH': 5,
 'IRRELEVANT': 6,
 'MONEY MARKETS': 7,
 'SCIENCE AND TECHNOLOGY': 8,
 'SHARE LISTINGS': 9,
 'SPORTS': 10}


Unnamed: 0,article_number,article_words,topic,label
0,1,"open,absent,cent,cent,cent,stock,inflow,rate,k...",FOREX MARKETS,4
1,2,"morn,stead,end,end,day,day,day,patch,patch,pat...",MONEY MARKETS,7
2,3,"socc,socc,world,world,recent,law,fifa,fifa,fif...",SPORTS,10
3,4,"open,forint,forint,forint,forint,cent,cent,ste...",FOREX MARKETS,4
4,5,"morn,complet,weekend,minut,minut,minut,arrow,d...",IRRELEVANT,6


# 3. Text Cleaning

In [10]:
# Strip every "_" from each document with pandas' vectorised string accessor
# instead of a per-row Python lambda. `regex=False` makes "_" a literal match.
# Missing values, if any, pass through as NaN rather than raising AttributeError.
text_train = df_train.article_words.str.replace("_", "", regex=False)

# 4. Text Representation
## 4.1 Build Count Vector

In [12]:
# Fit the CountVectorizer to text_train to learn the vocabulary.
count_vector = CountVectorizer().fit(text_train)

# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of `get_feature_names_out`; use whichever this installation provides.
if hasattr(count_vector, "get_feature_names_out"):
    # The new API returns an array; convert to a list so the printed output
    # keeps the same list form as before.
    features = list(count_vector.get_feature_names_out())
else:
    features = count_vector.get_feature_names()

# Bounds of the mid-vocabulary sample. The heading is built from the same
# values so it cannot disagree with the slice (it previously said 20050
# while the slice stopped at 20040).
sample_start, sample_stop = 20010, 20040

print(f"Vocabulary size: {len(count_vector.vocabulary_)}")
print(f"\nThe first 40 features:\n{features[:40]}")
print(f"\nFeatures {sample_start} to {sample_stop}:\n{features[sample_start:sample_stop]}")
print("\nEvery 1000th feature:\n{}".format(features[::1000]))

Vocabulary size: 35817

The first 40 features:
['a1', 'a1b1', 'a2', 'a3', 'a300', 'a300b4', 'a320', 'a330', 'a340', 'a4', 'a5', 'a6', 'a7', 'a78', 'a7e', 'aa', 'aa1', 'aa2', 'aa3', 'aaa', 'aacount', 'aad', 'aadj', 'aag', 'aah', 'aahp', 'aair', 'aalborg', 'aalst', 'aalton', 'aaltonaa', 'aama', 'aamir', 'aamodt', 'aandewiel', 'aap', 'aapc', 'aapt', 'aaqib', 'aaquib']

Features 20010 to 20050:
['mercad', 'mercado', 'mercantil', 'mercaton', 'merced', 'mercen', 'merch', 'merchandis', 'merci', 'mercilon', 'merck', 'merckx', 'mercosur', 'mercur', 'mercurio', 'mercy', 'merdior', 'mere', 'merebank', 'meret', 'merg', 'merial', 'merid', 'meridian', 'meridor', 'merin', 'meris', 'merisel', 'merit', 'meriwether']

Every 1000th feature:
['a1', 'altamir', 'atkin', 'beke', 'boulton', 'cardiothorac', 'clarin', 'credent', 'desant', 'dump', 'est', 'flap', 'genev', 'guotai', 'hockeyroo', 'inflexibl', 'joke', 'koek', 'leppan', 'mainstay', 'mentheor', 'movie', 'nitchipourenk', 'ota', 'person', 'prevar', 'rea

## 4.2 TF-IDF

In [13]:
# Fit a TF-IDF vectorizer (constructed with default settings) on the cleaned
# training text; the fitted object is what gets exported further down.
tf_idf = TfidfVectorizer().fit(text_train)

## 4.3 Export Results

In [18]:
import joblib
import os

# Artefacts to persist, keyed by their destination path.
# Insertion order is the order in which the export loop processes them.
files_to_export = {
    # label encoder fitted to df_train.topic
    "Data/label_encoder.joblib": label_encoder,
    # raw dataset with the transformed label column added
    "Data/df_train.joblib": df_train,
    # cleaned df_train.article_words
    "Data/text_train.joblib": text_train,
    # final training labels
    "Data/y_train.joblib": y_train,
    # CountVectorizer fitted to text_train
    "Data/count_vector.joblib": count_vector,
    # tf-idf vectorizer fitted to text_train
    "Data/tf_idf.joblib": tf_idf,
}

Data/export_list.joblib exported


In [16]:
# Persist every artefact listed in files_to_export with joblib.
# A path that already exists is left untouched and only reported, so an
# outdated file has to be removed by hand before it can be re-exported.
for file_name, obj in files_to_export.items():
    if os.path.exists(file_name):
        print(f"Warn: {file_name} already exits...")
        continue

    # Create the destination folder on first use; without this, open() raises
    # FileNotFoundError whenever the directory is missing (e.g. a fresh checkout).
    target_dir = os.path.dirname(file_name)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    with open(file_name, "wb") as file:
        joblib.dump(obj, file)
    print(f"{file_name} exported")


Warn: Data/label_encoder.joblib already exits...
Warn: Data/df_train.joblib already exits...
Warn: Data/text_train.joblib already exits...
Warn: Data/y_train.joblib already exits...
Warn: Data/count_vector.joblib already exits...
Warn: Data/tf_idf.joblib already exits...
