In [10]:
# Code for hiding seaborn warnings
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")
import altair as alt
# need to 'pip install vega' before using renderer
alt.renderers.enable("notebook")
# Code for hiding seaborn warnings
import warnings
warnings.filterwarnings("ignore")
from imblearn.over_sampling import SMOTE
from sklearn import preprocessing,metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, learning_curve
%matplotlib inline
#fix random seed
np.random.seed(1)

In [11]:
df = pd.read_csv('training.csv')
df.head(5)

Unnamed: 0,article_number,article_words,topic
0,1,"open,absent,cent,cent,cent,stock,inflow,rate,k...",FOREX MARKETS
1,2,"morn,stead,end,end,day,day,day,patch,patch,pat...",MONEY MARKETS
2,3,"socc,socc,world,world,recent,law,fifa,fifa,fif...",SPORTS
3,4,"open,forint,forint,forint,forint,cent,cent,ste...",FOREX MARKETS
4,5,"morn,complet,weekend,minut,minut,minut,arrow,d...",IRRELEVANT


In [12]:
df = df[df.topic!='IRRELEVANT']
data_x = df.article_words
data_y = df.topic

train_x,dev_x,train_y,dev_y = train_test_split(data_x,data_y,test_size = 0.1,shuffle=False)
en_train_y = preprocessing.LabelEncoder().fit_transform(train_y)
en_dev_y = preprocessing.LabelEncoder().fit_transform(dev_y)
categories=list(set(df['topic']))
print(categories)

['BIOGRAPHIES PERSONALITIES PEOPLE', 'SPORTS', 'ARTS CULTURE ENTERTAINMENT', 'FOREX MARKETS', 'SCIENCE AND TECHNOLOGY', 'MONEY MARKETS', 'HEALTH', 'DEFENCE', 'DOMESTIC MARKETS', 'SHARE LISTINGS']


### Transform Training Text to TF-IDF 

In [13]:
vectorizer = TfidfVectorizer().fit(train_x)
train_vectors = vectorizer.transform(train_x)
dev_vectors = vectorizer.transform(dev_x)
print(train_vectors.shape,test_vectors.shape)

(4289, 23012) (4289, 18)


From the output above, it could be seen that the extracted TF-IDF vectors from training set are sparse, as there are only 81 non-zero features over 23012 features.

### Using SMOTE to Solve Data Imbalance

 First, by plotting each topic's frequency, it could be seen that the distribution of topics is imbalanced.

In [14]:
bars = alt.Chart(train_y.to_frame()).mark_bar(size=50).encode(
    x=alt.X("topic"),
    y=alt.Y("count():Q", axis=alt.Axis(title='Number of articles')),
    tooltip=[alt.Tooltip('count()', title='Number of articles'), 'topic'],
    color='topic'

)

text = bars.mark_text(
    align='center',
    baseline='bottom',
).encode(
    text='count()'
)

(bars + text).properties(
    height=300, 
    width=700,
    title = "Number of articles in each category",
)


<vega.vegalite.VegaLite at 0x1a1d2f9410>



Considering in the data set, the topic 'MONEY MARKET' occupies the majority, which might, indirectly, lead to biased prediction to itself, we need to adjust each topic's proportion in the training data set. In this project, SMOTE (Synthetic Minority Over-sampling TEchnique) is used to solve the imbalance problem.

In [15]:
smote = SMOTE(random_state=1)
smo_x,smo_y = smote.fit_sample(train_vectors,en_train_y)