In [2]:
# libraries

import zipfile, re, logging
from io import BytesIO
import re
import os
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# 1. Extracting a ZIP File

In [3]:
# Unpack every archive nested inside Data.zip into ./Data/.
# Each nested archive is read into an in-memory buffer and opened from there,
# so only the contents of the nested archives are written to disk.

with zipfile.ZipFile("Data.zip", "r") as outer_archive:
    nested_members = [
        member for member in outer_archive.namelist()
        if re.search(r'\.zip$', member)
    ]
    for member in nested_members:
        inner_bytes = BytesIO(outer_archive.read(member))
        with zipfile.ZipFile(inner_bytes) as inner_archive:
            inner_archive.extractall("./Data/")

# 2. Extracting Some Information from XML Files

In [4]:
# Extract the body text, headline and item id from every XML file in ./Data/.
#
# Exactly ONE entry per file is appended to each list. The lists are later
# zipped together row by row, so a file with no (or several) matches for one
# of the patterns must not shift the entries of the files that follow it.

listText = []
listHeadlines = []
listFileName = []
listItemID = []

for filename in os.listdir('./Data/'):
    if not filename.endswith('.xml'):
        continue

    # The context manager closes the file; no explicit close() is required.
    # NOTE(review): the file is decoded with the platform default encoding -
    # confirm that it matches the encoding declared in the XML files.
    with open(os.path.join('./Data/', filename)) as f:
        strings = f.read()

    matchesText = re.findall(r"(?<=<text>).*?(?=</text>)", strings, flags=re.DOTALL)
    matchesHeadlines = re.findall(r"(?<=<headline>).*?(?=</headline>)", strings, flags=re.DOTALL)
    matchesItemID = re.findall(r"<newsitem(?:\D+=\"\S*\")*\s+itemid=\"(\d*)\"", strings, flags=re.DOTALL)

    # Several <text> sections are merged into one document; a missing
    # headline or item id becomes an empty string instead of being skipped.
    listText.append(' '.join(matchesText))
    listHeadlines.append(matchesHeadlines[0] if matchesHeadlines else '')
    listItemID.append(matchesItemID[0] if matchesItemID else '')
    listFileName.append(filename)

# Replace paragraph tags and newlines in the text with spaces.

listText = [txt.replace('<p>', ' ').replace('</p>', ' ').replace('\n', ' ') for txt in listText]

In [5]:
from bs4 import BeautifulSoup

# Collect, for every XML file in ./Data/, the codes listed under its
# "bip:topics:1.0" <codes> element and its "dc.date.published" value.
# One entry per file is appended to each list so they stay index-aligned.

listTopics = []
listTemp = []
listPublishedDate = []
ListBipTopics = []

for filename in os.listdir('./Data/'):
    if not filename.endswith('.xml'):
        continue

    with open(os.path.join('./Data/', filename)) as f:
        strings = f.read()

    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed and warns about the guess, so the parse
    # could differ from one environment to the next.
    soup = BeautifulSoup(strings, "html.parser")

    # Topic codes of this file only (a fresh list per file, nothing to reset).
    listTemp = [
        a['code']
        for top in soup.find_all("codes", "bip:topics:1.0")
        for a in top.find_all('code', {'code': True})
    ]
    listTopics.append(listTemp)

    # A file without a publication date gets an empty string rather than
    # aborting the whole loop with an IndexError.
    inputTag = soup.find_all(attrs={"element": "dc.date.published"})
    listPublishedDate.append(inputTag[0]['value'] if inputTag else '')

# One comma-separated string of topic codes per file.
ListBipTopics = [",".join(str(code) for code in sublist) for sublist in listTopics]

# 3. Extracting a Dataframe

In [5]:
def extractDataframe(HeadlinesList, TextList, BitopicsList, PublishedDateList, ItemIDList, FileNamesList, columnsList):
    """Combine six parallel lists into a DataFrame, one row per position.

    The i-th row holds the i-th element of each list, in argument order.
    Rows are produced with ``zip``, so the result has as many rows as the
    shortest input list.

    Parameters
    ----------
    HeadlinesList, TextList, BitopicsList, PublishedDateList, ItemIDList, FileNamesList
        Parallel sequences supplying the six column values.
    columnsList
        Six column labels, in the same order as the list arguments.

    Returns
    -------
    pandas.DataFrame
    """
    sources = (HeadlinesList, TextList, BitopicsList, PublishedDateList, ItemIDList, FileNamesList)
    rows = [record for record in zip(*sources)]
    return pd.DataFrame(rows, columns=columnsList)

In [6]:
# Column labels, listed in the same order as the lists handed to extractDataframe.
columns = ['HeadLine', 'Text', 'Bi:Topics', 'Date Published', 'Itemid', 'XMLfileName']
df = extractDataframe(
    HeadlinesList=listHeadlines,
    TextList=listText,
    BitopicsList=ListBipTopics,
    PublishedDateList=listPublishedDate,
    ItemIDList=listItemID,
    FileNamesList=listFileName,
    columnsList=columns,
)

In [7]:
# Preview the first rows of the assembled DataFrame.
df.head()

Unnamed: 0,HeadLine,Text,Bi:Topics,Date Published,Itemid,XMLfileName
0,Canadian Occidental mounts rival Wascana bid.,Canadian Occidental Petroleum Ltd. emerged o...,C181,1997-03-18,326914,326914newsML.xml
1,"Gruma, Maseca to receive syndicated loan - bank.",Bank of America will launch a three-year $12...,C173,1997-03-18,326915,326915newsML.xml
2,Too early to call Krupp bid hostile - Deutsche...,Deutsche Bank AG management board member Rol...,"C18,C181,CCAT",1997-03-18,326916,326916newsML.xml
3,"FOCUS - Euro bourses fret over Wall St, electi...",European bourses fell on Tuesday even before...,"M11,M13,M132,M14,M142,MCAT",1997-03-18,326917,326917newsML.xml
4,"French stocks fall, Alcatel posts big gain.",French shares closed lower on Tuesday in the...,"G152,M11",1997-03-18,326918,326918newsML.xml


# 4. Finding Unique Values for Bi:Topics

In [8]:
# Read the tab-separated topic-code table into two columns (code, topic) and
# build a code -> topic lookup dictionary.
columnNames = ['code', 'topic']
codes = pd.read_csv(
    './Data/topic_codes.txt',
    sep="\t",
    engine="python",
    names=columnNames,
)
# The first two rows are discarded - presumably header lines of the file; verify.
codes = codes.drop(codes.index[[0, 1]])
dictTopics = dict(zip(codes['code'], codes['topic']))

In [9]:
# Kept only so that any cell referring to these names still finds them;
# uniqueTopics() no longer reads or mutates module-level state.
listAllTopics = []
uniqueTpoics = []

# function to get unique topics
def uniqueTopics(dataframe, columnName):
    """Return the distinct topic codes of a column, in order of first appearance.

    Every entry of ``dataframe[columnName]`` is expected to be a string of
    comma-separated codes (entries are split on ',').

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the topics column.
    columnName : str
        Name of the column with the comma-separated codes.

    Returns
    -------
    list of str
        Each code once, ordered by first occurrence. Empty if the column
        has no rows.
    """
    # A dict keeps insertion order and gives constant-time membership, so the
    # whole column is processed in a single pass with no repeated flattening.
    seen = {}
    for element in dataframe[columnName]:
        for topic in element.split(','):
            seen.setdefault(topic, None)
    return list(seen)

In [10]:
listUniqueTopics = uniqueTopics(df, 'Bi:Topics')
# Translate each code into its description; a code that is missing from the
# lookup dictionary is kept unchanged.
headersList = [ dictTopics.get(item,item) for item in listUniqueTopics ]

# The separator was written as "n\n" (a stray letter 'n' instead of the
# backslash escape), which printed a literal "n" after the count.
print("The Total Number of Unique Topics",len(headersList),"\n\nAll Possible Values for bi:topics are given bellow \n\n",headersList)

The Total Number of Unique Topics 103 n
All Possible Values for bi:topics are given bellow 

 ['MERGERS/ACQUISITIONS', 'LOANS/CREDITS', 'OWNERSHIP CHANGES', 'CORPORATE/INDUSTRIAL', 'EQUITY MARKETS', 'MONEY MARKETS', 'FOREX MARKETS', 'COMMODITY MARKETS', 'METALS TRADING', 'MARKETS', 'EC CORPORATE POLICY', 'SOFT COMMODITIES', 'GOVERNMENT/SOCIAL', 'DOMESTIC POLITICS', 'WAR, CIVIL WAR', 'DISASTERS AND ACCIDENTS', 'BIOGRAPHIES, PERSONALITIES, PEOPLE', 'RELIGION', 'BOND MARKETS', 'PERFORMANCE', 'ACCOUNTS/EARNINGS', 'LEADING INDICATORS', 'ECONOMICS', 'STRATEGY/PLANS', 'CRIME, LAW ENFORCEMENT', 'CONTRACTS/ORDERS', 'FUNDING/CAPITAL', 'SHARE CAPITAL', 'REGULATION/POLICY', 'EUROPEAN COMMUNITY', 'EC AGRICULTURE POLICY', 'GOVERNMENT FINANCE', 'EXPENDITURE/REVENUE', 'EC MONETARY/ECONOMIC', 'EC EXTERNAL RELATIONS', 'DEFENCE', 'INTERNATIONAL RELATIONS', 'ECONOMIC PERFORMANCE', 'MARKETS/MARKETING', 'CAPACITY/FACILITIES', 'MONETARY/ECONOMIC', 'INTERBANK MARKETS', 'COMMENT/FORECASTS', 'LABOUR', 'EMPLOYME

# 5. Preprocessing the Text Data

In [8]:
def preprocessedData(dataframe, textColumn):
    """Clean, tokenize and lemmatize one text column of ``dataframe`` in place.

    Each string in ``dataframe[textColumn]`` is replaced by a list of tokens
    produced by these steps, in order:

    1. runs of non-word characters are replaced by a single space,
    2. digits are removed,
    3. the text is lower-cased,
    4. the text is tokenized with ``nltk.word_tokenize``,
    5. English stop words are dropped,
    6. each remaining token is lemmatized with ``WordNetLemmatizer``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to modify; the column is overwritten.
    textColumn : str
        Name of the column holding raw text strings.

    Returns
    -------
    None
        The frame is modified in place. Because the column then holds lists
        rather than strings, calling this twice on the same column raises
        (``re.sub`` does not accept a list).
    """
    # Set membership is constant-time; the stop-word check runs once per token.
    stop = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    def _normalize(text):
        # Turn one raw document into its list of cleaned, lemmatized tokens.
        text = re.sub(r'\W+', ' ', text)   # special characters -> space
        text = re.sub(r'\d+', '', text)    # drop all digits
        tokens = nltk.word_tokenize(text.lower())
        return [lemmatizer.lemmatize(token) for token in tokens if token not in stop]

    # Single pass over the column instead of six separate ones.
    dataframe[textColumn] = dataframe[textColumn].map(_normalize)


In [9]:
# Overwrites df['Text'] in place: each string becomes a list of cleaned tokens.
preprocessedData(df,'Text')

# 6. Extracting Features and Labels

In [10]:
def featureExtraction(dataframe, textColumn, topicsColumn):
    """Build TF-IDF features and integer labels from a prepared DataFrame.

    Features: term counts from ``CountVectorizer`` re-weighted by
    ``TfidfTransformer``. The vectorizer is given an identity tokenizer and
    ``lowercase=False``, so every entry of ``textColumn`` is handed to it
    unchanged - the entries are expected to already be token lists.

    Labels: the first comma-separated code of each ``topicsColumn`` entry,
    encoded as pandas category codes.

    The input frame is NOT modified. Overwriting the topics column with the
    integer codes made a second call fail (``.str`` on integers) and broke
    any cell that still needed the original comma-separated strings.

    Parameters
    ----------
    dataframe : pandas.DataFrame
    textColumn : str
        Column of tokenized documents.
    topicsColumn : str
        Column of comma-separated topic-code strings.

    Returns
    -------
    (features, labels)
        ``features`` is the TF-IDF matrix returned by scikit-learn;
        ``labels`` is a Series named ``topicsColumn`` sharing the frame's index.
    """
    countVect = CountVectorizer(tokenizer=lambda x: x, lowercase=False).fit_transform(dataframe[textColumn])
    tfidfOfText = TfidfTransformer().fit_transform(countVect)
    print("Features Shape",tfidfOfText.shape)

    labels = (
        dataframe[topicsColumn]
        .str.split(',').str[0]      # keep only the first listed code
        .astype('category')
        .cat.codes
        .rename(topicsColumn)       # same Series name as the source column
    )
    print("Target Shape",labels.shape)
    return tfidfOfText, labels

In [11]:
# TF-IDF matrix of the tokenized text and integer labels taken from the first topic code.
Features, Target= featureExtraction(df,'Text','Bi:Topics')

Features Shape (48375, 92535)
Target Shape (48375,)


# 7. Dividing the Dataset using Train/Test Split

Dividing the dataset into train and test sets is necessary to check how well the model generalizes to unseen data. We need some assurance that the model fits the pattern of the data well, in other words, an indication of its bias and variance.

Here I am using the train/test (hold-out) validation method rather than cross-validation to split the data, as it is very simple to use. Cross-validation is mostly used when we have very little data or when tuning hyperparameters. Hence, if we have enough data, a train/test split is a better method, both for faster implementation of the algorithm and for avoiding computational cost.

In [12]:
# Helper that performs a single train/test (hold-out) split.
from sklearn.model_selection import train_test_split
def splitDataset(Feature, Target, testSize, randomState):
    """Split features and labels into train and test parts.

    Thin wrapper around ``train_test_split``.

    Parameters
    ----------
    Feature, Target
        Feature matrix and matching labels.
    testSize
        Forwarded as ``test_size``.
    randomState
        Forwarded as ``random_state``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``
    """
    parts = train_test_split(Feature, Target, test_size=testSize, random_state=randomState)
    return tuple(parts)

# 8. A function to Generate Classifier

In [16]:
def generateClassifier(features, labels, classifier,gamma='scale',kernel='linear',estimators=200):
    """Train ``classifier`` on a 70/30 split and predict the test part.

    The data is split with ``splitDataset(features, labels, 0.3, 25)``.

    Parameters
    ----------
    features, labels
        Feature matrix and matching labels.
    classifier : type
        Estimator class (not an instance) offering ``fit`` and ``predict``.
    gamma, kernel
        Passed to the constructor only when ``classifier`` is ``SVC``.
    estimators
        Passed as ``n_estimators`` only when ``classifier`` is
        ``RandomForestClassifier``.

    Returns
    -------
    (predicted, y_test)
        Predictions for the test part and the true test labels.
    """
    X_train, X_test, y_train, y_test = splitDataset(features, labels, 0.3, 25)

    # Dispatch on the class name rather than comparing with global class
    # objects: the comparison raised NameError unless the relevant estimator
    # classes had already been imported, and any estimator outside the known
    # four left `clf` undefined. Unknown estimators now use their defaults.
    estimatorName = getattr(classifier, '__name__', '')
    if estimatorName == 'SVC':
        clf = classifier(gamma=gamma, kernel=kernel)
    elif estimatorName == 'RandomForestClassifier':
        clf = classifier(n_estimators=estimators)
    else:
        clf = classifier()

    clf = clf.fit(X_train, y_train)
    predicted = clf.predict(X_test)
    return predicted, y_test

# 9. A Function for Evaluating a Model

Here, I am using the accuracy score to evaluate the model, to check how well the model is doing on the test dataset.
Accuracy score - because it is a classification problem.

-> Why accuracy measurement? - To know how well the model will perform in the future.

In [17]:
def evaluateModel(y_test, predictedValues):
    """Return the fraction of predictions that equal the true labels.

    Parameters
    ----------
    y_test
        True labels.
    predictedValues
        Predicted labels, compared element-wise with ``y_test``.

    Returns
    -------
    float
        Mean of the element-wise equality mask.
    """
    isCorrect = predictedValues == y_test
    return np.mean(isCorrect)

# 10. Implementing Five Classifiers

In [18]:
from sklearn.tree import DecisionTreeClassifier
# Decision tree: generateClassifier builds it with its default constructor arguments.
valuesPredicated, y_test = generateClassifier(
    features=Features,
    labels=Target,
    classifier=DecisionTreeClassifier,
)
accuracyScoreDT = evaluateModel(y_test=y_test, predictedValues=valuesPredicated)
print("The Accuracy for Decision Tree model is", accuracyScoreDT)

The Accuracy for Decision Tree model is 0.6340522290360366


In [19]:
from sklearn.svm import SVC
# SVM: generateClassifier's defaults apply (gamma='scale', kernel='linear').
valuesPredicatedSVM, y_test = generateClassifier(
    features=Features,
    labels=Target,
    classifier=SVC,
)
accuracyScoreSVC = evaluateModel(y_test=y_test, predictedValues=valuesPredicatedSVM)
print("The Accuracy for SVM model is", accuracyScoreSVC)

The Accuracy for SVM model is 0.8047267966650589


In [20]:
from sklearn.ensemble import RandomForestClassifier
# The tree count must be passed by keyword: as a fourth positional argument
# the value 200 was bound to `gamma`, not to `estimators`.
valuesPredicatedRF, y_test = generateClassifier(Features, Target, RandomForestClassifier, estimators=200)
accuracyScoreRF = evaluateModel(y_test, valuesPredicatedRF)
print("The Accuracy for Random Forest model is", accuracyScoreRF)

The Accuracy for Random Forest model is 0.7283814511127954


In [21]:
from sklearn.linear_model import LinearRegression
# NOTE(review): LinearRegression is a regressor; its continuous predictions
# are compared for exact equality with integer class labels, which is why the
# reported accuracy is 0.0. Consider a classifier here instead.
valuesPredicatedLR, y_test = generateClassifier(Features, Target, LinearRegression)
accuracyScoreLR = evaluateModel(y_test, valuesPredicatedLR)
# The message previously named the wrong model ("Random Forest").
print("The Accuracy for Linear Regression model is", accuracyScoreLR)

The Accuracy for Random Forest model is 0.0


In [15]:
import numpy as np
import matplotlib.pyplot as plt
from keras import models, layers, optimizers, datasets, utils

X_train, X_test, y_train, y_test = splitDataset(Features, Target, 0.3, 25)

# Derive the layer sizes from the data instead of hard-coding them: the
# vocabulary size and the number of label codes change whenever the corpus or
# the preprocessing changes, and a stale constant makes the model fail to fit.
numFeatures = X_train.shape[1]
numClasses = int(Target.max()) + 1   # labels are integer codes starting at 0

# One-hot encode the integer labels for the softmax / cross-entropy setup.
y_train = utils.to_categorical(y_train, numClasses)
y_test = utils.to_categorical(y_test, numClasses)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

# Four hidden layers of 128 ReLU units followed by a softmax over the classes.
mlpInputs = layers.Input(shape=(numFeatures,))
n = layers.Dense(128, activation='relu')(mlpInputs)
n = layers.Dense(128, activation='relu')(n)
n = layers.Dense(128, activation='relu')(n)
n = layers.Dense(128, activation='relu')(n)
outcomes= layers.Dense(numClasses, activation='softmax')(n)

mlpModel = models.Model(inputs=mlpInputs, outputs=outcomes)

mlpModel.compile(loss='categorical_crossentropy',
              optimizer='Nadam', metrics=['accuracy'])

# NOTE(review): the test split doubles as validation data during training, so
# the score printed below is not an estimate on data unseen while fitting.
history=mlpModel.fit(X_train, y_train, batch_size=256, epochs=10, validation_data=(X_test, y_test))
valScore = mlpModel.evaluate(X_test, y_test)
print('Test Loss Value:', valScore[0],'Test Accuracy Score:', valScore[1])

(33862, 92535)
(14513, 92535)
(33862, 103)
(14513, 103)
Train on 33862 samples, validate on 14513 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss Value: 1.400886609997352 Test Accuracy Score: 0.7543581616604984


### GridSearch for Tuning the Hyperparameters

In [18]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
X_train, X_test, y_train, y_test = splitDataset(Features, Target, 0.3, 25)

# Grid search over every kernel/gamma combination for an SVC,
# fitted on the training part of the split only.
svm = SVC()
params = dict(
    kernel=['linear', 'poly', 'sigmoid'],
    gamma=['auto', 'scale'],
)
grid = GridSearchCV(estimator=svm, param_grid=params)
grid.fit(X_train, y_train)
grid.best_params_




In [19]:
# Predict the held-out split with the fitted grid-search object and score it.
predSVM = grid.predict(X_test)
accuracyScoreGridSVM = evaluateModel(y_test=y_test, predictedValues=predSVM)
print("The Accuracy for SVM model after hyparameter tuning is", accuracyScoreGridSVM)

The Accuracy for SVM model after hyparameter tuning is 0.810452527329295


### The best Model is SVM

SVM got the highest accuracy out of all the classifiers because it generalizes the complex relationships within the dataset. Although it takes more time to train, SVM is a robust algorithm, which means that it is not very sensitive to outliers.

SVM is not very prone to overfitting, and it is also generally used with a large number of features, which is the situation here.

All in all, the best algorithm for any dataset is the one that gives the highest accuracy without overfitting, and that's what SVM is doing for the given dataset.

## Reference:
[1] https://docs.python.org/3/library/zipfile.html

[2] https://www.crummy.com/software/BeautifulSoup/bs4/doc/

[3] https://docs.python.org/3/library/functions.html#map