In [0]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf

In [0]:
hansard = pd.read_csv('/content/gdrive/My Drive/hansard-1995-2018.csv')


In [0]:
hansard = hansard.drop(hansard.columns[[0]], axis=1) 

In [0]:
hansard.shape

(1380017, 11)

In [0]:
hansard.head()

Unnamed: 0,pp_id,speech,year,as_speaker,proper_name,word_count,gender,party,party_group,government,age
0,1995-01-10.1.6,To ask the Secretary of State for Employment i...,1995,False,Tom Cox,71.0,Male,Labour,Labour,False,65.0
1,1995-01-10.1.7,"The latest available figures show that, in Nov...",1995,False,Phillip Oppenheim,86.0,Male,Conservative,Conservative,True,38.0
2,1995-01-10.1.8,"Whatever the Minister may say, is he aware tha...",1995,False,Tom Cox,138.0,Male,Labour,Labour,False,65.0
3,1995-01-10.1.9,What the hon. Gentleman does not say is that m...,1995,False,Phillip Oppenheim,157.0,Male,Conservative,Conservative,True,38.0
4,1995-01-10.10.0,The only person who is ducking questions is th...,1995,False,Phillip Oppenheim,237.0,Male,Conservative,Conservative,True,38.0


In [0]:
hansard.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1380017 entries, 0 to 1380016
Data columns (total 11 columns):
pp_id          1380017 non-null object
speech         1380014 non-null object
year           1380017 non-null int64
as_speaker     1380017 non-null bool
proper_name    1380017 non-null object
word_count     1304511 non-null float64
gender         1380017 non-null object
party          1380017 non-null object
party_group    1304511 non-null object
government     1300361 non-null object
age            1300318 non-null float64
dtypes: bool(1), float64(2), int64(1), object(7)
memory usage: 106.6+ MB


**Task to be completed :-**
1. Assign Speech to a Party Group (Mutli-class classification)
2. Identify sentiment in a speech (in favour/against)
3. Topic Modelling
4. Speech Generation

The 'Party_Group' class column contains some NULL entries as seen from the 'hansard.info()' call. We will drop the rows with null entries as the splitting function requires non-null column. We can also impute the entries based on the information from 'Party' coulmn but we will not do it for now. Let's drop the NULL entries from Speech and Party_Group columns

In [0]:
hansard = hansard.dropna(subset=['speech', 'party_group'])
hansard.shape

(1304511, 11)

In [0]:
# We have 5 Party_Groups. ** verify if balancing of classes is needed in this case

hansard['party_group'].value_counts() / len(hansard)

Labour              0.424828
Conservative        0.414913
Liberal Democrat    0.075994
Other               0.065620
SNP                 0.018645
Name: party_group, dtype: float64

Let us split the data into train, validation and test sets.
We will use Stratified Sampling from sklearn for splitting the data. Stratified Sampling ensures that we maintain the same ratio of classes in the test data as the training set. This helps in avoiding any sampling bias.

In [0]:
from sklearn.model_selection import StratifiedShuffleSplit

# Split data into 10% test and 90% train set

split = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=42)
for train_index, test_index in split.split(hansard, hansard['party_group']):
  train_data = hansard.loc[train_index]
  hansard_test = hansard.loc[test_index]

In [0]:
print('Party_Group distribution in training set: ')
print(train_data['party_group'].value_counts() / len(hansard))
print('\nParty_Group distribution in test set: ')
print(hansard_test['party_group'].value_counts() / len(hansard))

Party_Group distribution in training set: 
Labour              0.382345
Conservative        0.373421
Liberal Democrat    0.068394
Other               0.059058
SNP                 0.016781
Name: party_group, dtype: float64

Party_Group distribution in test set: 
Labour              0.042483
Conservative        0.041491
Liberal Democrat    0.007600
Other               0.006562
SNP                 0.001864
Name: party_group, dtype: float64


In [0]:
# Split training data into 20% validation and 80% train set

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=43)
for training_index, val_index in split.split(train_data, train_data['party_group']):
  hansard_train = train_data.loc[training_index]
  hansard_val = train_data.loc[val_index]

In [0]:
print('Party_Group distribution in training set: ')
print(hansard_train['party_group'].value_counts() / len(hansard_train))
print('\nParty_Group distribution in validation set: ')
print(hansard_val['party_group'].value_counts() / len(hansard_val))
print('\nParty_Group distribution in test set: ')
print(hansard_test['party_group'].value_counts() / len(hansard_test))

Party_Group distribution in training set: 
Labour              0.400061
Conservative        0.353652
Liberal Democrat    0.074318
Other               0.060898
SNP                 0.011064
Name: party_group, dtype: float64

Party_Group distribution in validation set: 
Labour              0.399396
Conservative        0.352465
Liberal Democrat    0.074647
Other               0.061922
SNP                 0.011401
Name: party_group, dtype: float64

Party_Group distribution in test set: 
Labour              0.424831
Conservative        0.414911
Liberal Democrat    0.075997
Other               0.065618
SNP                 0.018643
Name: party_group, dtype: float64


We can see that the datasets have a similar distribution of the class variable


In [0]:
# written the datasets to a file and deleted the variables to save memory. Now we will only work on the train data
#del hansard
hansard_test.to_csv('hansard_test.csv')
hansard_val.to_csv('hansard_val.csv')
hansard_train.to_csv('hansard_train.csv')

In [0]:
del hansard_test
del hansard_val

In [0]:
hansard_train = pd.read_csv('/content/gdrive/My Drive/hansard_train.csv')

We will use the 'Speech' column to build feature vectors for classification. But first let us clean the text. We will
1.  Remove stopwords and do stemming using nltk
2.  Use Tokenizer API to remove punctuation and lowercase words. It will tokenize the words. We will also introduce an out-of-vocabulary token to account for unseen words in the test data

We will create a new column 'clean_speech' and append to the training data. The original column will not be removed because we may need to perform different pre-processing on the text depending on the task, example for Speech Generation, we need to keep the stopwords or punctuation as it is necessary for generating meaningful sentences.

In [0]:
hansard_train = hansard_train.dropna()

In [0]:
hansard_train.shape

(845316, 12)

In [0]:
from sklearn.feature_extraction import text
from nltk.stem.porter import * 
import re

stopword = text.ENGLISH_STOP_WORDS
stemmer = PorterStemmer()

# remove stopwords and stem the words
hansard_train['clean_speech'] = hansard_train['speech'].str.replace(r'\b(' + r'|'.join(stopword) + r')\b\s*', '')
hansard_train['clean_speech'] = hansard_train['speech'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))

In [0]:
hansard_train.head()

Unnamed: 0.1,Unnamed: 0,pp_id,speech,year,as_speaker,proper_name,word_count,gender,party,party_group,government,age,clean_speech
0,570445,2005-11-21.1233.1,May I put it to my hon. Friend that he ought v...,2005.0,False,John Bercow,186.0,Male,Conservative,Conservative,False,42.0,may I put it to my hon. friend that he ought v...
1,257741,1999-11-03.391.2,There might be a slight saving in the DSS pens...,1999.0,False,Harry Barnes,51.0,Male,Labour,Labour,True,63.0,there might be a slight save in the dss pensio...
2,812564,2010-06-03.613.2,"c ""Thank you, Mr Deputy Speaker. It is unusual...",2010.0,False,David Miliband,1395.0,Male,Labour,Labour,False,44.0,"c ""thank you, Mr deputi speaker. It is unusu i..."
4,1033291,2013-09-10.848.4,"c ""One pressure that applies equally in Wales ...",2013.0,False,Chris Bryant,234.0,Male,Labour,Labour,False,51.0,"c ""one pressur that appli equal in wale and in..."
5,955066,2012-06-18.701.1,It is slightly disingenuous of the hon. Gentle...,2012.0,False,Tom Harris,144.0,Male,Labour,Labour,False,48.0,It is slightli disingenu of the hon. gentleman...


Let us visualize the most frequent words used in the speeches for different party_groups using a Wordcloud

In [0]:
from wordcloud import WordCloud

def create_wordcloud(partyname):
  # combine all the tweets and build a wordCloud for 'Labour' party
  text = ' '.join([sent for sent in hansard_train['clean_speech'][hansard_train['party_group'] == partyname]])
  wordcloud = WordCloud(width=400, height=300, random_state=21, max_font_size=110).generate(text)
  plt.figure(figsize=(10, 7)) 
  plt.imshow(wordcloud, interpolation="bilinear") 
  plt.axis('off')

from wordcloud import WordCloud
from collections import Counter

wc = WordCloud()

counts_all = Counter()

with open('path/to/file.txt', 'r') as f:
    for line in f:  # Here you can also use the Cursor
        counts_line = wc.process_text(line)
        counts_all.update(counts_line)

wc.generate_from_frequencies(counts_all)
wc.to_file('/tmp/wc.png')

In [0]:
create_wordcloud('Labour')

In [0]:
create_wordcloud('Conservative')

In [0]:
create_wordcloud('Liberal Democrat')

In [0]:
create_wordcloud('Other')

In [0]:
create_wordcloud('SNP')