In [1]:
import datetime

import pandas as pd

In [2]:
# Labels for the nine columns of the petitions export, in file order.
# They are supplied through names=, so the CSV is read as having no header
# row (header=None is what pandas applies whenever names is passed).
column_names = [
    'action', 'background', 'additional_details',
    'created_at', 'moderation_threshold_reached_at', 'opened_at', 'updated_at',
    'topics', 'state',
]
df = pd.read_csv('data/all_petitions_data.csv', header=None, names=column_names)

In [3]:
# Check which dtype pandas inferred for each column on load.
df.dtypes

action                             object
background                         object
additional_details                 object
created_at                         object
moderation_threshold_reached_at    object
opened_at                          object
updated_at                         object
topics                             object
state                              object
dtype: object

In [4]:
# Look at a single created_at value to see the raw timestamp string format.
df['created_at'].head(1)

0    2020-03-02T12:06:20.235Z
Name: created_at, dtype: object

In [5]:
# Keep only the columns used downstream, with created_at first.
# .copy() makes the result an independent frame: `df` is modified in later
# cells (column assignment, fillna), and doing that on a bare column subset
# raises SettingWithCopyWarning on pandas versions without copy-on-write.
df = df[['created_at', 'action', 'background', 'additional_details', 'topics']].copy()

In [6]:
# Preview the first five rows of the reduced frame.
df.head()

Unnamed: 0,created_at,action,background,additional_details,topics
0,2020-03-02T12:06:20.235Z,Protect the Environment – support carbon neutr...,"In November 2019, Members of the UK Youth Parl...",To preserve this world for the next generation...,[]
1,2020-03-03T11:21:56.566Z,Bring Hailsham’s roads back to a their former ...,“My aim is to get Hailsham streets and roads b...,I am wanting to start a petition because the r...,[]
2,2020-03-03T11:22:53.628Z,Give holiday caravan/lodge owners the same sta...,Owners of holiday caravans and lodges need the...,As a multi-million pound consumer industry wit...,[]
3,2020-03-03T11:23:13.336Z,Increase the criteria for a successful referen...,"Referendums can and have been deeply divisive,...",I propose increasing the threshold to 60%+ of ...,[]
4,2020-03-03T11:23:23.361Z,Allow conference speakers to be able to speak ...,"As of right now, you require a Tier 5 visa to ...",This is a negative for the country as a whole ...,[]


In [7]:
# Parse the timestamp strings (e.g. 2020-03-02T12:06:20.235Z) into datetimes.
# The trailing "Z" marks UTC, so the column becomes timezone-aware.
df['created_at'] = pd.to_datetime(df['created_at'])

In [8]:
# Preview again to see how created_at displays after parsing.
df.head()

Unnamed: 0,created_at,action,background,additional_details,topics
0,2020-03-02 12:06:20.235000+00:00,Protect the Environment – support carbon neutr...,"In November 2019, Members of the UK Youth Parl...",To preserve this world for the next generation...,[]
1,2020-03-03 11:21:56.566000+00:00,Bring Hailsham’s roads back to a their former ...,“My aim is to get Hailsham streets and roads b...,I am wanting to start a petition because the r...,[]
2,2020-03-03 11:22:53.628000+00:00,Give holiday caravan/lodge owners the same sta...,Owners of holiday caravans and lodges need the...,As a multi-million pound consumer industry wit...,[]
3,2020-03-03 11:23:13.336000+00:00,Increase the criteria for a successful referen...,"Referendums can and have been deeply divisive,...",I propose increasing the threshold to 60%+ of ...,[]
4,2020-03-03 11:23:23.361000+00:00,Allow conference speakers to be able to speak ...,"As of right now, you require a Tier 5 visa to ...",This is a negative for the country as a whole ...,[]


In [9]:
# Confirm created_at is now a timezone-aware datetime column.
df.dtypes

created_at            datetime64[ns, UTC]
action                             object
background                         object
additional_details                 object
topics                             object
dtype: object

In [10]:
# Count missing values per column.
df.isnull().sum()

created_at               0
action                   0
background               0
additional_details    1997
topics                   0
dtype: int64

In [11]:
# Replace missing text with empty strings so the three text fields can be
# concatenated without producing NaN.
# `df` is rebound to the returned frame instead of using inplace=True: the
# result is the same, but nothing is mutated in place.
df = df.fillna({'action': '', 'background': '', 'additional_details': ''})

# Re-count missing values to confirm none remain.
df.isnull().sum()

created_at            0
action                0
background            0
additional_details    0
topics                0
dtype: int64

In [12]:
# Build one free-text field per petition: action, background and
# additional_details joined by single spaces (an empty field still leaves
# its separating space in place).
df['full_text'] = df['action'] + ' ' + df['background'] + ' ' + df['additional_details']

In [13]:
# Narrow the frame to the timestamp, the combined text and the topics column.
df = df.loc[:, ['created_at', 'full_text', 'topics']]

In [14]:
# First attempt at selecting petitions that have at least one topic.
# NOTE: topics holds text rather than lists (an empty list is the string
# '[]', which has length 2), so len(x) > 0 is true for those rows as well and
# the filter does not exclude them.
df.loc[df['topics'].apply(lambda x: len(x) > 0)]

Unnamed: 0,created_at,full_text,topics
0,2020-03-02 12:06:20.235000+00:00,Protect the Environment – support carbon neutr...,[]
1,2020-03-03 11:21:56.566000+00:00,Bring Hailsham’s roads back to a their former ...,[]
2,2020-03-03 11:22:53.628000+00:00,Give holiday caravan/lodge owners the same sta...,[]
3,2020-03-03 11:23:13.336000+00:00,Increase the criteria for a successful referen...,[]
4,2020-03-03 11:23:23.361000+00:00,Allow conference speakers to be able to speak ...,[]
...,...,...,...
36954,2022-02-15 10:49:45.037000+00:00,Increase flexible working hours - in order to ...,[]
36955,2022-02-15 11:05:26.856000+00:00,create a separate price cap for renewable ener...,[]
36956,2022-02-15 12:50:28.884000+00:00,BRING BACK HOT RODS TO KFC MAKE KFC BRING THEM...,[]
36957,2022-02-15 14:07:45.190000+00:00,Nasal Spray We need a nasal spray for Autistic...,[]


In [15]:
# Inspect one raw topics cell to see how the value is actually stored.
df.at[0, 'topics']

'[]'

In [16]:
# topics is stored as text, and an empty list is the 2-character string '[]',
# so a petition with at least one topic has a topics string longer than 2.
df.loc[df['topics'].str.len().gt(2)]

Unnamed: 0,created_at,full_text,topics
56,2020-03-03 16:38:18.401000+00:00,Introduce Mandatory Ethnicity Pay Gap Reportin...,['race-and-equality']
57,2020-03-03 16:59:34.743000+00:00,Suspend sanctions against Iran for the duratio...,['covid-19']
111,2020-03-04 04:40:41.026000+00:00,Parliament to protect housing benefit payments...,['covid-19']
117,2020-03-04 07:53:13.862000+00:00,Ban spitting in the street especially in light...,['covid-19']
159,2020-03-04 15:11:39.565000+00:00,Will the gov support self-employed individuals...,['covid-19']
...,...,...,...
36829,2022-02-08 13:21:24.521000+00:00,Create an Operation Rescript medal for Armed F...,['covid-19']
36847,2022-02-08 23:49:33.707000+00:00,the govt must order a full inquiry into the im...,['covid-19']
36854,2022-02-09 12:59:18.741000+00:00,Do not let the government end the legal requir...,['covid-19']
36861,2022-02-09 17:23:20.975000+00:00,Confirm all subtopics that will be in 2022 GCS...,['covid-19']


In [17]:
# Inclusive bounds of the reporting window: 1 April 2020 to 31 March 2021.
start_date = pd.Timestamp('2020-04-01')
end_date = pd.Timestamp('2021-03-31')

In [18]:
# Petitions created in the 2020/21 financial year: keep rows whose created_at
# calendar date lies between start_date and end_date, both ends included.
created_on = df['created_at'].dt.date
in_fiscal_year = created_on.between(start_date.date(), end_date.date())

# reset_index returns a new frame renumbered from 0, independent of `df`.
petitions_2020fy = df.loc[in_fiscal_year].reset_index(drop=True)

In [19]:
# Convert created_at to "month commencing": midnight on the first day of the
# month in which each petition was created.
# The created_at column is removed from the frame and replaced by
# month_commencing.
month_commencing = petitions_2020fy.pop('created_at')

# tz_localize(None) drops the UTC offset but keeps the same wall-clock time,
# so the result is a timezone-naive datetime. Converting to a monthly Period
# and back to a timestamp snaps each value to the start of its month.
# These vectorised .dt operations replace a per-row Python lambda and also
# pass missing timestamps through as NaT instead of raising.
petitions_2020fy['month_commencing'] = (
    month_commencing.dt.tz_localize(None).dt.to_period('M').dt.to_timestamp()
)

In [20]:
# Save the fiscal-year subset; index=False keeps the row index out of the file.
petitions_2020fy.to_csv(
    'data/petitions_fiscal_year_2020.csv',
    index=False,
)