# Setup

In [2]:
import pandas as pd

In [3]:
# Load the combined TOY dataset (path is relative to the working directory)
final = pd.read_csv(filepath_or_buffer="Combined_TOYCOMP_BNF_NHS_data.csv")
print(final.shape)  # (rows, columns)

(3652, 43)


# Dropping NA and uninformative columns

In [4]:
# Count the missing (NA) values in each column
na_per_column = final.isna().sum()
print(na_per_column)

Unnamed: 0                           0
timepoint                            0
e8...                                0
name                                 0
address_1                            0
address_2                          117
address_3                           39
area                               381
postcode                             0
national_grouping                    0
high_level_health_geography          0
date_open                            0
date_close                        3652
status_code                          0
subtype                              0
commissioner                         0
setting_all_gp_reference             0
type                                 0
ccg_code                             0
ons_ccg_code                         0
sex                                  0
age                                  0
number_of_patients                   0
organisation_code                    0
ccg/pct                              0
primary_care_organisation

In [5]:
# Drop columns that are uninformative to the machine learning
# Drop columns that are uninformative to the machine learning.
# Each label is listed exactly once (the original list repeated "Unnamed: 0").
uninformative_cols = ["Unnamed: 0", "amended_record_indicator", "period", "setting_all_gp_reference", "timepoint", "address_1", "name"]
final.drop(columns=uninformative_cols, inplace=True)
final.shape  # Shows that the correct number of columns has been dropped

(3652, 36)

In [6]:
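#indexNames = final[final['date_close'] != 'NaN' ].index #Intended to find the rows whose date_close is not NaN (note: this drops rows, not columns)
    #Does not work as written: missing values are float NaN, not the string 'NaN', so the comparison is True for every row; final['date_close'].notna() would be the correct test
#final.drop(indexNames, inplace=True)
#final.shape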

In [7]:
# date_close is NA in all 3652 rows (see the NA counts above), so remove the empty column
final.drop(columns="date_close", inplace=True)

# Categorical columns section

Which columns might need converting?  
Shows that columns either contain booleans or objects

In [8]:
# Inspect the distinct values of the candidate categorical columns.
# A notebook cell only echoes its last expression, so the first result is
# printed explicitly; otherwise it is computed and silently discarded.
print(final['area'].unique())  # Shows that area is probably good to keep on a large scale
# Drop the other area-related variables - see q answered notebook 20.2.24
final['bnf.section'].unique()

array([ 3,  4,  7,  6,  2, 12, 30, 22, 60,  1, 10, 27, 80,  8,  5,  9, 94,
       35, 96, 21, 85, 11, 15, 50, 29, 28, 45, 14, 24, 70])

In [9]:
# Creating final data set to use, with formatted columns
# Drop the detailed address columns (area is used instead), together with the
# other listed columns that are not carried forward into the final data set.
not_carried_forward = [
    "address_2", "status_code", "subtype", "organisation_code", "postcode",
    "primary_care_organisation_type", "address_3", "bnf.chemical",
    "bnf.letters", "bnf.code", "practice", "ons_ccg_code", "sex", "age",
]
final.drop(columns=not_carried_forward, inplace=True)

In [10]:
# Assign an ordered list of numbers 
# Integer-encode each categorical column: every row receives the number of the
# group its value belongs to, stored in a new column named "<column>1".
for source_col in ('ccg_code', 'high_level_health_geography', 'commissioner', 'sha', 'bnf.name', 'e8...'):
    final[source_col + '1'] = final.groupby(source_col).ngroup()

# Remove the original columns now that numbered versions exist; "type" and
# "ccg/pct" are removed here too, without a numbered replacement.
superseded_cols = ["ccg_code", "type", "ccg/pct", "high_level_health_geography", "sha", "bnf.name", "commissioner", 'e8...']
final.drop(columns=superseded_cols, inplace=True)

In [11]:
# Assign into columns of types = ONE HOT ENCODING
# Create a dataset that converts categorical variables into a separate column per variable with 1/0
print(final.shape)
print('-'*20)
# `axis` must be passed by keyword: pandas 2.0 rejects positional arguments to
# concat() other than the list of objects. dtype="uint8" pins the 0/1 integer
# indicator columns (newer pandas would otherwise return booleans).
final = pd.concat([final, pd.get_dummies(final['area'], dtype="uint8")], axis=1)  # one indicator column per area value
print(final.shape)
print('-'*20)
final = pd.concat([final, pd.get_dummies(final['national_grouping'], dtype="uint8")], axis=1)
print('-'*20)
print(list(final.columns))  # Can see all the column names added on
# The source columns are redundant once their indicator columns exist
final.drop(["area", "national_grouping"], axis=1, inplace = True)

(3652, 19)
--------------------
(3652, 120)
--------------------
--------------------
['area', 'national_grouping', 'date_open', 'number_of_patients', 'join_parent_date', 'left_parent_date', 'items', 'nic', 'act.cost', 'quantity', 'bnf.chapter', 'bnf.section', 'bnf.paragraph', 'ccg_code1', 'high_level_health_geography1', 'commissioner1', 'sha1', 'bnf.name1', 'e8...1', 'ABINGDON  OXFORDSHIRE', 'ALENCON LINK  BASINGSTOKE', 'BARLBOROUGH  CHESTERFIELD', 'BEDFORDSHIRE', 'BERKSHIRE', 'BEVERLEY', 'BIRMINGHAM', 'BRADFORD', 'BRISTOL', 'BROOKLANDS  MILTON KEYNES', 'BROWNHILLS', 'BUCKINGHAMSHIRE', 'CAMBRIDGESHIRE', 'CHESHIRE', 'CHESTER  CHESHIRE', 'CLEVELAND', 'CO.DURHAM', 'COLCHESTER ESSEX', 'CORNWALL', 'COUNTY DURHAM', 'COVENTRY', 'CUMBRIA', 'DERBY', 'DERBYSHIRE', 'DEVIZES  WILTSHIRE', 'DEVON', 'DONCASTER', 'DORSET', 'DOVER  KENT', 'EAST SUSSEX', 'EAST YORKSHIRE', 'ENFIELD  MIDDLESEX', 'ESSEX', 'GLOUCESTER', 'GLOUCESTERSHIRE', 'GREAT BARR BIRMINGHAM', 'HAMPSHIRE', 'HARTLEPOOL CLEVELAND', 'HEREF

In [12]:
# The converted columns ("area" and "national_grouping") were already dropped at the end of the previous cell

In [13]:
# Convert bnf.chapter to a numeric dtype and store it back under the same name
chapter_as_number = pd.to_numeric(final['bnf.chapter'])
final['bnf.chapter'] = chapter_as_number

In [14]:
# DataFrame.info() writes its summary straight to stdout and returns None,
# so call it directly; wrapping it in print() appended a stray "None" line.
final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3652 entries, 0 to 3651
Columns: 125 entries, date_open to Y63
dtypes: float64(8), int64(9), uint8(108)
memory usage: 870.3 KB
None


In [265]:
# NOTE(review): `csv` is not referenced by any cell visible here; pandas writes the file itself.
import csv 
# Write the processed frame to final.csv (path is relative to the working directory).
# NOTE(review): to_csv also writes the row index as an unnamed first column by default -
# confirm that downstream readers expect it (otherwise pass index=False).
final.to_csv("final.csv")


In [15]:

# Preview the first five rows of the processed frame
final.head(n=5)

Unnamed: 0,date_open,number_of_patients,join_parent_date,left_parent_date,items,nic,act.cost,quantity,bnf.chapter,bnf.section,...,WOOLSTON SOUTHAMPTON,WORCESTERSHIRE,YORK,Y56,Y58,Y59,Y60,Y61,Y62,Y63
0,19740401.0,6132.0,19990401.0,20020331.0,1.0,5.02,419.13,476.0,4,3,...,0,0,0,1,0,0,0,0,0,0
1,19740401.0,6132.0,20020401.0,20130331.0,1.0,5.02,419.13,476.0,4,3,...,0,0,0,1,0,0,0,0,0,0
2,19870901.0,8755.0,20130401.0,,21.0,62.25,16.29,12.0,9,4,...,0,0,0,1,0,0,0,0,0,0
3,19870901.0,8755.0,20010401.0,20020331.0,21.0,62.25,16.29,12.0,9,4,...,0,0,0,1,0,0,0,0,0,0
4,19870901.0,8755.0,19990401.0,20010331.0,21.0,62.25,16.29,12.0,9,4,...,0,0,0,1,0,0,0,0,0,0
