# Initial look at the toy dataset

### Aims:   
    Try and cut down the toy dataset to just the rows that are being used   
    Get categorical variables into a format that can be used.  

In [50]:
import seaborn as sns
import matplotlib as plt
import pandas as pd

In [51]:
# Load the combined toy NHS dataset. pd.read_csv already returns a
# pandas DataFrame, so no further conversion is needed.
toy = pd.read_csv('Combined_TOY_NHS_data.csv')

In [52]:
# Show every column label, a 20-dash separator, then the (rows, columns)
# shape of the dataset -- one item per line.
print(list(toy.columns), '-' * 20, toy.shape, sep='\n')

['Unnamed: 0', 'timepoint', 'E8...', 'Name', 'Address_1', 'Address_2', 'Address_3', 'Area', 'Postcode', 'National_grouping', 'High_level_health_geography', 'Date_open', 'Date_close', 'Status_code', 'Subtype', 'Commissioner', 'Setting_all_gp_reference', 'TYPE', 'CCG_CODE', 'ONS_CCG_CODE', 'SEX', 'AGE', 'NUMBER_OF_PATIENTS', 'Organisation_code', 'CCG/PCT', 'Primary_care_organisation_type', 'Join_parent_date', 'Left_parent_date', 'Amended_record_indicator', 'SHA', 'PRACTICE', 'BNF.CODE', 'BNF.NAME', 'ITEMS', 'NIC', 'ACT.COST', 'QUANTITY', 'PERIOD']
--------------------
(53236, 38)


In [53]:
def rename_unname(df):
    """Drop, in place, every column whose label starts with 'Unnamed'.

    Despite its name, this function does not rename anything: it removes
    the matching columns from ``df`` itself and returns ``None``.

    Labels that are not strings (e.g. integer column labels) are left
    untouched instead of raising ``AttributeError``.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to modify in place.
    """
    # Collect the labels first so the frame is not mutated while its
    # columns are being iterated, then drop them all in a single call.
    unnamed = [
        col for col in df.columns
        if isinstance(col, str) and col.startswith('Unnamed')
    ]
    if unnamed:
        df.drop(unnamed, axis=1, inplace=True)
# Remove the leftover 'Unnamed...' column(s) from the frame in place.
rename_unname(toy)
# Lower-case every column title.
toy.columns = list(map(str.lower, toy.columns))
print(list(toy.columns))

['timepoint', 'e8...', 'name', 'address_1', 'address_2', 'address_3', 'area', 'postcode', 'national_grouping', 'high_level_health_geography', 'date_open', 'date_close', 'status_code', 'subtype', 'commissioner', 'setting_all_gp_reference', 'type', 'ccg_code', 'ons_ccg_code', 'sex', 'age', 'number_of_patients', 'organisation_code', 'ccg/pct', 'primary_care_organisation_type', 'join_parent_date', 'left_parent_date', 'amended_record_indicator', 'sha', 'practice', 'bnf.code', 'bnf.name', 'items', 'nic', 'act.cost', 'quantity', 'period']


### Filtering the dataset to remove rows that have more than 2 NAs

In [54]:
# Keep only the rows with at most two missing values: each surviving row
# must have at least (number of columns - 2) non-NA entries.
toycomp = toy.dropna(
    axis=0,
    thresh=toy.shape[1] - 2,
)
# Summary statistics of the filtered frame.
print(toycomp.describe())

       timepoint     date_open  date_close  setting_all_gp_reference  \
count     3652.0  3.652000e+03         0.0                    3652.0   
mean    201911.0  1.976598e+07         NaN                       4.0   
std          0.0  7.125229e+04         NaN                       0.0   
min     201911.0  1.974040e+07         NaN                       4.0   
25%     201911.0  1.974040e+07         NaN                       4.0   
50%     201911.0  1.974040e+07         NaN                       4.0   
75%     201911.0  1.974040e+07         NaN                       4.0   
max     201911.0  2.017090e+07         NaN                       4.0   

       number_of_patients  join_parent_date  left_parent_date  \
count         3652.000000      3.652000e+03      2.869000e+03   
mean         10502.274096      2.004848e+07      2.007300e+07   
std           6359.228447      5.788054e+04      5.579184e+04   
min              7.000000      1.999040e+07      2.000033e+07   
25%           6376.000000 

# Difference in cost
Try to work out the difference in cost per item, then classify it as high or low to give a binary variable.

In [55]:
# Preview the first five rows of the filtered dataset.
toycomp.head(n=5)


Unnamed: 0,timepoint,e8...,name,address_1,address_2,address_3,area,postcode,national_grouping,high_level_health_geography,...,amended_record_indicator,sha,practice,bnf.code,bnf.name,items,nic,act.cost,quantity,period
16,201911.0,E84020,JAI MEDICAL CENTRE (BRENT),82 STAG LANE,EDGWARE,MIDDLESEX,,HA8 5LP,Y56,Q71,...,0.0,Q61,E84020,0403010V0AAARAR,Ganfort_Eye Dps 300mcg/5mg,1.0,5.02,419.13,476.0,201911.0
18,201911.0,E84020,JAI MEDICAL CENTRE (BRENT),82 STAG LANE,EDGWARE,MIDDLESEX,,HA8 5LP,Y56,Q71,...,0.0,Q61,E84020,0403010V0AAARAR,Ganfort_Eye Dps 300mcg/5mg,1.0,5.02,419.13,476.0,201911.0
97,201911.0,E87701,THE ABINGDON HEALTH CENTRE,THE ABINGDON HEALTH CTRE,88-92 EARLS COURT ROAD,KENSINGTON,LONDON,W8 6EG,Y56,Q71,...,0.0,Q52,E87701,090402000BBRRA0,Fludroxycortide_Tape 7.5cm x 20cm,21.0,62.25,16.29,12.0,201911.0
98,201911.0,E87701,THE ABINGDON HEALTH CENTRE,THE ABINGDON HEALTH CTRE,88-92 EARLS COURT ROAD,KENSINGTON,LONDON,W8 6EG,Y56,Q71,...,0.0,Q52,E87701,090402000BBRRA0,Fludroxycortide_Tape 7.5cm x 20cm,21.0,62.25,16.29,12.0,201911.0
99,201911.0,E87701,THE ABINGDON HEALTH CENTRE,THE ABINGDON HEALTH CTRE,88-92 EARLS COURT ROAD,KENSINGTON,LONDON,W8 6EG,Y56,Q71,...,0.0,Q52,E87701,090402000BBRRA0,Fludroxycortide_Tape 7.5cm x 20cm,21.0,62.25,16.29,12.0,201911.0


In [56]:
import csv  # not used by the statement below; kept as in the original cell

# Save the filtered dataset. index=True is the pandas default: the row
# labels are written as an extra, unnamed first column of the CSV.
toycomp.to_csv("Combined_TOYCOMP_NHS_data.csv", index=True)