"""

NOTES:

this data source is an .ods file. Also, it's too long to reliably do with databaker.

For now I've just saved as .xls and used pandas dataframes to restructure it.
Longer term we may need some python to bridge ods->xls so we can pass the filename in as an argument to create
a pipeline.

CONVENTIONS

* oldDf - the "old dataframe". Whatever CSV or xls sheet is loaded at the time.
* newDf - the "new dataframe". A "csv in memory" we're gradually constructing.

"""

In [17]:
# Load the xls. Print the list of sheet names

import pandas as pd

# Open the workbook once and keep the list of its tab names.
xl = pd.ExcelFile("prc-pfa-mar2013-onwards-tables.xls")
sheets = xl.sheet_names

# Show every tab name, then (on its own line) the last three tabs.
print(sheets, sheets[-3:], sep="\n")

['Notes_', '2012-13', '2013-14', '2014-15', '2015-16', '2016-17', '2017-18']
['2015-16', '2016-17', '2017-18']


## Build Initial File

Pretty self explanatory. CHOICE_financialTocalendar lets you turn the financial->calendar years conversion on and off as needed.

This will have the dimensions (minus geo code) BUT will also include columns to build the hierarchy (we'll drop them once it's built).

end result is a dataframe variable called 'combinedDf' and some feedback on cube size and sparsity per tab.


In [18]:

# When True, financial years/quarters are hacky-converted to calendar years
# and quarters, e.g.
#   2012/2013 1  ->  2012 Q2
#   2012/2013 2  ->  2012 Q3
#   2012/2013 3  ->  2012 Q4
#   2012/2013 4  ->  2013 Q1
# When False, time is taken as-is with a "Q" put in front of the quarter.
CHOICE_financialTocalendar = False

# "Force Name" values that are not place names; their rows are dropped.
non_geographic = ["Financial Fraud Action UK", "CIFAS", "Action Fraud", "British Transport Police"]

# Financial quarter number -> calendar quarter label.
LOOKUP_quarters = {1: "Q2", 2: "Q3", 3: "Q4", 4: "Q1"}


allDataframes = []
for sheetName in sheets[-3:]:  # only the last three tabs are processed

    # Turn sheet into "old" dataframe and drop the non-place rows now, before
    # we start copying columns. The explicit .copy() makes oldDf an independent
    # frame, so the column assignments below change oldDf itself rather than a
    # temporary slice of the parsed sheet.
    oldDf = xl.parse(sheetName)
    oldDf = oldDf[~oldDf["Force Name"].isin(non_geographic)].copy()

    # Create new blank dataframe
    newDf = pd.DataFrame()

    # Create a Column: V4_0
    # =========================

    # Find whatever they're calling the obs column in this sheet
    if "Force Offences" in oldDf.columns.values:
        offenceCol = "Force Offences"
    elif "Number of Offences" in oldDf.columns.values:
        offenceCol = "Number of Offences"
    else:
        raise ValueError("Cannot identify ONS col. Expecting one of: Force Offences, Number of Offences. Tab: " + sheetName)

    newDf["V4_0"] = oldDf[offenceCol]

    # Create Columns: Time | Time_codelist
    # =========================
    newDf["Time_codelist"] = "Quarters"

    if CHOICE_financialTocalendar:

        # Start by swapping the quarters column
        oldDf["Financial Quarter"] = oldDf["Financial Quarter"].map(lambda x: LOOKUP_quarters[x])

        # Cut years based on quarters (quarter 1 is the year after the / in
        # "YYYY/YY", the others are before). A single .loc assignment per mask
        # is used because chained indexing (df[col][mask] = ...) may write to a
        # temporary copy and leave the frame unchanged.
        isQ1 = oldDf["Financial Quarter"] == "Q1"
        oldDf.loc[isQ1, "Financial Year"] = oldDf.loc[isQ1, "Financial Year"].map(lambda x: x[:2] + x.split("/")[1][-2:])
        oldDf.loc[~isQ1, "Financial Year"] = oldDf.loc[~isQ1, "Financial Year"].map(lambda x: x[:4])

        newDf["Time"] = oldDf["Financial Year"] + ' ' + oldDf["Financial Quarter"]

    else:

        # Take time as-is but add the Q to quarter
        newDf["Time"] = oldDf["Financial Year"] + " " + oldDf["Financial Quarter"].map(lambda x: "Q" + str(x))

    # Create Columns: Geography | Geography_codelist
    # =========================
    # The codelist column is created empty here so it sits in the right place;
    # it still has to be filled in with the geography codes afterwards.
    newDf["Geography_codelist"] = ""
    newDf["Geography"] = oldDf["Force Name"]

    # Create Columns: Offence | Offence_codelist
    # =========================
    newDf["Offence_codelist"] = oldDf["Offence Code"]
    newDf["Offence"] = oldDf["Offence Description"]

    # Data we'll need to build hierarchy (will drop it before we output the V4)
    newDf["Offence Group"] = oldDf["Offence Group"]
    newDf["Offence Subgroup"] = oldDf["Offence Subgroup"]
    newDf["Offence Description"] = oldDf["Offence Description"]

    allDataframes.append(newDf)


# Combine all
combinedDf = pd.concat(allDataframes)

combinedDf.columns.values

array(['V4_0', 'Time_codelist', 'Time', 'Geography_codelist', 'Geography',
       'Offence_codelist', 'Offence', 'Offence Group', 'Offence Subgroup',
       'Offence Description'], dtype=object)

In [19]:
"""
Code Changes

combinedDf["Offence_codelist"] = combinedDf["Offence_codelist"].map(lambda x: str(x).replace("1/4.1/4.2/4.10", "1/4.1/4.x"))
combinedDf["Offence_codelist"] = combinedDf["Offence_codelist"].map(lambda x: str(x).replace("1/4.1/4.10/4.2", "1/4.1/4.x"))

combinedDf["Offence_codelist"][combinedDf["Offence_codelist"] == "62A"] = "62A-65"
combinedDf["Offence_codelist"][combinedDf["Offence_codelist"] == "65"] = "62A-65"

"""

'\nCode Changes\n\ncombinedDf["Offence_codelist"] = combinedDf["Offence_codelist"].map(lambda x: str(x).replace("1/4.1/4.2/4.10", "1/4.1/4.x"))\ncombinedDf["Offence_codelist"] = combinedDf["Offence_codelist"].map(lambda x: str(x).replace("1/4.1/4.10/4.2", "1/4.1/4.x"))\n\ncombinedDf["Offence_codelist"][combinedDf["Offence_codelist"] == "62A"] = "62A-65"\ncombinedDf["Offence_codelist"][combinedDf["Offence_codelist"] == "65"] = "62A-65"\n\n'

In [20]:

"""
Label changes.

combinedDf["Offence"] = combinedDf["Offence"].map(lambda x: str(x).replace("Abuse of children through prostitution and pornography", "Abuse of children through sexual exploitation"))
combinedDf["Offence"] = combinedDf["Offence"].map(lambda x: str(x).replace("Causing death by driving: unlicensed drivers etc.", "Causing death or serious injury by driving: unlicensed drivers etc."))
combinedDf["Offence"] = combinedDf["Offence"].map(lambda x: str(x).replace("Abuse of children through prostitution and pornography", "Abuse of children through sexual exploitation"))

"""

'\nLabel changes.\n\ncombinedDf["Offence"] = combinedDf["Offence"].map(lambda x: str(x).replace("Abuse of children through prostitution and pornography", "Abuse of children through sexual exploitation"))\ncombinedDf["Offence"] = combinedDf["Offence"].map(lambda x: str(x).replace("Causing death by driving: unlicensed drivers etc.", "Causing death or serious injury by driving: unlicensed drivers etc."))\ncombinedDf["Offence"] = combinedDf["Offence"].map(lambda x: str(x).replace("Abuse of children through prostitution and pornography", "Abuse of children through sexual exploitation"))\n\n'

In [21]:

"""
Move "outcomes only" to a datamarker

# Tempcolumn to hold T/F for outcome(s) only
combinedDf["hasOutComesOnly"] = False  # default
combinedDf["hasOutComesOnly"][combinedDf["Offence"].map(lambda x: "outcome only" in x.lower())] = True
combinedDf["hasOutComesOnly"][combinedDf["Offence"].map(lambda x: "outcomes only" in x.lower())] = True

# Remove "Outcome only" or "outcomes only" and replace with a data marker)
combinedDf["Offence"] = combinedDf["Offence"].map(lambda x: x.replace("(outcome only)", ""))
combinedDf["Offence"] = combinedDf["Offence"].map(lambda x: x.replace("(outcomes only)", ""))
combinedDf["Offence"] = combinedDf["Offence"].map(lambda x: x.strip())

# Add a data marking, and remove the temo column
combinedDf["Data_Marking"][combinedDf["hasOutComesOnly"] == True] = "*"
combinedDf = combinedDf.drop("hasOutComesOnly", axis=1)

"""

'\nMove "outcomes only" to a datamarker\n\n# Tempcolumn to hold T/F for outcome(s) only\ncombinedDf["hasOutComesOnly"] = False  # default\ncombinedDf["hasOutComesOnly"][combinedDf["Offence"].map(lambda x: "outcome only" in x.lower())] = True\ncombinedDf["hasOutComesOnly"][combinedDf["Offence"].map(lambda x: "outcomes only" in x.lower())] = True\n\n# Remove "Outcome only" or "outcomes only" and replace with a data marker)\ncombinedDf["Offence"] = combinedDf["Offence"].map(lambda x: x.replace("(outcome only)", ""))\ncombinedDf["Offence"] = combinedDf["Offence"].map(lambda x: x.replace("(outcomes only)", ""))\ncombinedDf["Offence"] = combinedDf["Offence"].map(lambda x: x.strip())\n\n# Add a data marking, and remove the temo column\ncombinedDf["Data_Marking"][combinedDf["hasOutComesOnly"] == True] = "*"\ncombinedDf = combinedDf.drop("hasOutComesOnly", axis=1)\n\n'

In [22]:
"""
There is one instance of a data point appearing with both outcomes and non outcomes with different data 
bedfordshire 2012/2013 1 (or 2012 Q2 if calendar) for codes 62A and 65. this created a duplicate line when we move
"outcomes" to a datamarker.


check = combinedDf[(combinedDf["Offence_codelist"] == "62A-65") & 
                   ((combinedDf["Time"] == "2012/13 Q4") | 
                    (combinedDf["Time"] == "2012/13 Q3") |
                    (combinedDf["Time"] == "2012/13 Q2") |
                    (combinedDf["Time"] == "2012/13 Q1"))
                    &
                   (combinedDf["Geography"] == "Bedfordshire")]
check
"""

'\nThere is one instance of a data point appearing with both outcomes and non outcomes with different data \nbedfordshire 2012/2013 1 (or 2012 Q2 if calendar) for codes 62A and 65. this created a duplicate line when we move\n"outcomes" to a datamarker.\n\n\ncheck = combinedDf[(combinedDf["Offence_codelist"] == "62A-65") & \n                   ((combinedDf["Time"] == "2012/13 Q4") | \n                    (combinedDf["Time"] == "2012/13 Q3") |\n                    (combinedDf["Time"] == "2012/13 Q2") |\n                    (combinedDf["Time"] == "2012/13 Q1"))\n                    &\n                   (combinedDf["Geography"] == "Bedfordshire")]\ncheck\n'

In [23]:
"""
Have removed the one without the data marker.


unwanted = combinedDf[(combinedDf["Offence_codelist"] == "62A-65") & 
                   (combinedDf["Time"] == "2012/13 Q4") &
                   (combinedDf["Geography"] == "Bedfordshire") &
                   (combinedDf["Offence Description"] == "Violent disorder (outcomes only)")]

combinedDf = combinedDf.drop(combinedDf.index[unwanted.index])

"""

'\nHave removed the one without the data marker.\n\n\nunwanted = combinedDf[(combinedDf["Offence_codelist"] == "62A-65") & \n                   (combinedDf["Time"] == "2012/13 Q4") &\n                   (combinedDf["Geography"] == "Bedfordshire") &\n                   (combinedDf["Offence Description"] == "Violent disorder (outcomes only)")]\n\ncombinedDf = combinedDf.drop(combinedDf.index[unwanted.index])\n\n'

---

## Get Geography Codes

Have used the "police force areas" csv from the Open Geography Portal to look these up.
http://geoportal.statistics.gov.uk/datasets/police-force-areas-december-2017-names-and-codes-in-the-united-kingdom

Included in repo as "Police_Force_Areas_December_2016.csv".

In [24]:

# Read the police force areas CSV into a dataframe.
policeAreasCSV = pd.read_csv("Police_Force_Areas_December_2016.csv")

# Preview the first five rows.
policeAreasCSV.head(5)

Unnamed: 0,PFA16CD,PFA16NM,FID
0,E23000001,Metropolitan Police,1
1,E23000002,Cumbria,2
2,E23000003,Lancashire,3
3,E23000004,Merseyside,4
4,E23000005,Greater Manchester,5


In [25]:

# Cut the blanks at the end: blank cells become "" and rows without an area
# name are removed. The .copy() makes the filtered result an independent frame
# so the column assignment below is not applied to a slice of the original.
policeAreasCSV = policeAreasCSV.fillna("")
policeAreasCSV = policeAreasCSV[policeAreasCSV["PFA16NM"] != ""].copy()

# Re-phrase the lookup's "City of London" as "London, City of"
# (presumably the wording used in the "Force Name" column - verify against the data).
policeAreasCSV["PFA16NM"] = policeAreasCSV["PFA16NM"].map(lambda x: x.replace("City of London", "London, City of"))

# Build the name -> code dict/map row by row, so every name is paired with the
# code from its own row. De-duplicating the two columns separately and zipping
# them would shift the pairing as soon as either column held a repeated value.
policeLookup = dict(zip(policeAreasCSV["PFA16NM"], policeAreasCSV["PFA16CD"]))

# Report every geography that has no code in one go (still a KeyError, as a
# plain dict lookup would raise, but naming all the offenders).
unmatched = [name for name in combinedDf["Geography"].unique() if name not in policeLookup]
if unmatched:
    raise KeyError("No police force area code found for: " + ", ".join(str(name) for name in unmatched))

# Insert the codes
combinedDf["Geography_codelist"] = combinedDf["Geography"].map(lambda x: policeLookup[x])

# Sanity check
combinedDf["Geography_codelist"].unique()

array(['E23000036', 'E23000026', 'E23000023', 'E23000006', 'E23000013',
       'E23000002', 'E23000018', 'E23000035', 'E23000039', 'E23000008',
       'W15000004', 'E23000028', 'E23000037', 'E23000005', 'W15000002',
       'E23000030', 'E23000027', 'E23000012', 'E23000032', 'E23000003',
       'E23000021', 'E23000020', 'E23000034', 'E23000004', 'E23000001',
       'E23000024', 'W15000001', 'E23000009', 'E23000022', 'E23000007',
       'E23000019', 'W15000003', 'E23000011', 'E23000015', 'E23000025',
       'E23000031', 'E23000033', 'E23000029', 'E23000017', 'E23000016',
       'E23000014', 'E23000010', 'E23000038'], dtype=object)


---

## Build Offences hierarchy

NOTE - offences is presented in a semi-hierarchical way, with only the children having data and the layers above
only being used to categorise. Will insert holding-codes for the upper layers (shouldn't matter since they're not
exposed) and build it anyway to see what it looks like.


In [26]:
"""

Using the columns:
offence Description | Offence Group | Offence Subgroup | Offence Code

We need to build a CSV with the structure:
Codelist | Code | Label| ParentCode

so we can use build a cypher file with:
https://github.com/ONSdigital/dp-hierarchy-builder/tree/cmd-develop/cmd/hierarchy-transformer
"""

# Codelist identifier written on every hierarchy row.
codeList = "offence"

# Initialise with a generic parent (disabled):
# hierarchyCSV = {
#     "Codelist":[codeList],
#     "Code":["CrimeParent"],
#     "Label":["Offences"],
#     "ParentCode":[" "]
# }

# Codes for the top level: each distinct "Offence Group" label maps to a
# lower-case, hyphen-separated version of itself.
# NOTE(review): strip() runs after the spaces have already been turned into
# hyphens, so it cannot trim leading/trailing spaces from a label - confirm
# whether stripping first was intended.
LOOKUP_OG = {
    groupLabel: groupLabel.replace(" ", "-").lower().strip()
    for groupLabel in combinedDf["Offence Group"].unique()
}

# Sanity check dict/map
from pprint import pprint
pprint(LOOKUP_OG)
    

{'Criminal damage and arson': 'criminal-damage-and-arson',
 'Drug offences': 'drug-offences',
 'Fraud offences': 'fraud-offences',
 'Miscellaneous crimes against society': 'miscellaneous-crimes-against-society',
 'Possession of weapons offences': 'possession-of-weapons-offences',
 'Public order offences': 'public-order-offences',
 'Robbery': 'robbery',
 'Sexual offences': 'sexual-offences',
 'Theft offences': 'theft-offences',
 'Violence against the person': 'violence-against-the-person'}


In [27]:

# Now add them to our hierarhcyCSV and sanity check it too
# Start the hierarchy table with one row per offence group: the group's code,
# its label, and an empty parent code. Columns are kept as parallel lists.
hierarchyCSV = {
    "Codelist": [codeList for _ in LOOKUP_OG],
    "Code": list(LOOKUP_OG.values()),
    "Label": list(LOOKUP_OG.keys()),
    "ParentCode": ["" for _ in LOOKUP_OG],
}


# Make dataframe and sanity check
newDf = pd.DataFrame.from_dict(hierarchyCSV)
newDf
    

Unnamed: 0,Code,Codelist,Label,ParentCode
0,miscellaneous-crimes-against-society,offence,Miscellaneous crimes against society,
1,sexual-offences,offence,Sexual offences,
2,theft-offences,offence,Theft offences,
3,criminal-damage-and-arson,offence,Criminal damage and arson,
4,violence-against-the-person,offence,Violence against the person,
5,drug-offences,offence,Drug offences,
6,possession-of-weapons-offences,offence,Possession of weapons offences,
7,public-order-offences,offence,Public order offences,
8,robbery,offence,Robbery,
9,fraud-offences,offence,Fraud offences,


In [28]:

# Next level down: one hierarchy row per "Offence Subgroup", parented on the
# code of its "Offence Group". The data is walked row by row, so this is slow.
#
# NOTE(review): the code for a subgroup is built from the "Offence" text of the
# first row seen for that subgroup, not from the subgroup label itself -
# confirm this is intended.

labelsAlreadySeen = []   # subgroup labels that already have a hierarchy row
LOOKUP_SG = {}           # subgroup label -> code, needed for the last stage

for index, row in combinedDf.iterrows():

    subgroup = row["Offence Subgroup"]

    # Each subgroup is only added the first time it turns up.
    if subgroup in labelsAlreadySeen:
        continue

    # code-ify: spaces to hyphens, lower case
    code = row["Offence"].replace(" ", "-").lower().strip()

    hierarchyCSV["Codelist"].append(codeList)
    hierarchyCSV["Code"].append(code)
    hierarchyCSV["Label"].append(subgroup)
    hierarchyCSV["ParentCode"].append(LOOKUP_OG[row["Offence Group"]])

    labelsAlreadySeen.append(subgroup)
    LOOKUP_SG[subgroup] = code

# Sanity check. Use last 10 rows
newDf = pd.DataFrame.from_dict(hierarchyCSV)
newDf[-10:]



Unnamed: 0,Code,Codelist,Label,ParentCode
25,other-offences-against-the-state-or-public-order,offence,Public order offences,public-order-offences
26,rape-of-a-female-aged-16-and-over,offence,Rape,sexual-offences
27,robbery-of-business-property,offence,Robbery of business property,robbery
28,robbery-of-personal-property,offence,Robbery of personal property,robbery
29,shoplifting,offence,Shoplifting,theft-offences
30,theft-from-the-person,offence,Theft from the person,theft-offences
31,theft-from-vehicle,offence,Theft from a vehicle,theft-offences
32,theft-or-unauthorised-taking-of-a-pedal-cycle,offence,Bicycle theft,theft-offences
33,trafficking-in-controlled-drugs,offence,Trafficking of drugs,drug-offences
34,bankruptcy-and-insolvency-(outcomes-only),offence,Fraud offences to 2012/13,fraud-offences


In [29]:
"""

IMPORTANT - Homicide code
--------------------------

For whatever reason they seems to be intermitantly changing between two codes to represent homicide.
We need to make sure it really is intermitant.

i.e make damn sure no homicide figures are in twice with differing codes.

homicideCheckFrame = pd.DataFrame()

homicideCheckFrame["Time"] = combinedDf["Time"]
homicideCheckFrame["Offence Code"] = combinedDf["Offence_codelist"]
homicideCheckFrame["Offence Description"] = combinedDf["Offence"]

# Create a "just Homicide" dataframe
homicideCheckFrame = homicideCheckFrame[homicideCheckFrame["Offence Description"] == "Homicide"]

# Concatenate time and code into a new columm
homicideCheckFrame["TimeAndCrime"] = homicideCheckFrame["Time"] + homicideCheckFrame["Offence Code"]

# Slice down to unique combinatons of that
uniqueTimeCodeCombos = homicideCheckFrame["TimeAndCrime"].unique()

for combo in uniqueTimeCodeCombos:
    print(combo[:7], "  |  ", combo[7:])

    
combinedDf.columns.values

"""

'\n\nIMPORTANT - Homicide code\n--------------------------\n\nFor whatever reason they seems to be intermitantly changing between two codes to represent homicide.\nWe need to make sure it really is intermitant.\n\ni.e make damn sure no homicide figures are in twice with differing codes.\n\nhomicideCheckFrame = pd.DataFrame()\n\nhomicideCheckFrame["Time"] = combinedDf["Time"]\nhomicideCheckFrame["Offence Code"] = combinedDf["Offence_codelist"]\nhomicideCheckFrame["Offence Description"] = combinedDf["Offence"]\n\n# Create a "just Homicide" dataframe\nhomicideCheckFrame = homicideCheckFrame[homicideCheckFrame["Offence Description"] == "Homicide"]\n\n# Concatenate time and code into a new columm\nhomicideCheckFrame["TimeAndCrime"] = homicideCheckFrame["Time"] + homicideCheckFrame["Offence Code"]\n\n# Slice down to unique combinatons of that\nuniqueTimeCodeCombos = homicideCheckFrame["TimeAndCrime"].unique()\n\nfor combo in uniqueTimeCodeCombos:\n    print(combo[:7], "  |  ", combo[7:])\n\n

In [None]:

import csv

# Last level: the offences themselves, parented on their subgroup's code.
#
# Each distinct (code, label, parent) node is added to the hierarchy once, and
# at the same time we check that every label maps to exactly one code and every
# code to exactly one label. The hierarchy file is only written when that
# 1-to-1 check passes.
labelsAlreadySeen = {}       # offence label -> code first seen with it
codesAlreadySeen = {}        # offence code -> label most recently recorded for it
nodesAlreadyAdded = set()    # (code, label, parent code) already in hierarchyCSV
issuesLog = []

for index, row in combinedDf.iterrows():

    # Strip any trailing spaces etc in code
    code = str(row["Offence_codelist"]).strip()
    label = row["Offence"]
    parentCode = LOOKUP_SG[row["Offence Subgroup"]]

    # The data has one row per offence/geography/time combination, so the same
    # node turns up many times; only its first appearance becomes a hierarchy row.
    node = (code, label, parentCode)
    if node not in nodesAlreadyAdded:
        nodesAlreadyAdded.add(node)
        hierarchyCSV["Codelist"].append(codeList)
        hierarchyCSV["Code"].append(code)
        hierarchyCSV["Label"].append(label)
        hierarchyCSV["ParentCode"].append(parentCode)

    if label not in labelsAlreadySeen:

        # New label: make sure its code hasn't already been used for another label
        if code in codesAlreadySeen:
            errorOut = """
            A single code is used for multiple labels:
            {code}
            {a}
            {b}
            """.format(code=code,
                       a=label,
                       b=codesAlreadySeen[code])
            if errorOut not in issuesLog:
                issuesLog.append(errorOut)

        labelsAlreadySeen[label] = code
        codesAlreadySeen[code] = label

    elif labelsAlreadySeen[label] != code:

        # Known label turning up with a different code
        # .... this is how we knew about Homicide
        errorOut = """
        A single label has multiple codes:
        {label}
        {a}
        {b}
        """.format(label=label,
                   a=labelsAlreadySeen[label],
                   b=code)
        if errorOut not in issuesLog:
            issuesLog.append(errorOut)

if len(issuesLog) > 0:
    print("Error: There is not a 1-to-1 relationship between desciption and code")
    for issue in issuesLog:
        print(issue)
else:
    print("Codes are fine.")

    # Output our hierarchy file, with the columns in the order the
    # hierarchy CSV structure calls for.
    newDf = pd.DataFrame.from_dict(hierarchyCSV)
    outFile = newDf[["Codelist", "Code", "Label", "ParentCode"]]

    outFile.to_csv("Hirarchy_CSV_Offences.csv", quoting=csv.QUOTE_ALL, index=False)
    

Codes are fine.


In [None]:

"""
Combine each dataframe into once big one. Then check sparsity and output to csv
"""

# The hierarchy helper columns have served their purpose: remove whichever of
# them are present before the data is written out.
unWanted = ["Offence Description","Offence Group","Offence Subgroup","Offence Code"]
combinedDf = combinedDf.drop(
    [column for column in combinedDf.columns.values if column in unWanted],
    axis=1,
)

# Sparsity: compare the number of rows we hold with the size of the full
# offence x geography x time cube.
d1, d2, d3 = (
    len(combinedDf[dimension].unique())
    for dimension in ("Offence", "Geography", "Time")
)
cubeSize = d1 * d2 * d3
rowCount = len(combinedDf)

print("COMBINED CUBE")
print("Logical Cube Size: ", cubeSize)
print("Number of rows:    ", rowCount)
print("Sparsity:          ", 100 - ((100/cubeSize)*rowCount))
print("")

# The output file name records which time convention was used.
outName = "CALENDAR_CimeWithHomeOffice.csv" if CHOICE_financialTocalendar else "FINANCE_CimeWithHomeOffice.csv"

combinedDf.to_csv(outName, index=False)
print("CSV Written to V4")


COMBINED CUBE
Logical Cube Size:  58480
Number of rows:     55040
Sparsity:           5.882352941176478



---

## Build codelists

We need codelist IDs for the hierarchy so need to sort this out first.



In [None]:

# Geography codelist: every distinct label/code pair found in the data.
codelist = combinedDf[["Geography", "Geography_codelist"]].drop_duplicates()
codelist.columns = ["geography", "geography_codelist"]
codelist.to_csv("CL_geography_codelist.csv", index=False)


# Offence codelist: same again for the offence label/code pairs.
codelist = combinedDf[["Offence", "Offence_codelist"]].drop_duplicates()
codelist.columns = ["offence", "offence_codelist"]
codelist.to_csv("CL_offence_codelist.csv", index=False)
