### Notebook to identify possible issues in incoming data

### Imports

In [235]:
import pandas as pd

In [236]:
plants_df = pd.read_json("../plants.json")
plants_df.set_index('plant_id', inplace=True)
plants_df.columns.name = plants_df.index.name
plants_df.index.name = None
plants_df

plant_id,name,temperature,origin_location,botanist,last_watered,soil_moisture,recording_taken,images,scientific_name
1,Venus flytrap,13.701703,"{'latitude': 43.74, 'longitude': -11.5098, 'ci...","{'name': 'Kenneth Buckridge', 'email': 'kennet...",2025-09-23T13:51:41.000Z,91.470924,2025-09-23T16:19:22.710Z,,
2,Corpse flower,13.734596,"{'latitude': 47.8428, 'longitude': -48.7087, '...","{'name': 'Ms. Diana King', 'email': 'ms..diana...",2025-09-23T14:58:34.000Z,95.193972,2025-09-23T16:19:23.282Z,,
3,Rafflesia arnoldii,15.435406,"{'latitude': -25.4878, 'longitude': -36.1349, ...","{'name': 'Eduardo Okuneva II', 'email': 'eduar...",2025-09-23T13:58:19.000Z,91.684596,2025-09-23T16:19:24.081Z,,
4,Black bat flower,16.003995,"{'latitude': 63.3661, 'longitude': 46.4049, 'c...","{'name': 'Wilson Welch', 'email': 'wilson.welc...",2025-09-23T14:56:07.000Z,95.138595,2025-09-23T16:19:24.524Z,,
5,Pitcher plant,16.386299,"{'latitude': 82.8917, 'longitude': 0.6279, 'ci...","{'name': 'Benny Block', 'email': 'benny.block@...",2025-09-23T13:57:08.000Z,91.945308,2025-09-23T16:19:25.974Z,"{'license': 451, 'license_name': 'CC0 1.0 Univ...",[Sarracenia catesbaei]
6,Wollemi pine,14.877701,"{'latitude': -40.3521, 'longitude': -12.1055, ...","{'name': 'Iris Jenkins', 'email': 'iris.jenkin...",2025-09-23T13:03:31.000Z,88.825326,2025-09-23T16:19:26.757Z,"{'license': 451, 'license_name': 'CC0 1.0 Univ...",[Wollemia nobilis]
8,Bird of paradise,15.596134,"{'latitude': 54.1635, 'longitude': 8.6662, 'ci...","{'name': 'Bradford Mitchell DVM', 'email': 'br...",2025-09-23T13:33:20.000Z,90.872321,2025-09-23T16:19:27.425Z,"{'license': 451, 'license_name': 'CC0 1.0 Univ...",[Heliconia schiedeana 'Fire and Ice']
9,Cactus,12.018504,"{'latitude': 22.1228, 'longitude': -11.0358, '...","{'name': 'Jo Baumbach', 'email': 'jo.baumbach@...",2025-09-23T14:19:58.000Z,93.064892,2025-09-23T16:19:28.902Z,"{'license': 451, 'license_name': 'CC0 1.0 Univ...",[Pereskia grandifolia]
10,Dragon tree,13.42834,"{'latitude': -85.7462, 'longitude': 178.9976, ...","{'name': 'Terrance Leuschke', 'email': 'terran...",2025-09-23T14:15:42.000Z,93.045346,2025-09-23T16:19:29.571Z,,
11,Asclepias Curassavica,15.886254,"{'latitude': 89.0252, 'longitude': -108.526, '...","{'name': 'Chester Smith', 'email': 'chester.sm...",2025-09-23T13:32:16.000Z,90.533085,2025-09-23T16:19:30.930Z,"{'license': 4, 'license_name': 'Attribution Li...",[Asclepias curassavica]


In [237]:
def add_columns(df):
    "Some of these columns was stored in a dict under one column extrapolated them out"
    df['lat'] = df['origin_location'].apply(lambda x: x['latitude'])
    df['long'] = df['origin_location'].apply(lambda x: x['longitude'])
    df['city'] = df['origin_location'].apply(lambda x: x['city'])
    df['country'] = df['origin_location'].apply(lambda x: x['country'])
    df['botanist_name'] = df['botanist'].apply(lambda x: x['name'])
    df['email'] = df['botanist'].apply(lambda x: x['email'])
    df['phone'] = df['botanist'].apply(lambda x: x['phone'])
    return df

In [238]:
def drop_columns(df):
    "remove unnecessary columns"
    df = df.drop(columns=['origin_location', 'botanist', 'images'])
    return df

In [239]:
def change_type_to_date(df):
    "changed types of 2 columns to datetime"
    df['last_watered'] = pd.to_datetime(df['last_watered'])
    df['recording_taken'] = pd.to_datetime(df['recording_taken'])
    return df

In [240]:
plants_df = add_columns(plants_df)
plants_df = drop_columns(plants_df)
plants_df = change_type_to_date(plants_df)
plants_df

plant_id,name,temperature,last_watered,soil_moisture,recording_taken,scientific_name,lat,long,city,country,botanist_name,email,phone
1,Venus flytrap,13.701703,2025-09-23 13:51:41+00:00,91.470924,2025-09-23 16:19:22.710000+00:00,,43.74,-11.5098,Stammside,Albania,Kenneth Buckridge,kenneth.buckridge@lnhm.co.uk,763.914.8635 x57724
2,Corpse flower,13.734596,2025-09-23 14:58:34+00:00,95.193972,2025-09-23 16:19:23.282000+00:00,,47.8428,-48.7087,Floshire,American Samoa,Ms. Diana King,ms..diana.king@lnhm.co.uk,673.641.8851
3,Rafflesia arnoldii,15.435406,2025-09-23 13:58:19+00:00,91.684596,2025-09-23 16:19:24.081000+00:00,,-25.4878,-36.1349,Dale City,Mozambique,Eduardo Okuneva II,eduardo.okuneva.ii@lnhm.co.uk,408-816-2276 x87051
4,Black bat flower,16.003995,2025-09-23 14:56:07+00:00,95.138595,2025-09-23 16:19:24.524000+00:00,,63.3661,46.4049,West Tedboro,Taiwan,Wilson Welch,wilson.welch@lnhm.co.uk,(953) 607-4239 x328
5,Pitcher plant,16.386299,2025-09-23 13:57:08+00:00,91.945308,2025-09-23 16:19:25.974000+00:00,[Sarracenia catesbaei],82.8917,0.6279,North Felicia,Saint Kitts and Nevis,Benny Block,benny.block@lnhm.co.uk,687-647-1094
6,Wollemi pine,14.877701,2025-09-23 13:03:31+00:00,88.825326,2025-09-23 16:19:26.757000+00:00,[Wollemia nobilis],-40.3521,-12.1055,Ferryfort,Mauritius,Iris Jenkins,iris.jenkins@lnhm.co.uk,288.875.3012 x4682
8,Bird of paradise,15.596134,2025-09-23 13:33:20+00:00,90.872321,2025-09-23 16:19:27.425000+00:00,[Heliconia schiedeana 'Fire and Ice'],54.1635,8.6662,Edwardfurt,Liberia,Bradford Mitchell DVM,bradford.mitchell.dvm@lnhm.co.uk,(230) 859-2277 x3537
9,Cactus,12.018504,2025-09-23 14:19:58+00:00,93.064892,2025-09-23 16:19:28.902000+00:00,[Pereskia grandifolia],22.1228,-11.0358,Port Johan,Cayman Islands,Jo Baumbach,jo.baumbach@lnhm.co.uk,976-364-3090
10,Dragon tree,13.42834,2025-09-23 14:15:42+00:00,93.045346,2025-09-23 16:19:29.571000+00:00,,-85.7462,178.9976,North Adriel,Nicaragua,Terrance Leuschke,terrance.leuschke@lnhm.co.uk,1-661-425-6823 x4455
11,Asclepias Curassavica,15.886254,2025-09-23 13:32:16+00:00,90.533085,2025-09-23 16:19:30.930000+00:00,[Asclepias curassavica],89.0252,-108.526,Dorianland,Mali,Chester Smith,chester.smith@lnhm.co.uk,1-730-711-3377 x08275


In [241]:
plants_df['phone'] = plants_df['phone'].str.replace(
    r'(\(|\))', '', regex=True).replace(r'x(.*)', '', regex=True).replace(r'^1-', '', regex=True)
plants_df['phone'] = plants_df['phone'].str.rstrip(' ').str.replace('.', '-').str.replace(' ', '-')
plants_df

plant_id,name,temperature,last_watered,soil_moisture,recording_taken,scientific_name,lat,long,city,country,botanist_name,email,phone
1,Venus flytrap,13.701703,2025-09-23 13:51:41+00:00,91.470924,2025-09-23 16:19:22.710000+00:00,,43.74,-11.5098,Stammside,Albania,Kenneth Buckridge,kenneth.buckridge@lnhm.co.uk,763-914-8635
2,Corpse flower,13.734596,2025-09-23 14:58:34+00:00,95.193972,2025-09-23 16:19:23.282000+00:00,,47.8428,-48.7087,Floshire,American Samoa,Ms. Diana King,ms..diana.king@lnhm.co.uk,673-641-8851
3,Rafflesia arnoldii,15.435406,2025-09-23 13:58:19+00:00,91.684596,2025-09-23 16:19:24.081000+00:00,,-25.4878,-36.1349,Dale City,Mozambique,Eduardo Okuneva II,eduardo.okuneva.ii@lnhm.co.uk,408-816-2276
4,Black bat flower,16.003995,2025-09-23 14:56:07+00:00,95.138595,2025-09-23 16:19:24.524000+00:00,,63.3661,46.4049,West Tedboro,Taiwan,Wilson Welch,wilson.welch@lnhm.co.uk,953-607-4239
5,Pitcher plant,16.386299,2025-09-23 13:57:08+00:00,91.945308,2025-09-23 16:19:25.974000+00:00,[Sarracenia catesbaei],82.8917,0.6279,North Felicia,Saint Kitts and Nevis,Benny Block,benny.block@lnhm.co.uk,687-647-1094
6,Wollemi pine,14.877701,2025-09-23 13:03:31+00:00,88.825326,2025-09-23 16:19:26.757000+00:00,[Wollemia nobilis],-40.3521,-12.1055,Ferryfort,Mauritius,Iris Jenkins,iris.jenkins@lnhm.co.uk,288-875-3012
8,Bird of paradise,15.596134,2025-09-23 13:33:20+00:00,90.872321,2025-09-23 16:19:27.425000+00:00,[Heliconia schiedeana 'Fire and Ice'],54.1635,8.6662,Edwardfurt,Liberia,Bradford Mitchell DVM,bradford.mitchell.dvm@lnhm.co.uk,230-859-2277
9,Cactus,12.018504,2025-09-23 14:19:58+00:00,93.064892,2025-09-23 16:19:28.902000+00:00,[Pereskia grandifolia],22.1228,-11.0358,Port Johan,Cayman Islands,Jo Baumbach,jo.baumbach@lnhm.co.uk,976-364-3090
10,Dragon tree,13.42834,2025-09-23 14:15:42+00:00,93.045346,2025-09-23 16:19:29.571000+00:00,,-85.7462,178.9976,North Adriel,Nicaragua,Terrance Leuschke,terrance.leuschke@lnhm.co.uk,661-425-6823
11,Asclepias Curassavica,15.886254,2025-09-23 13:32:16+00:00,90.533085,2025-09-23 16:19:30.930000+00:00,[Asclepias curassavica],89.0252,-108.526,Dorianland,Mali,Chester Smith,chester.smith@lnhm.co.uk,730-711-3377


In [242]:
plants_df['scientific_name'] = plants_df['scientific_name'].apply(lambda x: str(x) if pd.notna(x) else x)
plants_df['scientific_name'] = plants_df['scientific_name'].str.replace(r'\[|\]', '', regex=True)
plants_df

plant_id,name,temperature,last_watered,soil_moisture,recording_taken,scientific_name,lat,long,city,country,botanist_name,email,phone
1,Venus flytrap,13.701703,2025-09-23 13:51:41+00:00,91.470924,2025-09-23 16:19:22.710000+00:00,,43.74,-11.5098,Stammside,Albania,Kenneth Buckridge,kenneth.buckridge@lnhm.co.uk,763-914-8635
2,Corpse flower,13.734596,2025-09-23 14:58:34+00:00,95.193972,2025-09-23 16:19:23.282000+00:00,,47.8428,-48.7087,Floshire,American Samoa,Ms. Diana King,ms..diana.king@lnhm.co.uk,673-641-8851
3,Rafflesia arnoldii,15.435406,2025-09-23 13:58:19+00:00,91.684596,2025-09-23 16:19:24.081000+00:00,,-25.4878,-36.1349,Dale City,Mozambique,Eduardo Okuneva II,eduardo.okuneva.ii@lnhm.co.uk,408-816-2276
4,Black bat flower,16.003995,2025-09-23 14:56:07+00:00,95.138595,2025-09-23 16:19:24.524000+00:00,,63.3661,46.4049,West Tedboro,Taiwan,Wilson Welch,wilson.welch@lnhm.co.uk,953-607-4239
5,Pitcher plant,16.386299,2025-09-23 13:57:08+00:00,91.945308,2025-09-23 16:19:25.974000+00:00,'Sarracenia catesbaei',82.8917,0.6279,North Felicia,Saint Kitts and Nevis,Benny Block,benny.block@lnhm.co.uk,687-647-1094
6,Wollemi pine,14.877701,2025-09-23 13:03:31+00:00,88.825326,2025-09-23 16:19:26.757000+00:00,'Wollemia nobilis',-40.3521,-12.1055,Ferryfort,Mauritius,Iris Jenkins,iris.jenkins@lnhm.co.uk,288-875-3012
8,Bird of paradise,15.596134,2025-09-23 13:33:20+00:00,90.872321,2025-09-23 16:19:27.425000+00:00,"""Heliconia schiedeana 'Fire and Ice'""",54.1635,8.6662,Edwardfurt,Liberia,Bradford Mitchell DVM,bradford.mitchell.dvm@lnhm.co.uk,230-859-2277
9,Cactus,12.018504,2025-09-23 14:19:58+00:00,93.064892,2025-09-23 16:19:28.902000+00:00,'Pereskia grandifolia',22.1228,-11.0358,Port Johan,Cayman Islands,Jo Baumbach,jo.baumbach@lnhm.co.uk,976-364-3090
10,Dragon tree,13.42834,2025-09-23 14:15:42+00:00,93.045346,2025-09-23 16:19:29.571000+00:00,,-85.7462,178.9976,North Adriel,Nicaragua,Terrance Leuschke,terrance.leuschke@lnhm.co.uk,661-425-6823
11,Asclepias Curassavica,15.886254,2025-09-23 13:32:16+00:00,90.533085,2025-09-23 16:19:30.930000+00:00,'Asclepias curassavica',89.0252,-108.526,Dorianland,Mali,Chester Smith,chester.smith@lnhm.co.uk,730-711-3377


______________________
#### Emails

- Cannot have special characters right before the '@'
- Cannot start with a special character
- Cannot have two special characters in a row

In [243]:
import re
plants_df['email'] = plants_df['email'].str.replace(
    r'^[\W_]+', '', regex=True).replace(
    r'[\W_]+@', '@', regex=True)

plants_df['email'] = plants_df['email'].apply(lambda x: re.sub(r'([\W_])\1+', r'\1', x))
plants_df

plant_id,name,temperature,last_watered,soil_moisture,recording_taken,scientific_name,lat,long,city,country,botanist_name,email,phone
1,Venus flytrap,13.701703,2025-09-23 13:51:41+00:00,91.470924,2025-09-23 16:19:22.710000+00:00,,43.74,-11.5098,Stammside,Albania,Kenneth Buckridge,kenneth.buckridge@lnhm.co.uk,763-914-8635
2,Corpse flower,13.734596,2025-09-23 14:58:34+00:00,95.193972,2025-09-23 16:19:23.282000+00:00,,47.8428,-48.7087,Floshire,American Samoa,Ms. Diana King,ms.diana.king@lnhm.co.uk,673-641-8851
3,Rafflesia arnoldii,15.435406,2025-09-23 13:58:19+00:00,91.684596,2025-09-23 16:19:24.081000+00:00,,-25.4878,-36.1349,Dale City,Mozambique,Eduardo Okuneva II,eduardo.okuneva.ii@lnhm.co.uk,408-816-2276
4,Black bat flower,16.003995,2025-09-23 14:56:07+00:00,95.138595,2025-09-23 16:19:24.524000+00:00,,63.3661,46.4049,West Tedboro,Taiwan,Wilson Welch,wilson.welch@lnhm.co.uk,953-607-4239
5,Pitcher plant,16.386299,2025-09-23 13:57:08+00:00,91.945308,2025-09-23 16:19:25.974000+00:00,'Sarracenia catesbaei',82.8917,0.6279,North Felicia,Saint Kitts and Nevis,Benny Block,benny.block@lnhm.co.uk,687-647-1094
6,Wollemi pine,14.877701,2025-09-23 13:03:31+00:00,88.825326,2025-09-23 16:19:26.757000+00:00,'Wollemia nobilis',-40.3521,-12.1055,Ferryfort,Mauritius,Iris Jenkins,iris.jenkins@lnhm.co.uk,288-875-3012
8,Bird of paradise,15.596134,2025-09-23 13:33:20+00:00,90.872321,2025-09-23 16:19:27.425000+00:00,"""Heliconia schiedeana 'Fire and Ice'""",54.1635,8.6662,Edwardfurt,Liberia,Bradford Mitchell DVM,bradford.mitchell.dvm@lnhm.co.uk,230-859-2277
9,Cactus,12.018504,2025-09-23 14:19:58+00:00,93.064892,2025-09-23 16:19:28.902000+00:00,'Pereskia grandifolia',22.1228,-11.0358,Port Johan,Cayman Islands,Jo Baumbach,jo.baumbach@lnhm.co.uk,976-364-3090
10,Dragon tree,13.42834,2025-09-23 14:15:42+00:00,93.045346,2025-09-23 16:19:29.571000+00:00,,-85.7462,178.9976,North Adriel,Nicaragua,Terrance Leuschke,terrance.leuschke@lnhm.co.uk,661-425-6823
11,Asclepias Curassavica,15.886254,2025-09-23 13:32:16+00:00,90.533085,2025-09-23 16:19:30.930000+00:00,'Asclepias curassavica',89.0252,-108.526,Dorianland,Mali,Chester Smith,chester.smith@lnhm.co.uk,730-711-3377
