Import the data and get the info for all of the variable data types (dtype). One the first things to always look for is to make sure all of the dates/times are in the same format.

In [102]:
# Import Data

import pandas as pd
import numpy as np
from datetime import datetime, timezone

maritime_df = pd.read_csv('C:/Users/NicholasThompson/pythonProjects/dataCleaning/threeBucket/mt_natuna_20220919T135650_maritimeTraffic_sample.csv')

spire_df = pd.read_csv('C:/Users/NicholasThompson/pythonProjects/dataCleaning/threeBucket/bquxjob_474ef954_1837a61a5a3_spire-sample.csv')

orbCommStream_df = pd.read_csv('C:/Users/NicholasThompson/pythonProjects/dataCleaning/threeBucket/bquxjob_42451d86_1837a6b583a_orbComm-stream_sample.csv')


In [103]:
maritime_df.TIMESTAMP.info()

<class 'pandas.core.series.Series'>
RangeIndex: 65 entries, 0 to 64
Series name: TIMESTAMP
Non-Null Count  Dtype 
--------------  ----- 
65 non-null     object
dtypes: object(1)
memory usage: 648.0+ bytes


In [104]:
Mtimestamp_array = maritime_df[["TIMESTAMP"]]
Mtimestamp_array.sample(5)

Unnamed: 0,TIMESTAMP
61,9/19/2022 13:44
63,9/19/2022 13:45
6,9/19/2022 13:40
59,9/19/2022 13:36
19,9/19/2022 13:01


In [105]:
spire_df.MovementDateTime.info()

<class 'pandas.core.series.Series'>
RangeIndex: 65 entries, 0 to 64
Series name: MovementDateTime
Non-Null Count  Dtype 
--------------  ----- 
65 non-null     object
dtypes: object(1)
memory usage: 648.0+ bytes


In [106]:
Stimestamp_array = spire_df[["MovementDateTime"]]
Stimestamp_array.sample(5)

Unnamed: 0,MovementDateTime
2,2021-07-28T08:57:05Z
34,2021-07-24T20:32:41Z
15,2021-07-19T19:58:28Z
55,2021-07-30T23:58:27Z
26,2021-07-27T21:51:36Z


In [107]:
orbCommStream_df.datetime_UTC.info()

<class 'pandas.core.series.Series'>
RangeIndex: 65 entries, 0 to 64
Series name: datetime_UTC
Non-Null Count  Dtype 
--------------  ----- 
65 non-null     object
dtypes: object(1)
memory usage: 648.0+ bytes


In [108]:
Otimestamp_array = orbCommStream_df[["datetime_UTC"]]
Otimestamp_array.sample(5)

Unnamed: 0,datetime_UTC
20,2022-09-26T15:24:36Z
51,2022-09-26T15:24:44Z
62,2022-09-26T15:24:44Z
37,2022-09-26T15:24:40Z
4,2022-09-26T15:24:00Z


Fix the date data types to be `datetime` from pandas

In [113]:
# Inspect maritime time in detail
maritime_df["TIMESTAMP"] = datetime.now(tz=timezone.utc)
maritime_df["TIMESTAMP"] = pd.to_datetime(maritime_df["TIMESTAMP"])
maritime_df["TIMESTAMP"] = maritime_df["TIMESTAMP"].replace(tzinfo=None)
maritime_df.TIMESTAMP.info() # Note: dtype is datetime64[ns]
print("\n")
maritime_df.TIMESTAMP.sample(5) 

TypeError: replace() got an unexpected keyword argument 'tzinfo'

In [110]:
# Inspect spire time in detail
spire_df["MovementDateTime"] = datetime.now(tz=timezone.utc)
print(spire_df['MovementDateTime'])
spire_df.info() # Note: dtype is datetime64[ns, UTC]
print("\n")
spire_df.sample(5)
print("\n")

0    2022-09-26 20:50:59.348475+00:00
1    2022-09-26 20:50:59.348475+00:00
2    2022-09-26 20:50:59.348475+00:00
3    2022-09-26 20:50:59.348475+00:00
4    2022-09-26 20:50:59.348475+00:00
                   ...               
60   2022-09-26 20:50:59.348475+00:00
61   2022-09-26 20:50:59.348475+00:00
62   2022-09-26 20:50:59.348475+00:00
63   2022-09-26 20:50:59.348475+00:00
64   2022-09-26 20:50:59.348475+00:00
Name: MovementDateTime, Length: 65, dtype: datetime64[ns, UTC]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65 entries, 0 to 64
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype              
---  ------            --------------  -----              
 0   MMSI              65 non-null     int64              
 1   Latitude          65 non-null     float64            
 2   Longitude         65 non-null     float64            
 3   Speed             65 non-null     float64            
 4   Heading           65 non-null     float64            
 5  

In [None]:
# Remove time zone
spire_df['MovementDateTime'].dt.tz_convert(None) # Remove time zone.
spire_df.MovementDateTime.info()
print("\n")
spire_df.MovementDateTime.sample(5)


Subset the data, make col-names identical, inspect.

In [14]:
# Subset Data

maritimeDF_subset = maritime_df[["MMSI","LAT","LON","SPEED","HEADING","TIMESTAMP"]] # subset columns
maritimeDF_subset.columns = ["mmsi", "latitude", "longitude", "speed", "heading", "timestamp"] # rename columns
for col in maritimeDF_subset.columns:
    print(col)
  
print("\n")

spireDF_subset = spire_df[["MMSI","Latitude","Longitude","Speed","Heading","MovementDateTime"]] # subset columns
spireDF_subset.columns = ["mmsi", "latitude", "longitude", "speed", "heading", "timestamp"] # rename columns
for col in spireDF_subset.columns:
    print(col)

print("\n")

orbCommStreamDF_subset = orbCommStream_df[["mmsi","lat","lon","speed","heading","datetime_UTC"]] # subset columns
orbCommStreamDF_subset.columns = ["mmsi", "latitude", "longitude", "speed", "heading", "timestamp"] # rename columns
for col in orbCommStreamDF_subset.columns:
   print(col)


mmsi
latitude
longitude
speed
heading
timestamp


mmsi
latitude
longitude
speed
heading
timestamp


mmsi
latitude
longitude
speed
heading
timestamp


Create a new column named `dataSource` that will capture from which the dataframe originated. Inspect the data columns to 

In [19]:
# Add dataSource column for maritimeDF_subset

maritimeDF_subset['dataSource'] = 'maritimeTraffic'
maritimeDF_subset.sample(10)








A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  maritimeDF_subset['dataSource'] = 'maritimeTraffic'


Unnamed: 0,mmsi,latitude,longitude,speed,heading,timestamp,dataSource
41,538008815,1.445,105.9,110,511,9/19/2022 13:45,maritimeTraffic
28,255805849,3.68,105.625,150,511,9/19/2022 13:41,maritimeTraffic
7,374907000,2.59,107.3933,120,511,9/19/2022 13:43,maritimeTraffic
15,636020999,3.18,105.2983,120,511,9/19/2022 13:36,maritimeTraffic
22,538008831,1.16,105.41,110,511,9/19/2022 13:44,maritimeTraffic
42,412440804,2.763333,105.0617,10,511,9/19/2022 13:44,maritimeTraffic
43,210430000,3.799688,107.1642,111,190,9/19/2022 13:01,maritimeTraffic
18,506127000,1.014167,106.0377,121,295,9/19/2022 12:59,maritimeTraffic
50,533130192,3.461667,105.585,70,511,9/19/2022 13:38,maritimeTraffic
13,477139900,3.778333,105.6867,170,511,9/19/2022 13:45,maritimeTraffic


In [20]:
# Add dataSource column for spireDF_subset

spireDF_subset['dataSource'] = 'spire'
spireDF_subset.sample(10)

Unnamed: 0,mmsi,latitude,longitude,speed,heading,timestamp,dataSource
32,239806300,37.804482,23.775738,0.1,290.6,2021-07-30T10:57:46Z,spire
51,2512003,64.063698,-21.96659,0.0,346.5,2021-07-19T19:57:57Z,spire
7,617008000,14.887717,-24.678392,0.4,0.0,2021-07-23T23:42:14Z,spire
42,563031390,1.289328,103.973625,0.1,0.0,2021-07-21T01:59:32Z,spire
35,309856000,-22.181513,-39.953247,0.1,272.1,2021-07-31T11:45:34Z,spire
34,414352760,32.717303,121.810693,0.0,270.0,2021-07-24T20:32:41Z,spire
13,355417000,28.451667,33.055,0.0,0.0,2021-07-25T06:37:05Z,spire
21,636019374,-24.57009,-42.24894,0.2,30.0,2021-07-15T23:58:38Z,spire
60,563023930,1.269758,103.863597,0.1,303.9,2021-07-29T04:43:38Z,spire
19,412320710,38.062007,118.968327,0.0,117.8,2021-07-27T10:58:07Z,spire


In [21]:
# Add dataSource column for orbCommStreamDF_subset

orbCommStreamDF_subset['dataSource'] = 'orbComm'
orbCommStreamDF_subset.sample(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  orbCommStreamDF_subset['dataSource'] = 'orbComm'


Unnamed: 0,mmsi,latitude,longitude,speed,heading,timestamp,dataSource
11,563057600,-9.216833,-146.453338,10.1,298,2022-09-26T15:05:08Z,orbComm
45,636022074,7.052477,79.766517,0.2,252,2022-09-26T15:24:40Z,orbComm
39,431005858,34.209977,135.143712,0.0,245,2022-09-26T15:24:40Z,orbComm
44,413835601,32.130205,119.555357,0.1,175,2022-09-26T15:24:40Z,orbComm
32,431001046,37.2401,138.14545,14.5,120,2022-09-26T15:24:36Z,orbComm
16,512005119,-36.843537,174.770093,0.0,168,2022-09-26T15:05:16Z,orbComm
9,413772502,23.084855,113.3966,0.0,355,2022-09-26T15:24:00Z,orbComm
34,373393000,36.223137,142.091033,11.2,226,2022-09-26T15:24:36Z,orbComm
55,413267350,30.007455,122.095202,0.2,215,2022-09-26T15:24:44Z,orbComm
53,232032840,-18.382863,146.54336,17.8,330,2022-09-26T15:24:44Z,orbComm
