### Date & Time Handling

In [2]:
import pandas as pd
import numpy as np

# Create synthetic dataset with string datetime values
np.random.seed(42)
dates = pd.date_range(start='2023-01-01', periods=10, freq='D').astype(str)  # 10 days
random_dates = np.random.choice(dates, size=15)  # 15 records with some repeats

# Create DataFrame with string datetime column and a feature
df = pd.DataFrame({
    'accident_id': range(1, 16),
    'date_str': random_dates,
    'severity': np.random.randint(1, 5, size=15)  # severity levels 1 to 4
})

# Show initial dataset
print("Initial dataset with date column as string:")
print(df.head())

Initial dataset with date column as string:
   accident_id    date_str  severity
0            1  2023-01-07         1
1            2  2023-01-04         2
2            3  2023-01-08         4
3            4  2023-01-05         4
4            5  2023-01-07         2


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   accident_id  15 non-null     int64 
 1   date_str     15 non-null     object
 2   severity     15 non-null     int64 
dtypes: int64(2), object(1)
memory usage: 492.0+ bytes


In [5]:
# Convert string column to datetime type
df['date'] = pd.to_datetime(df['date_str'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   accident_id  15 non-null     int64         
 1   date_str     15 non-null     object        
 2   severity     15 non-null     int64         
 3   date         15 non-null     datetime64[ns]
dtypes: datetime64[ns](1), int64(2), object(1)
memory usage: 612.0+ bytes


In [6]:
# Extract datetime features
df['Year'] = df['date'].dt.year
df['Month'] = df['date'].dt.month
df['Day'] = df['date'].dt.day
df

Unnamed: 0,accident_id,date_str,severity,date,Year,Month,Day
0,1,2023-01-07,1,2023-01-07,2023,1,7
1,2,2023-01-04,2,2023-01-04,2023,1,4
2,3,2023-01-08,4,2023-01-08,2023,1,8
3,4,2023-01-05,4,2023-01-05,2023,1,5
4,5,2023-01-07,2,2023-01-07,2023,1,7
5,6,2023-01-10,2,2023-01-10,2023,1,10
6,7,2023-01-03,2,2023-01-03,2023,1,3
7,8,2023-01-07,4,2023-01-07,2023,1,7
8,9,2023-01-08,4,2023-01-08,2023,1,8
9,10,2023-01-05,1,2023-01-05,2023,1,5


In [7]:
# Handling missing datetime values: introduce NaT (missing) for demonstration
nan_indices = np.random.choice(df.index, size=2, replace=False)
df.loc[nan_indices, 'date'] = pd.NaT

In [8]:
print("\nDataset with missing datetime values (NaT)")
print(df.loc[nan_indices])


Dataset with missing datetime values (NaT)
   accident_id    date_str  severity date  Year  Month  Day
1            2  2023-01-04         2  NaT  2023      1    4
5            6  2023-01-10         2  NaT  2023      1   10


In [10]:
# Fill missig datetime values using forward fill method
df['date'].fillna(method='ffill', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['date'].fillna(method='ffill', inplace=True)
  df['date'].fillna(method='ffill', inplace=True)


In [11]:
print("\nDataset after filling missing datetime values (forward fill):")
print(df.loc[nan_indices])


Dataset after filling missing datetime values (forward fill):
   accident_id    date_str  severity       date  Year  Month  Day
1            2  2023-01-04         2 2023-01-07  2023      1    4
5            6  2023-01-10         2 2023-01-07  2023      1   10


In [12]:
# Drop rows with missing datetime values
df_dropped = df.dropna(subset=['date'])

# Filter rows where date is after a specific datetime
filtered_df = df[df['date'] > pd.Timestamp('2023-01-05')]

print("\nRows after dropping missing datetime:")
print(df_dropped.head())

print("\nRows with date after 2023-01-05:")
print(filtered_df.head())


Rows after dropping missing datetime:
   accident_id    date_str  severity       date  Year  Month  Day
0            1  2023-01-07         1 2023-01-07  2023      1    7
1            2  2023-01-04         2 2023-01-07  2023      1    4
2            3  2023-01-08         4 2023-01-08  2023      1    8
3            4  2023-01-05         4 2023-01-05  2023      1    5
4            5  2023-01-07         2 2023-01-07  2023      1    7

Rows with date after 2023-01-05:
   accident_id    date_str  severity       date  Year  Month  Day
0            1  2023-01-07         1 2023-01-07  2023      1    7
1            2  2023-01-04         2 2023-01-07  2023      1    4
2            3  2023-01-08         4 2023-01-08  2023      1    8
4            5  2023-01-07         2 2023-01-07  2023      1    7
5            6  2023-01-10         2 2023-01-07  2023      1   10
