## Import libraries

In [1]:
import pandas as pd
from sqlalchemy import create_engine
import numpy as np

## Extract data from SQL Server

In [17]:
# Extract the source data from the Bronze layer in the data warehouse

# Connection settings: change these two values to point at another server/database.
server = 'mohamedibrahim'
database = 'railway_dwh'

# Build the SQLAlchemy URL from the settings above so they are the single source of truth.
# Trusted_Connection=yes is passed to the ODBC driver; no username/password is stored here.
connection_string = (
    f'mssql+pyodbc://{server}/{database}'
    '?driver=ODBC+Driver+17+for+SQL+Server'
    '&Trusted_Connection=yes'
)
engine = create_engine(connection_string)

# Read the whole bronze.railway table into a DataFrame
query = "SELECT * FROM bronze.railway"
df = pd.read_sql(query, engine)

## Data Overview

In [18]:
# Preview the first rows of the extracted frame (head() defaults to 5 rows)
df.head()

Unnamed: 0,TransactionID,Date_of_Purchase,Time_of_Purchase,Purchase_Type,Payment_Method,Railcard,Ticket_Class,Ticket_Type,Price,Departure_Station,Arrival_Station,Date_of_Journey,Departure_Time,Arrival_Time,Actual_Arrival_Time,Journey_Status,Reason_for_Delay,Refund_Request
0,da8a6ba8-b3dc-4677-b176,2023-12-08,12:41:11,Online,Contactless,Adult,Standard,Advance,43,London Paddington,Liverpool Lime Street,2024-01-01,11:00:00,13:30:00,13:30:00,On Time,,No
1,b0cdd1b0-f214-4197-be53,2023-12-16,11:23:01,Station,Credit Card,Adult,Standard,Advance,23,London Kings Cross,York,2024-01-01,09:45:00,11:35:00,11:40:00,Delayed,Signal Failure,No
2,f3ba7a96-f713-40d9-9629,2023-12-19,19:51:27,Online,Credit Card,,Standard,Advance,3,Liverpool Lime Street,Manchester Piccadilly,2024-01-02,18:15:00,18:45:00,18:45:00,On Time,,No
3,b2471f11-4fe7-4c87-8ab4,2023-12-20,23:00:36,Station,Credit Card,,Standard,Advance,13,London Paddington,Reading,2024-01-01,21:30:00,22:30:00,22:30:00,On Time,,No
4,2be00b45-0762-485e-a7a3,2023-12-27,18:22:56,Online,Contactless,,Standard,Advance,76,Liverpool Lime Street,London Euston,2024-01-01,16:45:00,19:00:00,19:00:00,On Time,,No


In [19]:
# Print the column list with non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31653 entries, 0 to 31652
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   TransactionID        31653 non-null  object
 1   Date_of_Purchase     31653 non-null  object
 2   Time_of_Purchase     31653 non-null  object
 3   Purchase_Type        31653 non-null  object
 4   Payment_Method       31653 non-null  object
 5   Railcard             31653 non-null  object
 6   Ticket_Class         31653 non-null  object
 7   Ticket_Type          31653 non-null  object
 8   Price                31653 non-null  int64 
 9   Departure_Station    31653 non-null  object
 10  Arrival_Station      31653 non-null  object
 11  Date_of_Journey      31653 non-null  object
 12  Departure_Time       31653 non-null  object
 13  Arrival_Time         31653 non-null  object
 14  Actual_Arrival_Time  29773 non-null  object
 15  Journey_Status       31653 non-null  object
 16  Reas

In [47]:
# Count missing values per column
# NOTE(review): the stored output shows snake_case column names, so this cell was last
# executed after the renaming cells further down; re-run top to bottom to refresh it.
df.isnull().sum()

transaction_id            0
date_of_purchase          0
time_of_purchase          0
purchase_type             0
payment_method            0
railcard                  0
ticket_class              0
ticket_type               0
price                     0
departure_station         0
arrival_station           0
date_of_journey           0
departure_time            0
arrival_time              0
actual_arrival_time    1880
journey_status            0
reason_for_delay          0
refund_request            0
dtype: int64

In [21]:
# Count rows that are exact duplicates of an earlier row (all columns compared)
df.duplicated().sum()

0

In [22]:
# Scan every object-typed column for stray whitespace: leading spaces, trailing spaces,
# or two consecutive spaces anywhere in the value. One status line is printed per column.
for col in df.select_dtypes(include='object').columns:
    # Nulls are skipped; every remaining value is compared in its string form
    texts = (str(raw) for raw in df[col].dropna())

    # any() stops at the first offending value, like an early break would
    column_is_dirty = any(text != text.strip() or '  ' in text for text in texts)

    if column_is_dirty:
        print(f"Unwanted spaces found in column: {col}")
    else:
        print("It's all good :)")

It's all good :)
It's all good :)
It's all good :)
It's all good :)
It's all good :)
It's all good :)
It's all good :)
It's all good :)
It's all good :)
It's all good :)
It's all good :)
It's all good :)
It's all good :)
It's all good :)
It's all good :)
It's all good :)
It's all good :)


## Naming Convention

### This section outlines the naming conventions used for tables, columns and other objects in the data warehouse.

#### Naming Conventions: Use snake_case, with lowercase letters and underscores (_) to separate words. This applies to all tables, columns and other objects; column values are the exception and stay as they are.

In [23]:
# Lower-case every column label as the first step towards snake_case names
df.columns = df.columns.map(str.lower)

In [24]:
# 'transactionid' has no word separator after lower-casing, so add the underscore explicitly
df = df.rename({'transactionid': 'transaction_id'}, axis='columns')

## Data Inspection & Cleaning

#### Column "transaction_id"

In [25]:
# True when no transaction_id value occurs more than once
df['transaction_id'].is_unique

True

#### Column "date_of_purchase"

In [26]:
# date_of_purchase
# Shape check only: list values whose text form is not 4 digits-2 digits-2 digits
# (it does not verify that the digits form a real calendar date).
# NOTE(review): `re` is not referenced in this cell; the import is kept in case other cells rely on it.
import re

pattern = r'^\d{4}-\d{2}-\d{2}$'
purchase_dates_as_text = df['date_of_purchase'].astype(str)
has_expected_shape = purchase_dates_as_text.str.match(pattern)
anomalies = df.loc[~has_expected_shape, 'date_of_purchase']
print(anomalies)

Series([], Name: date_of_purchase, dtype: object)


#### Column "time_of_purchase"

In [27]:
# Shape check only: list time_of_purchase values whose text form is not
# 2 digits:2 digits:2 digits
pattern = r'^\d{2}:\d{2}:\d{2}$'
purchase_times_as_text = df['time_of_purchase'].astype(str)
has_expected_shape = purchase_times_as_text.str.match(pattern)
anomalies = df.loc[~has_expected_shape, 'time_of_purchase']
print(anomalies)

Series([], Name: time_of_purchase, dtype: object)


#### Column "purchase_type"

In [28]:
# List the distinct purchase_type values to spot unexpected or misspelled categories
df['purchase_type'].unique()

array(['Online', 'Station'], dtype=object)

#### Column "payment_method"

In [29]:
# List the distinct payment_method values to spot unexpected or misspelled categories
df['payment_method'].unique()

array(['Contactless', 'Credit Card', 'Debit Card'], dtype=object)

#### Column "railcard"

In [30]:
# List the distinct railcard values to spot unexpected or misspelled categories
df['railcard'].unique()

array(['Adult', 'None', 'Disabled', 'Senior'], dtype=object)

In [31]:
# Replace missing railcard entries with the literal string 'None' (meaning "no railcard")
# NOTE(review): the stored unique() output already shows the string 'None' and no nulls,
# so on this data the call looks like a safeguard rather than a change; confirm.
df['railcard'] = df['railcard'].fillna('None')

#### Column "ticket_class"

In [32]:
# List the distinct ticket_class values to spot unexpected or misspelled categories
df['ticket_class'].unique()

array(['Standard', 'First Class'], dtype=object)

#### Column "ticket_type"

In [33]:
# List the distinct ticket_type values to spot unexpected or misspelled categories
df['ticket_type'].unique()

array(['Advance', 'Off-Peak', 'Anytime'], dtype=object)

#### Column "price"

In [34]:
# Show the distinct prices in ascending order so out-of-range values (e.g. zero or negative) stand out
np.sort(df['price'].unique())

array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
        27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
        41,  42,  43,  44,  45,  47,  48,  49,  50,  51,  52,  53,  54,
        55,  56,  57,  59,  63,  64,  65,  66,  67,  69,  70,  71,  72,
        73,  74,  76,  77,  78,  80,  82,  84,  86,  88,  89,  90,  93,
        94,  95,  96,  97, 101, 102, 104, 106, 107, 108, 109, 110, 111,
       112, 113, 114, 116, 117, 118, 119, 121, 126, 128, 129, 134, 135,
       143, 144, 146, 151, 154, 157, 158, 162, 168, 171, 176, 178, 180,
       200, 203, 211, 216, 235, 238, 242, 267], dtype=int64)

#### Column "departure_station"

In [35]:
# List the distinct departure_station names to spot spelling variants of the same station
df['departure_station'].unique()

array(['London Paddington', 'London Kings Cross', 'Liverpool Lime Street',
       'London Euston', 'York', 'Manchester Piccadilly',
       'Birmingham New Street', 'London St Pancras', 'Oxford', 'Reading',
       'Edinburgh Waverley', 'Bristol Temple Meads'], dtype=object)

#### Column "arrival_station"

In [36]:
# List the distinct arrival_station names to spot spelling variants of the same station
# NOTE(review): the stored output contains both 'Edinburgh' and 'Edinburgh Waverley';
# confirm whether these are meant to be the same station.
df['arrival_station'].unique()

array(['Liverpool Lime Street', 'York', 'Manchester Piccadilly',
       'Reading', 'London Euston', 'Oxford', 'Durham',
       'London St Pancras', 'Birmingham New Street', 'London Paddington',
       'Bristol Temple Meads', 'Tamworth', 'London Waterloo', 'Sheffield',
       'Wolverhampton', 'Leeds', 'Stafford', 'Doncaster', 'Swindon',
       'Nottingham', 'Peterborough', 'Edinburgh', 'Crewe',
       'London Kings Cross', 'Leicester', 'Nuneaton', 'Didcot',
       'Edinburgh Waverley', 'Coventry', 'Wakefield', 'Cardiff Central',
       'Warrington'], dtype=object)

#### Column "date_of_journey"

In [37]:
# Shape check only: list date_of_journey values whose text form is not
# 4 digits-2 digits-2 digits
pattern = r'^\d{4}-\d{2}-\d{2}$'
journey_dates_as_text = df['date_of_journey'].astype(str)
has_expected_shape = journey_dates_as_text.str.match(pattern)
anomalies = df.loc[~has_expected_shape, 'date_of_journey']
print(anomalies)

Series([], Name: date_of_journey, dtype: object)


#### Column "departure_time"

In [38]:
# Shape check only: list departure_time values whose text form is not
# 2 digits:2 digits:2 digits
pattern = r'^\d{2}:\d{2}:\d{2}$'
departure_times_as_text = df['departure_time'].astype(str)
has_expected_shape = departure_times_as_text.str.match(pattern)
anomalies = df.loc[~has_expected_shape, 'departure_time']
print(anomalies)

Series([], Name: departure_time, dtype: object)


#### Column "arrival_time"

In [39]:
# Shape check only: list arrival_time values whose text form is not
# 2 digits:2 digits:2 digits
pattern = r'^\d{2}:\d{2}:\d{2}$'
arrival_times_as_text = df['arrival_time'].astype(str)
has_expected_shape = arrival_times_as_text.str.match(pattern)
anomalies = df.loc[~has_expected_shape, 'arrival_time']
print(anomalies)

Series([], Name: arrival_time, dtype: object)


#### Column "actual_arrival_time"

In [40]:
# Shape check only: list non-null actual_arrival_time values whose text form is not
# 2 digits:2 digits:2 digits.
pattern = r'^\d{2}:\d{2}:\d{2}$'
actual_arrival = df['actual_arrival_time']

# Nulls are expected in this column, so they are excluded explicitly with notna().
# The exclusion must be applied to the column itself: .str.match() on astype(str) returns
# plain booleans, so calling .notna() on its result is always True and, once negated,
# would hide every anomaly.
is_malformed = actual_arrival.notna() & ~actual_arrival.astype(str).str.match(pattern)
anomalies = df.loc[is_malformed, 'actual_arrival_time']
print(anomalies)

# The nulls in the column "actual_arrival_time" represent the cancelled journeys in the column "journey_status"

Series([], Name: actual_arrival_time, dtype: object)


#### Column "journey_status"

In [41]:
# List the distinct journey_status values to spot unexpected or misspelled categories
df['journey_status'].unique()

array(['On Time', 'Delayed', 'Cancelled'], dtype=object)

#### Column "reason_for_delay"

In [42]:
# List the distinct reason_for_delay values; variants of the same reason
# (different casing or wording) show up here and are unified in the next step
df['reason_for_delay'].unique()

array([None, 'Signal Failure', 'Technical Issue', 'Weather Conditions',
       'Weather', 'Staffing', 'Staff Shortage', 'Signal failure',
       'Traffic'], dtype=object)

In [43]:
# Collapse casing/wording variants of the same delay reason into one canonical label.
# Values not listed here (including nulls) are left untouched.
delay_reason_aliases = {
    'Signal failure': 'Signal Failure',
    'Weather': 'Weather Conditions',
    'Staffing': 'Staffing Issues',
    'Staff Shortage': 'Staffing Issues',
}
df['reason_for_delay'] = df['reason_for_delay'].replace(delay_reason_aliases)

In [44]:
# Label rows that carry no delay reason as 'No Delay' so the column has no nulls
# NOTE(review): a Cancelled or Delayed journey with a missing reason would also become
# 'No Delay'; confirm no such rows exist before relying on this label.
df['reason_for_delay'] = df['reason_for_delay'].fillna('No Delay')

#### Column "refund_request"

In [45]:
# List the distinct refund_request values to spot unexpected or misspelled categories
df['refund_request'].unique()

array(['No', 'Yes'], dtype=object)

## Load Transformed Data into SQL Server

In [48]:
# Load the transformed "railway" data into the silver layer in the data warehouse

# Connection settings: change these two values to point at another server/database.
server = 'mohamedibrahim'
database = 'railway_dwh'

# Build the SQLAlchemy URL from the settings above so they are the single source of truth.
# Trusted_Connection=yes is passed to the ODBC driver; no username/password is stored here.
connection_string = (
    f'mssql+pyodbc://{server}/{database}'
    '?driver=ODBC+Driver+17+for+SQL+Server'
    '&Trusted_Connection=yes'
)

engine = create_engine(connection_string)

# NOTE(review): if_exists='append' adds the rows to whatever silver.railway already holds,
# so running this cell twice loads the data twice. Empty the target table first (or use
# another load strategy) if the notebook is meant to be re-runnable.
# index=False keeps the DataFrame's positional index out of the table.
df.to_sql(
    name='railway',
    schema='silver',
    con=engine,
    if_exists='append',
    index=False
)

101