# Creating Dummy Datasets

In [1]:
# Import the necessary libraries
import pandas as pd
import random
import datetime

### Creating Pipeline Data Table Dummy Dataset

In [2]:
# Notebook configuration: random seed and dataset size

# Seed the random module so that Restart Kernel -> Run All regenerates the same dummy data
SEED = 42
random.seed(SEED)

# Number of rows we want in our dataset
num_rows = 10000

In [3]:
# Build each pipeline column as an independent list of num_rows random values

pipeline_ids = [f"PL-{n}" for n in range(1, num_rows + 1)]  # PL-1 ... PL-<num_rows>
lengths = [random.uniform(1, 100) for _ in range(num_rows)]
diameters = [random.uniform(100, 1000) for _ in range(num_rows)]
ages = [random.randint(1, 50) for _ in range(num_rows)]  # randint includes both endpoints

# Categorical columns: one option picked per row
material_options = ['steel', 'plastic', 'concrete']
location_options = ['lagos', 'warri', 'portharcourt', 'bayelsa']
materials = [random.choice(material_options) for _ in range(num_rows)]
locations = [random.choice(location_options) for _ in range(num_rows)]

In [4]:
# Assemble the pipeline table as a mapping of column name -> list of values
# (each key becomes one column when the dict is turned into a dataframe).
# NOTE(review): 'lenght_km' and 'diamteters_mm' look misspelled; they are kept unchanged
# here because they end up as column names in the saved CSV - confirm with consumers
# before renaming.

pipeline_data_table = {
    'pipeline_id': pipeline_ids,
    'lenght_km': lengths,
    'diamteters_mm': diameters,
    'age_years': ages,
    'material': materials,
    'location': locations,
}

In [5]:
# Turn the column dictionary into a pandas dataframe (one column per key)

df = pd.DataFrame(data=pipeline_data_table)

In [6]:
# Display the pipeline dataframe: the bare last expression of the cell is rendered as its output
df

Unnamed: 0,pipeline_id,lenght_km,diamteters_mm,age_years,material,location
0,PL-1,3.163867,205.964141,39,steel,lagos
1,PL-2,15.124574,655.077285,27,plastic,portharcourt
2,PL-3,43.003782,237.044090,32,steel,portharcourt
3,PL-4,78.164780,611.534529,19,concrete,lagos
4,PL-5,76.145205,603.479615,41,concrete,lagos
...,...,...,...,...,...,...
9995,PL-9996,39.479040,904.590474,45,plastic,bayelsa
9996,PL-9997,26.353406,115.942655,12,plastic,lagos
9997,PL-9998,82.566472,919.707763,50,steel,warri
9998,PL-9999,70.481720,894.962528,23,plastic,warri


In [7]:
# Write the pipeline dataframe to a CSV file in the current working directory.
# index=True is the pandas default, spelled out here: the row index is written as the
# first, unnamed column of the file.
df.to_csv('pipeline_data_table.csv', index=True)

### Creating Maintenance Data Table Dummy Dataset

In [8]:
# Number of rows to generate for the maintenance table

num_rows = 10_000

In [9]:
# Build the maintenance columns with the random library

pipeline_ids = [f"PL-{n}" for n in range(1, num_rows + 1)]
repair_type_options = ['routine', 'preventive', 'corrective']
repair_types = [random.choice(repair_type_options) for _ in range(num_rows)]
repair_durations = [random.randint(1, 24) for _ in range(num_rows)]


def _draw_repair_cost(kind, duration):
    """Return a random cost for a single repair.

    'routine'    -> integer drawn from [100, 500]
    'preventive' -> float drawn from [200, 800]
    anything else (i.e. 'corrective') -> float drawn from [500, 1500],
    multiplied by the repair duration so longer corrective repairs cost more.
    """
    if kind == 'routine':
        return random.randint(100, 500)
    if kind == 'preventive':
        return random.uniform(200, 800)
    return random.uniform(500, 1500) * duration


# One cost per row, drawn in row order from the matching type/duration pair
costs = [
    _draw_repair_cost(kind, duration)
    for kind, duration in zip(repair_types, repair_durations)
]


In [10]:
# Assemble the maintenance table as a mapping of column name -> list of values
# (each key becomes one column when the dict is turned into a dataframe).

maintenance_data_table = {
    'pipeline_id': pipeline_ids,
    'repair_type': repair_types,
    'repair_duration_hours': repair_durations,
    'cost_in_dollars': costs,
}
    

In [11]:
# Turn the maintenance column dictionary into a pandas dataframe (one column per key)

df1 = pd.DataFrame(data=maintenance_data_table)

In [12]:
# Display the maintenance dataframe: the bare last expression of the cell is rendered as its output
df1

Unnamed: 0,pipeline_id,repair_type,repair_duration_hours,cost_in_dollars
0,PL-1,preventive,8,486.935598
1,PL-2,routine,2,313.000000
2,PL-3,routine,11,252.000000
3,PL-4,preventive,23,784.632265
4,PL-5,corrective,9,11137.279928
...,...,...,...,...
9995,PL-9996,routine,3,423.000000
9996,PL-9997,routine,4,397.000000
9997,PL-9998,routine,6,334.000000
9998,PL-9999,routine,11,172.000000


In [13]:
# Write the maintenance dataframe to a CSV file in the current working directory.
# index=True is the pandas default, spelled out here: the row index is written as the
# first, unnamed column of the file.
df1.to_csv('maintenance_data_table.csv', index=True)

### Creating Sensor Data Table Dummy Dataset

In [14]:
# Number of rows to generate for the sensor table

num_rows = 10_000

In [15]:
# Build the sensor columns with the random library

pipeline_ids = [f"PL-{n}" for n in range(1, num_rows + 1)]
sensor_id_options = ['sensor-1', 'sensor-2', 'sensor-3', 'sensor-4', 'sensor-5']
sensor_ids = [random.choice(sensor_id_options) for _ in range(num_rows)]

# Date window for the readings; randint is inclusive, so both boundary dates can be drawn
start_date = datetime.datetime(2012, 1, 1)
end_date = datetime.datetime(2021, 12, 31)
window_days = (end_date - start_date).days  # whole days between the two boundaries

# Each reading date is the window start plus a random whole-day offset
date = [
    start_date + datetime.timedelta(days=random.randint(0, window_days))
    for _ in range(num_rows)
]

# Continuous readings, each drawn uniformly from a fixed range
pressure = [random.uniform(10, 100) for _ in range(num_rows)]
temperature = [random.uniform(20, 30) for _ in range(num_rows)]
flow_rate = [random.uniform(0, 10) for _ in range(num_rows)]


In [16]:
# Assemble the sensor table as a mapping of column name -> list of values
# (each key becomes one column when the dict is turned into a dataframe).

sensor_data_table = {
    'sensor_id': sensor_ids,
    'pipeline_id': pipeline_ids,
    'date': date,
    'pressure': pressure,
    'temperature': temperature,
    'flow_rate': flow_rate,
}

In [17]:
# Turn the sensor column dictionary into a pandas dataframe (one column per key)

df2 = pd.DataFrame(data=sensor_data_table)

In [18]:
# Display the sensor dataframe: the bare last expression of the cell is rendered as its output
df2

Unnamed: 0,sensor_id,pipeline_id,date,pressure,temperature,flow_rate
0,sensor-2,PL-1,2016-01-23,89.165094,26.487407,8.841278
1,sensor-4,PL-2,2021-05-06,51.422621,22.123034,2.239861
2,sensor-1,PL-3,2014-08-03,89.938270,24.376793,8.798216
3,sensor-2,PL-4,2019-03-26,91.698826,24.128967,4.434502
4,sensor-3,PL-5,2015-01-19,77.330940,23.286447,4.723534
...,...,...,...,...,...,...
9995,sensor-2,PL-9996,2015-10-14,36.631364,20.713640,2.362084
9996,sensor-1,PL-9997,2014-11-01,17.839181,29.162270,5.139987
9997,sensor-3,PL-9998,2016-04-01,43.170468,22.554705,6.052970
9998,sensor-2,PL-9999,2016-05-29,69.504303,22.656185,4.041316


In [19]:
# Write the sensor dataframe to a CSV file in the current working directory.
# index=True is the pandas default, spelled out here: the row index is written as the
# first, unnamed column of the file.
df2.to_csv('sensor_data_table.csv', index=True)

### Creating Inspection Data Table Dummy Dataset

In [20]:
# Number of rows to generate for the inspection table

num_rows = 10_000

In [21]:
# Build the inspection columns with the random library

pipeline_ids = [f"PL-{n}" for n in range(1, num_rows + 1)]
corrosion_level = [random.randint(0, 5) for _ in range(num_rows)]    # integer 0..5 inclusive
deformation_level = [random.randint(0, 5) for _ in range(num_rows)]  # integer 0..5 inclusive

# A leak is flagged ('Yes') when either level is above 3, otherwise 'No'
leak_detection = [
    'Yes' if (corr > 3 or defor > 3) else 'No'
    for corr, defor in zip(corrosion_level, deformation_level)
]


In [22]:
# Assemble the inspection table as a mapping of column name -> list of values
# (each key becomes one column when the dict is turned into a dataframe).

inspection_data_table = {
    'pipeline_id': pipeline_ids,
    'corrosion_level': corrosion_level,
    'deformation_level': deformation_level,
    'leak_detection': leak_detection,
}

In [23]:
# Turn the inspection column dictionary into a pandas dataframe (one column per key)

df3 = pd.DataFrame(data=inspection_data_table)

In [24]:
# Display the inspection dataframe: the bare last expression of the cell is rendered as its output
df3

Unnamed: 0,pipeline_id,corrosion_level,deformation_level,leak_detection
0,PL-1,4,2,Yes
1,PL-2,3,3,No
2,PL-3,1,0,No
3,PL-4,2,4,Yes
4,PL-5,2,3,No
...,...,...,...,...
9995,PL-9996,2,1,No
9996,PL-9997,2,0,No
9997,PL-9998,5,0,Yes
9998,PL-9999,2,4,Yes


In [25]:
# Write the inspection dataframe to a CSV file in the current working directory.
# index=True is the pandas default, spelled out here: the row index is written as the
# first, unnamed column of the file.
df3.to_csv('inspection_data_table.csv', index=True)