# Import Libraries

In [10]:
import pandas as pd
import numpy as np

# Load Dataset

In [11]:
# Read the tab-separated sensor export into a DataFrame.
df = pd.read_csv('./pressure_sensor_data.tsv', sep='\t')

# Initial Viewing of data

In [12]:
# Peek at the first five rows to see how the raw table is laid out.
df.head(n=5)

Unnamed: 0,Sensor Location,1/5/2023,1/6/2023,1/18/2023,1/28/2023,2/6/2023,2/7/2023,2/9/2023,2/28/2023
0,Entry A,29.0,37.0,,,23.0,16.0,13.0,
1,Area A,0.0,11.0,14.0,2.0,11.0,4.0,15.0,15.0
2,Area B,,28.0,0.0,21.0,19.0,23.0,6.0,16.0
3,Entry A,6.0,,4.0,5.0,,,11.0,3.0
4,Exit A,33.0,7.0,2.0,22.0,23.0,11.0,,13.0


In [13]:
# Summarise the column dtypes and non-null counts of the raw table.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Sensor Location  6 non-null      object 
 1   1/5/2023         5 non-null      float64
 2   1/6/2023         5 non-null      float64
 3   1/18/2023        5 non-null      float64
 4   1/28/2023        5 non-null      float64
 5   2/6/2023         5 non-null      float64
 6   2/7/2023         5 non-null      float64
 7   2/9/2023         5 non-null      float64
 8   2/28/2023        5 non-null      float64
dtypes: float64(8), object(1)
memory usage: 564.0+ bytes


# 1. Tidiness Issue: Column headers are values, not variable names

The date column headers are values of a single variable, so they should be moved into rows using `melt()`. There should be three columns after melting: `Sensor Location`, `Date` and `Pressure`.

In [14]:
# Reshape wide -> long: every date header becomes a value in `Date`,
# and the cell under it becomes the matching `Pressure` reading.
updated_df = pd.melt(
    df,
    id_vars=['Sensor Location'],
    var_name='Date',
    value_name='Pressure',
)

In [15]:
# Preview only the first ten melted rows instead of rendering the whole frame.
updated_df.head(10)

Unnamed: 0,Sensor Location,Date,Pressure
0,Entry A,1/5/2023,29.0
1,Area A,1/5/2023,0.0
2,Area B,1/5/2023,
3,Entry A,1/5/2023,6.0
4,Exit A,1/5/2023,33.0
5,Exit B,1/5/2023,18.0
6,Entry A,1/6/2023,37.0
7,Area A,1/6/2023,11.0
8,Area B,1/6/2023,28.0
9,Entry A,1/6/2023,


# 2. Tidiness Issue: Multiple variables stored in one column

The `Sensor Location` column stores both where the sensor belongs (Area, Entry, etc.) and which location that is (A, B, etc.). Each of these should be stored in its own column.

In [16]:
# Split values such as "Entry A" on the first space: the prefix ("Entry")
# goes to 'Sensor Postion' and the suffix ("A") replaces 'Sensor Location'.
#
# Because 'Sensor Location' is overwritten with the suffix, running this cell
# a second time would try to split values that no longer contain a space.
# The guard turns a re-run into a no-op.
#
# NOTE(review): 'Sensor Postion' looks like a typo for 'Sensor Position'; the
# name is kept unchanged here because the displayed outputs below use it.
if 'Sensor Postion' not in updated_df.columns:
    updated_df[['Sensor Postion','Sensor Location']] = (
        updated_df['Sensor Location'].str.split(" ", n=1, expand=True)
    )

In [18]:
# Check the first five rows after splitting the sensor column.
updated_df.head(n=5)

Unnamed: 0,Sensor Location,Date,Pressure,Sensor Postion
0,A,1/5/2023,29.0,Entry
1,A,1/5/2023,0.0,Area
2,B,1/5/2023,,Area
3,A,1/5/2023,6.0,Entry
4,A,1/5/2023,33.0,Exit


# 3. Quality Issue: Validity and Accuracy

## 3.1 Between Jan 5th and Jan 6th, sensors accidentally went beyond 25 bar.

In [19]:
# Convert the melted date headers (e.g. "1/18/2023") to datetimes.
# The month/day/year format is given explicitly so parsing does not depend on
# pandas' format inference and a value in any other layout raises an error
# instead of being silently misread.
updated_df['Date'] = pd.to_datetime(updated_df['Date'], format='%m/%d/%Y')

In [20]:
# Confirm the dtypes after the conversion (Date should now be a datetime column).
updated_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48 entries, 0 to 47
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   Sensor Location  48 non-null     object        
 1   Date             48 non-null     datetime64[ns]
 2   Pressure         40 non-null     float64       
 3   Sensor Postion   48 non-null     object        
dtypes: datetime64[ns](1), float64(1), object(2)
memory usage: 1.6+ KB


In [24]:
# Flag readings above 25 that were recorded on 2023-01-05 or 2023-01-06.
incident_days = pd.to_datetime(['2023-01-05', '2023-01-06'])
on_incident_day = updated_df['Date'].isin(incident_days)
above_limit = updated_df['Pressure'].gt(25)
invalid_data = updated_df.loc[on_incident_day & above_limit]

In [26]:
# Show the rows that were flagged as invalid.
invalid_data

Unnamed: 0,Sensor Location,Date,Pressure,Sensor Postion
0,A,2023-01-05,29.0,Entry
4,A,2023-01-05,33.0,Exit
6,A,2023-01-06,37.0,Entry
8,B,2023-01-06,28.0,Area


In [27]:
# Keep every row whose index label was not flagged as invalid.
is_flagged = updated_df.index.isin(invalid_data.index)
clean_df = updated_df.loc[~is_flagged]

In [30]:
# Check the first five remaining rows after dropping the flagged readings.
clean_df.head(n=5)

Unnamed: 0,Sensor Location,Date,Pressure,Sensor Postion
1,A,2023-01-05,0.0,Area
2,B,2023-01-05,,Area
3,A,2023-01-05,6.0,Entry
5,B,2023-01-05,18.0,Exit
7,A,2023-01-06,11.0,Area


## Before removing invalid data

In [31]:
# Summary statistics of the melted data, still including the flagged rows.
updated_df.describe()

Unnamed: 0,Date,Pressure
count,48,40.0
mean,2023-01-28 21:00:00,13.6
min,2023-01-05 00:00:00,0.0
25%,2023-01-15 00:00:00,5.75
50%,2023-02-01 12:00:00,13.0
75%,2023-02-07 12:00:00,19.25
max,2023-02-28 00:00:00,37.0
std,,9.088736


## After removing invalid data

In [32]:
# Summary statistics once the flagged rows have been dropped.
clean_df.describe()

Unnamed: 0,Date,Pressure
count,44,36.0
mean,2023-01-31 00:00:00,11.583333
min,2023-01-05 00:00:00,0.0
25%,2023-01-18 00:00:00,5.0
50%,2023-02-06 00:00:00,12.0
75%,2023-02-09 00:00:00,16.0
max,2023-02-28 00:00:00,23.0
std,,6.983143


The range of the pressure data is correct after cleaning: the remaining readings span 0-23, which falls within the valid 0-25 range.