## Data cleaning

### Import libraries

In [1]:
# Import libraries.
import pandas as pd
import numpy as np
import re

## Client sales data

### Load data

In [None]:
# Load "2021 Client Sales" data.
sales_2021 = pd.read_csv('2021 Client Sales.csv')

# Load "2022 Client Sales" data.
sales_2022 = pd.read_csv('2022 Client Sales.csv')

### View data

In [None]:
# View sales_2021 head.
sales_2021.head()

Unnamed: 0,Province,Town/City,Post code,Company code,Petrol station code,Petrol station,Date,Sold units
0,Araba/Álava,ALTUBE,1139,33420,1004,ALTUBE DCHO.,6/1/2021,1
1,Araba/Álava,ALTUBE,1139,33420,1004,ALTUBE DCHO.,5/2/2021,1
2,Araba/Álava,ALTUBE,1139,33420,1004,ALTUBE DCHO.,15/2/2021,2
3,Araba/Álava,ALTUBE,1139,33420,1004,ALTUBE DCHO.,17/2/2021,1
4,Araba/Álava,ALTUBE,1139,33420,1004,ALTUBE DCHO.,27/2/2021,1


In [None]:
# View sales_2021 tail.
sales_2021.tail()

Unnamed: 0,Province,Town/City,Post code,Company code,Petrol station code,Petrol station,Date,Sold units
175565,Zaragoza,ZARAGOZA,50021,96995,1790,PUERTO VENECIA,26/12/2021,1
175566,Zaragoza,ZARAGOZA,50021,96995,1790,PUERTO VENECIA,27/12/2021,2
175567,Zaragoza,ZARAGOZA,50021,96995,1790,PUERTO VENECIA,29/12/2021,1
175568,Zaragoza,ZARAGOZA,50021,96995,1790,PUERTO VENECIA,30/12/2021,3
175569,Zaragoza,ZARAGOZA,50021,96995,1790,PUERTO VENECIA,31/12/2021,17


In [None]:
# View sales_2022 head.
sales_2022.head()

Unnamed: 0,Province,Town/City,Post code,Company code,Petrol station code,Petrol station,Date,Sold units
0,Araba/Álava,ALTUBE,1139,33420,1004,ALTUBE DCHO.,1/1/2022,2
1,Araba/Álava,ALTUBE,1139,33420,1004,ALTUBE DCHO.,4/1/2022,2
2,Araba/Álava,ALTUBE,1139,33420,1004,ALTUBE DCHO.,8/1/2022,2
3,Araba/Álava,ALTUBE,1139,33420,1004,ALTUBE DCHO.,15/1/2022,1
4,Araba/Álava,ALTUBE,1139,33420,1004,ALTUBE DCHO.,25/1/2022,1


In [None]:
# View sales_2022 tail.
sales_2022.tail()

Unnamed: 0,Province,Town/City,Post code,Company code,Petrol station code,Petrol station,Date,Sold units
155410,Zaragoza,ZARAGOZA,50021,96995,1790,PUERTO VENECIA,30/11/2022,1
155411,Zaragoza,ZARAGOZA,50021,96995,1790,PUERTO VENECIA,2/12/2022,2
155412,Zaragoza,ZARAGOZA,50021,96995,1790,PUERTO VENECIA,3/12/2022,3
155413,Zaragoza,ZARAGOZA,50021,96995,1790,PUERTO VENECIA,6/12/2022,3
155414,Zaragoza,ZARAGOZA,50021,96995,1790,PUERTO VENECIA,7/12/2022,1


### Data cleaning

#### Validating the data

In [None]:
# Validate the data function.
def validate_data(df):

    # Check the data types.
    data_types = df.dtypes
    print("Data types:")
    print(data_types)

    # Check the data shape.
    data_shape = df.shape
    print("Data shape:")
    print(data_shape)

    # Check for unique values
    unique_counts = df.nunique()
    print("Unique values per column:")
    print(unique_counts)

    # Check for duplicates
    duplicate_count = df.duplicated().sum()
    print("\nNumber of duplicate rows:")
    print(duplicate_count)

    # Summary statistics
    summary_stats = df.describe()
    print("\nSummary statistics:")
    print(summary_stats)

    return data_types, data_shape, unique_counts, duplicate_count, summary_stats

In [None]:
# Validating sales_2021.
validate_data(sales_2021)

Data types:
Province               object
Town/City              object
Post code               int64
Company code            int64
Petrol station code     int64
Petrol station         object
Date                   object
Sold units              int64
dtype: object
Data shape:
(175570, 8)
Unique values per column:
Province                50
Town/City              528
Post code              621
Company code           804
Petrol station code    804
Petrol station         804
Date                   365
Sold units             179
dtype: int64

Number of duplicate rows:
0

Summary statistics:
           Post code   Company code  Petrol station code     Sold units
count  175570.000000  175570.000000        175570.000000  175570.000000
mean    25468.181472   48039.153853          1186.722971       6.692174
std     14578.950331   36827.877303           519.483427       9.998190
min      1006.000000     652.000000             1.000000     -57.000000
25%     11130.000000   13214.000000          

(Province               object
 Town/City              object
 Post code               int64
 Company code            int64
 Petrol station code     int64
 Petrol station         object
 Date                   object
 Sold units              int64
 dtype: object,
 (175570, 8),
 Province                50
 Town/City              528
 Post code              621
 Company code           804
 Petrol station code    804
 Petrol station         804
 Date                   365
 Sold units             179
 dtype: int64,
 0,
            Post code   Company code  Petrol station code     Sold units
 count  175570.000000  175570.000000        175570.000000  175570.000000
 mean    25468.181472   48039.153853          1186.722971       6.692174
 std     14578.950331   36827.877303           519.483427       9.998190
 min      1006.000000     652.000000             1.000000     -57.000000
 25%     11130.000000   13214.000000           808.000000       1.000000
 50%     28024.000000   33874.000000     

***Notes on validation***:
- Some incorrect data types. The codes need to be strings. Date needs to be in datetime format.
- Unique values: All provinces & all dates are in the data set (365).
- Max sold units is 245 which seems reasonable. The minimum is a negative number but this could mean refunds - check with stakeholder.
- 175570 rows

In [None]:
# Validating sales_2022.
validate_data(sales_2022)

Data types:
Province               object
Town/City              object
Post code               int64
Company code            int64
Petrol station code     int64
Petrol station         object
Date                   object
Sold units              int64
dtype: object
Data shape:
(155415, 8)
Unique values per column:
Province                50
Town/City              539
Post code              629
Company code           819
Petrol station code    819
Petrol station         819
Date                   348
Sold units             230
dtype: int64

Number of duplicate rows:
0

Summary statistics:
           Post code   Company code  Petrol station code     Sold units
count  155415.000000  155415.000000        155415.000000  155415.000000
mean    25595.250027   48408.459023          1203.413641       7.365151
std     14479.028499   36960.573034           523.050468      12.908353
min      1006.000000     652.000000             1.000000     -77.000000
25%     11370.000000   13202.000000          

(Province               object
 Town/City              object
 Post code               int64
 Company code            int64
 Petrol station code     int64
 Petrol station         object
 Date                   object
 Sold units              int64
 dtype: object,
 (155415, 8),
 Province                50
 Town/City              539
 Post code              629
 Company code           819
 Petrol station code    819
 Petrol station         819
 Date                   348
 Sold units             230
 dtype: int64,
 0,
            Post code   Company code  Petrol station code     Sold units
 count  155415.000000  155415.000000        155415.000000  155415.000000
 mean    25595.250027   48408.459023          1203.413641       7.365151
 std     14479.028499   36960.573034           523.050468      12.908353
 min      1006.000000     652.000000             1.000000     -77.000000
 25%     11370.000000   13202.000000           837.000000       1.000000
 50%     28035.000000   33889.000000     

***Notes on validation***:
- Some incorrect data types. The codes need to be strings.
- Unique values: All provinces are in the data set. Not all dates are in the data set (348). There are a different number of unique values for the other columns than in the 2021 data (see the below table).
- Max sold units is 717 which seems reasonable especially as this was the year with the heatwaves. The minimum is a negative number but this could mean refunds - check with stakeholder.
- 155415 rows which is less than for the 2021 data (175570 rows)

![Number of unique values.png](attachment:cd9eab62-828a-407d-87fd-2aa6bd776ffa.png)

#### Vertically concatenate sales_2021 and sales_2022

In [None]:
# Concatenate sales_2021 and sales_2022.
sales = pd.concat([sales_2021, sales_2022.iloc[1:]], ignore_index=True)

In [None]:
# View sales head.
sales.head()

Unnamed: 0,Province,Town/City,Post code,Company code,Petrol station code,Petrol station,Date,Sold units
0,Araba/Álava,ALTUBE,1139,33420,1004,ALTUBE DCHO.,6/1/2021,1
1,Araba/Álava,ALTUBE,1139,33420,1004,ALTUBE DCHO.,5/2/2021,1
2,Araba/Álava,ALTUBE,1139,33420,1004,ALTUBE DCHO.,15/2/2021,2
3,Araba/Álava,ALTUBE,1139,33420,1004,ALTUBE DCHO.,17/2/2021,1
4,Araba/Álava,ALTUBE,1139,33420,1004,ALTUBE DCHO.,27/2/2021,1


In [None]:
# View sales tail.
sales.tail()

Unnamed: 0,Province,Town/City,Post code,Company code,Petrol station code,Petrol station,Date,Sold units
330979,Zaragoza,ZARAGOZA,50021,96995,1790,PUERTO VENECIA,30/11/2022,1
330980,Zaragoza,ZARAGOZA,50021,96995,1790,PUERTO VENECIA,2/12/2022,2
330981,Zaragoza,ZARAGOZA,50021,96995,1790,PUERTO VENECIA,3/12/2022,3
330982,Zaragoza,ZARAGOZA,50021,96995,1790,PUERTO VENECIA,6/12/2022,3
330983,Zaragoza,ZARAGOZA,50021,96995,1790,PUERTO VENECIA,7/12/2022,1


In [None]:
# Check that it's concatenated properly.
sales.to_csv('sales.csv', index=False)

In [None]:
# Check how many rows and columns in sales.
sales.shape

(330984, 8)

***Note:*** There are 330984 rows and 8 columns.

#### Clean column names

In [None]:
# Clean column names function.
def clean_column_names(df):
    df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')
    return df

In [None]:
# Clean column names.
clean_column_names(sales)

# View sales column names.
sales.columns

Index(['province', 'town/city', 'post_code', 'company_code',
       'petrol_station_code', 'petrol_station', 'date', 'sold_units'],
      dtype='object')

In [None]:
# Rename "post_code" to postcode.
sales.rename(columns={'post_code': 'postcode'}, inplace=True)

# View sales column names.
sales.columns

Index(['province', 'town/city', 'postcode', 'company_code',
       'petrol_station_code', 'petrol_station', 'date', 'sold_units'],
      dtype='object')

#### Check for missing values (nulls)

In [None]:
# Create functions for missing values.
def find_nulls(df):
    return df.isnull().sum()

def display_nulls(df, column_name):
    null_rows = df[df[column_name].isnull()]
    display(null_rows)

In [None]:
# Find nulls.
find_nulls(sales)

province               0
town/city              0
postcode               0
company_code           0
petrol_station_code    0
petrol_station         0
date                   0
sold_units             0
dtype: int64

***Note:*** There are no missing values (nulls).

#### Changing data types

In [None]:
# Change the code columns from integers to strings.
# Create function
def convert_to_string(df, columns):
    df[columns] = df[columns].astype(str)
    return df

# Use function.
sales = convert_to_string(sales, ['postcode', 'company_code', 'petrol_station_code'])

# Check data types of sales.
sales.dtypes

province               object
town/city              object
postcode               object
company_code           object
petrol_station_code    object
petrol_station         object
date                   object
sold_units              int64
dtype: object

In [None]:
# Convert "date" to datetime, assuming the format day, month, year for all rows.
# If the day, month, or years aren't in this format for all rows, this will be discovering when creating separate date, month, and year columns.

# Create function.
def convert_to_datetime(df, column_name, date_format="%d/%m/%Y"):
    df[column_name] = pd.to_datetime(df[column_name], format=date_format)
    return df

# Use function.
sales = convert_to_datetime(sales, 'date')

# Check data types of sales.
sales.dtypes

province                       object
town/city                      object
postcode                       object
company_code                   object
petrol_station_code            object
petrol_station                 object
date                   datetime64[ns]
sold_units                      int64
dtype: object

In [None]:
# Function to seperate date, month, and year columns & display unique values
def split_date(df, date_column):
    df['day'] = df[date_column].dt.day
    df['month'] = df[date_column].dt.month
    df['year'] = df[date_column].dt.year

    print("Day:", np.sort(df['day'].unique()))
    print("Month:", np.sort(df['month'].unique()))
    print("Year:", np.sort(df['year'].unique()))

    return df

In [None]:
# Create separate date, month, and year columns & display unique values
sales = split_date(sales, 'date')

Day: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 30 31]
Month: [ 1  2  3  4  5  6  7  8  9 10 11 12]
Year: [2021 2022]


In [None]:
# Function to create a "month_text" column with the month values as text abbreviations & display unique values

def add_month_text_column(df, date_column):
    # Create the "month_text" column with month abbreviations
    df['month_text'] = df[date_column].dt.month_name().str[:3]

    # View the unique values of month_text
    print(df['month_text'].unique())

    return df

In [None]:
# Create a "month_text" column with the month values as text abbreviations & display unique values
sales = add_month_text_column(sales, 'date')

['Jan' 'Feb' 'Mar' 'Apr' 'May' 'Jun' 'Jul' 'Aug' 'Sep' 'Oct' 'Nov' 'Dec']


#### Create additional columns: day of the week, season

In [None]:
# Function to create a "day of the week" column & display unique values
def add_day_of_week_column(df, date_column):
    # Create the "day_of_week" column
    df['day_of_week'] = df[date_column].dt.day_name()

    # View the unique values of day_of_week
    print(df['day_of_week'].unique())

    return df

In [None]:
# Create a "day of the week" column & display unique values
sales = add_day_of_week_column(sales, 'date')

['Wednesday' 'Friday' 'Monday' 'Saturday' 'Tuesday' 'Sunday' 'Thursday']


In [None]:
# Create a function with a mapping dictionary for the seasons & create a new column to display seasons
def add_season_column(df, month_text_column):
    month_season_mapping = {
        'Jan': 'Winter',
        'Feb': 'Winter',
        'Mar': 'Spring',
        'Apr': 'Spring',
        'May': 'Spring',
        'Jun': 'Summer',
        'Jul': 'Summer',
        'Aug': 'Summer',
        'Sep': 'Autumn',
        'Oct': 'Autumn',
        'Nov': 'Autumn',
        'Dec': 'Winter'
    }
    df['season'] = df[month_text_column].replace(month_season_mapping)
    print(df['season'].unique())

    return df

In [None]:
# Create a season column & check unique values
sales = add_season_column(sales, 'month_text')

['Winter' 'Spring' 'Summer' 'Autumn']


#### Cleaning string values

In [None]:
# Create a function to clean the string values by changing to title text and removing special characters.
def clean_text(df, columns):
        for column in columns:
            df[column] = df[column].str.title()
            df[column] = df[column].str.replace(r'[^a-zA-ZáéíóúñÁÉÍÓÚÑüÜ\s/1-2]', '', regex=True)
        return df

In [None]:
# View sales again.
sales.head(2)

Unnamed: 0,province,town/city,postcode,company_code,petrol_station_code,petrol_station,date,sold_units,day,month,year,month_text,day_of_week,season
0,Araba/Álava,ALTUBE,1139,33420,1004,ALTUBE DCHO.,2021-01-06,1,6,1,2021,Jan,Wednesday,Winter
1,Araba/Álava,ALTUBE,1139,33420,1004,ALTUBE DCHO.,2021-02-05,1,5,2,2021,Feb,Friday,Winter


In [None]:
# Use the clean_text function of province, town/city, and petrol_station.
clean_text(sales, ['province', 'town/city', 'sales_location'])

# View sales.
sales.head()

Unnamed: 0,province,town/city,postcode,company_code,petrol_station_code,petrol_station,date,sold_units,day,month,year,month_text,day_of_week,season
0,Araba/Álava,Altube,1139,33420,1004,Altube Dcho,2021-01-06,1,6,1,2021,Jan,Wednesday,Winter
1,Araba/Álava,Altube,1139,33420,1004,Altube Dcho,2021-02-05,1,5,2,2021,Feb,Friday,Winter
2,Araba/Álava,Altube,1139,33420,1004,Altube Dcho,2021-02-15,2,15,2,2021,Feb,Monday,Winter
3,Araba/Álava,Altube,1139,33420,1004,Altube Dcho,2021-02-17,1,17,2,2021,Feb,Wednesday,Winter
4,Araba/Álava,Altube,1139,33420,1004,Altube Dcho,2021-02-27,1,27,2,2021,Feb,Saturday,Winter


In [None]:
# Check unique values in the province column.
sales['province'].unique()

array(['Araba/Álava', 'Albacete', 'Alicante/Alacant', 'Almería', 'Ávila',
       'Badajoz', 'Balears Illes', 'Barcelona', 'Burgos', 'Cáceres',
       'Cádiz', 'Castellón/Castelló', 'Ciudad Real', 'Córdoba',
       'Coruña A', 'Cuenca', 'Girona', 'Granada', 'Guadalajara',
       'Gipuzkoa', 'Huelva', 'Huesca', 'Jaén', 'León', 'Lleida',
       'Rioja La', 'Lugo', 'Madrid', 'Málaga', 'Murcia', 'Navarra',
       'Ourense', 'Asturias', 'Palencia', 'Palmas Las', 'Pontevedra',
       'Salamanca', 'Santa Cruz De Tenerife', 'Cantabria', 'Segovia',
       'Sevilla', 'Soria', 'Tarragona', 'Teruel', 'Toledo',
       'Valencia/Valncia', 'Valladolid', 'Bizkaia', 'Zamora', 'Zaragoza'],
      dtype=object)

Some of these province names are incorrect.

In [None]:
# Replace the incorrect province name with the correct one.
# Create a dictionary.
province_dict = {'removed to ensure anonymity of employer'}


# Apply the dictionary to clean province values.
sales['province'] = sales['province'].replace(province_dict)

In [None]:
# View province unique values (in alphabetical order).
sorted(sales['province'].unique())

['A Coruña',
 'Albacete',
 'Alicante',
 'Almería',
 'Araba-Álava',
 'Asturias',
 'Badajoz',
 'Balearic Islands',
 'Barcelona',
 'Biscay',
 'Burgos',
 'Cantabria',
 'Castellón',
 'Ciudad Real',
 'Cuenca',
 'Cáceres',
 'Cádiz',
 'Córdoba',
 'Gipuzkoa',
 'Girona',
 'Granada',
 'Guadalajara',
 'Huelva',
 'Huesca',
 'Jaén',
 'La Rioja',
 'Las Palmas',
 'León',
 'Lleida',
 'Lugo',
 'Madrid',
 'Murcia',
 'Málaga',
 'Navarra',
 'Ourense',
 'Palencia',
 'Pontevedra',
 'Salamanca',
 'Santa Cruz de Tenerife',
 'Segovia',
 'Sevilla',
 'Soria',
 'Tarragona',
 'Teruel',
 'Toledo',
 'Valencia',
 'Valladolid',
 'Zamora',
 'Zaragoza',
 'Ávila']

In [None]:
# Check unique values in town/city column.
sales['town/city'].unique()

array(['Altube', 'Barrundia El Burgo', 'Igay', 'Llodio', 'Lopidana',
       'Vitoria', 'Albacete', 'Almansa', 'Bonete', 'La Gineta', 'La Roda',
       'Alcoy', 'Alicante', 'Altea', 'Campello', 'Cocentaina',
       'Crevillente', 'Desconocido', 'DNia Alicante', 'El Realengo',
       'Elche', 'Guardamar Del Segura', 'Javea', 'Javea Alicante',
       'La Nucia', 'Los Angeles', 'Ondara', 'Pinoso',
       'San Miguel De Salinas', 'San Vicente De Raspeig',
       'Santa Pola Alicante', 'Sax', 'Sta Faz Alicante', 'Torreveija',
       'Torrevieja', 'Villajoyosa', 'Almería', 'Cuevas De AlmanzoraPalom',
       'Garrucha AlmerA', 'Huercal', 'Espinosa De Los Caballero',
       'Badajoz', 'Lobon', 'Merida', 'Mérida', 'Monesterio',
       'Monesterio Badajoz', 'Zafra Badajoz', 'Algaida', 'Artá',
       'Cala Ratjada', 'Calviá', 'Campos', 'Ciudadela De Menorca',
       'Llucmajor', 'Llucmayor', 'Mahon', 'Manacor', 'Palma De Mallorca',
       'Palma De Mallorca Balear', 'Pollensa', 'Santany Palma',
  

Some of these towns/cities aren't correct (incorrect spelling or have the province at the end of the name).

In [None]:
# Replace the incorrect town/city name with the correct one.
# Create a dictionary.
town_cities_dict = {'removed to ensure anonymity of employer'}

# Apply the dictionary to clean town/city values.
sales['town/city'] = sales['town/city'].replace(town_cities_dict)

In [None]:
# View town/city unique values (in alphabetical order).
sorted(sales['town/city'].unique())

['A Coruña',
 'A Rúa',
 'Abanto y Ciérbana',
 'Aguilar De Campoo',
 'Agüimes',
 'Aielo De Malferit',
 'Ajalvir',
 'Alagón',
 'Albacete',
 'Alberique',
 'Albolote',
 'Alboraya',
 'Alcalá De Guadaira',
 'Alcalá De Henares',
 'Alcobendas',
 'Alcorisa',
 'Alcoy',
 'Alcázar De San Juan',
 'Aldeanueva De Figueroa',
 'Alfafar',
 'Algaida',
 'Algarrobo',
 'Algeciras',
 'Alhama',
 'Alicante',
 'Almansa',
 'Almazán',
 'Almenara',
 'Almería',
 'Almodóvar Del Río',
 'Alovera',
 'Altafulla',
 'Altea',
 'Altube',
 'Ameixeira',
 'Amorebieta-Etxano',
 'Amposta',
 'Andújar',
 'Antequera',
 'Arafo',
 'Arahal',
 'Aranda De Duero',
 'Arapiles',
 'Arbo Del Penedes',
 'Arona',
 'Arrabal De Portillo',
 'Arrigorriaga',
 'Arroyo De La Encomienda',
 'Arroyo De La Miel',
 'Artebakarra  Derio',
 'Artá',
 'Astorga',
 'Azuqueca De Henares',
 'Badajoz',
 'Badalona',
 'Baeza',
 'Bailén',
 'Balaguer',
 'Banyoles',
 'Baracaldo',
 'Barbastro',
 'Barberá Del Vallés',
 'Barcelona',
 'Barcena De Cicero',
 'Barna',
 'Barrei

In [None]:
# View unique sales locations names.
sorted(sales['sales_location'].unique())

['A Veiga',
 'Abanilla',
 'Ademuz',
 'Aeropuerto Manises',
 'Agama',
 'Aguas Nuevas',
 'Aguilar',
 'Aielo',
 'Ajalvir Md',
 'Ajalvir Mi',
 'Alamillo',
 'Alaska Málaga',
 'Albacete',
 'Albares',
 'Alberique',
 'Alberto Aguilera',
 'Albolote',
 'Alboraya',
 'Alboraya Pueblo',
 'Alcalá De Henares',
 'Alcalá Pueblo',
 'Alcobendas',
 'Alcorisa',
 'Alcoy',
 'Alcázar',
 'Aldeanueva',
 'Alfafar',
 'Alfaz Del Pi',
 'Algaida',
 'Algar',
 'Algarrobo',
 'Algarrobo Costa',
 'Algeciras Puerto',
 'Almansa',
 'Almazán',
 'Almenara',
 'Almería',
 'Almodóvar',
 'Alovera',
 'Alpicat',
 'Altabix',
 'Altafulla',
 'Altos De Marbella I',
 'Altos De Marbella Ii',
 'Altube Dcho',
 'Altube Izq',
 'Ameixeira Norte',
 'Ameixeira Sur',
 'Amorebieta',
 'Amorebieta A Md',
 'Amorebieta A Mi',
 'Andújar',
 'Apeadero',
 'Arabi',
 'Arahal',
 'Arahal Ii',
 'Aranda',
 'Arapiles',
 'Arc De Bara',
 'Archiduque Carlos',
 'Area 2 Madrid',
 'Area 2 Zaragoza',
 'Area Montcada Norte',
 'Area Montcada Sur',
 'Area Penedes Norte',

In [None]:
# Remove any extra spaces between words.
# Replace multiple spaces with a single space
sales['sales_location'] = sales['sales_location'].apply(lambda x: re.sub(r'\s+', ' ', x).strip())

# View petrol station unique values.
sorted(sales['sales_location'].unique())

['A Veiga',
 'Abanilla',
 'Ademuz',
 'Aeropuerto Manises',
 'Agama',
 'Aguas Nuevas',
 'Aguilar',
 'Aielo',
 'Ajalvir Md',
 'Ajalvir Mi',
 'Alamillo',
 'Alaska Málaga',
 'Albacete',
 'Albares',
 'Alberique',
 'Alberto Aguilera',
 'Albolote',
 'Alboraya',
 'Alboraya Pueblo',
 'Alcalá De Henares',
 'Alcalá Pueblo',
 'Alcobendas',
 'Alcorisa',
 'Alcoy',
 'Alcázar',
 'Aldeanueva',
 'Alfafar',
 'Alfaz Del Pi',
 'Algaida',
 'Algar',
 'Algarrobo',
 'Algarrobo Costa',
 'Algeciras Puerto',
 'Almansa',
 'Almazán',
 'Almenara',
 'Almería',
 'Almodóvar',
 'Alovera',
 'Alpicat',
 'Altabix',
 'Altafulla',
 'Altos De Marbella I',
 'Altos De Marbella Ii',
 'Altube Dcho',
 'Altube Izq',
 'Ameixeira Norte',
 'Ameixeira Sur',
 'Amorebieta',
 'Amorebieta A Md',
 'Amorebieta A Mi',
 'Andújar',
 'Apeadero',
 'Arabi',
 'Arahal',
 'Arahal Ii',
 'Aranda',
 'Arapiles',
 'Arc De Bara',
 'Archiduque Carlos',
 'Area 2 Madrid',
 'Area 2 Zaragoza',
 'Area Montcada Norte',
 'Area Montcada Sur',
 'Area Penedes Norte',

In [None]:
# Change the roman numerals to the correct format e.g. "Ii" to "II".
sales['sales_location'] = (
    sales['sales_location']
    .str.replace('Ii', 'II')
    .str.replace('Iii', 'III')
    .str.replace('IIi', 'III')
    .str.replace('Iv', 'IV'))

# View petrol station unique values.
sorted(sales['sales_location'].unique())

['A Veiga',
 'Abanilla',
 'Ademuz',
 'Aeropuerto Manises',
 'Agama',
 'Aguas Nuevas',
 'Aguilar',
 'Aielo',
 'Ajalvir Md',
 'Ajalvir Mi',
 'Alamillo',
 'Alaska Málaga',
 'Albacete',
 'Albares',
 'Alberique',
 'Alberto Aguilera',
 'Albolote',
 'Alboraya',
 'Alboraya Pueblo',
 'Alcalá De Henares',
 'Alcalá Pueblo',
 'Alcobendas',
 'Alcorisa',
 'Alcoy',
 'Alcázar',
 'Aldeanueva',
 'Alfafar',
 'Alfaz Del Pi',
 'Algaida',
 'Algar',
 'Algarrobo',
 'Algarrobo Costa',
 'Algeciras Puerto',
 'Almansa',
 'Almazán',
 'Almenara',
 'Almería',
 'Almodóvar',
 'Alovera',
 'Alpicat',
 'Altabix',
 'Altafulla',
 'Altos De Marbella I',
 'Altos De Marbella II',
 'Altube Dcho',
 'Altube Izq',
 'Ameixeira Norte',
 'Ameixeira Sur',
 'Amorebieta',
 'Amorebieta A Md',
 'Amorebieta A Mi',
 'Andújar',
 'Apeadero',
 'Arabi',
 'Arahal',
 'Arahal II',
 'Aranda',
 'Arapiles',
 'Arc De Bara',
 'Archiduque Carlos',
 'Area 2 Madrid',
 'Area 2 Zaragoza',
 'Area Montcada Norte',
 'Area Montcada Sur',
 'Area Penedes Norte',

In [None]:
# Check unique postcodes (should all have X digits).
sales['postcode'].unique()

array(['1139', '1206', '1213', '1400', '1196', '1006', '1007', '1013',
       '2005', '2006', '2640', '2691', '2110', '2630', '3804', '3002',
       '3009', '3580', '3750', '3590', '3560', '3820', '3330', '3509',
       '3690', '3730', '3700', '3339', '3200', '3290', '3140', '3530',
       '3010', '3760', '3012', '3193', '3130', '3630', '3559', '3180',
       '4006', '4007', '4008', '4618', '4740', '4630', '4230', '5296',
       '6300', '6011', '6498', '6800', '6260', '7210', '7570', '7590',
       '7184', '7630', '7760', '7620', '7703', '7500', '7007', '7011',
       '7611', '7460', '7650', '7510', '7009', '7120', '8911', '8915',
       '8918', '8210', '8013', '8017', '8024', '8028', '8038', '8040',
       '8880', '8349', '8140', '8370', '8506', '8420', '8440', '8211',
       '8860', '8290', '8758', '8940', '8027', '8031', '8110', '8232',
       '8304', '8460', '8840', '8292', '8400', '8902', '8907', '8430',
       '8520', '8760', '8320', '8303', '8302', '8750', '8100', '8160',
      

Not all of these have 5 digits. Are the 4 digit postcodes supposed to have a zero in front?

In [None]:
# View all wrong amount digit postcodes.
four_digit_postcodes = sales[sales['postcode'].apply(lambda x: isinstance(x, str) and len(x) == Y)]

unique_postcode_province = four_digit_postcodes[['postcode', 'province']].drop_duplicates()

unique_postcode_province

Unnamed: 0,postcode,province
0,1139,Araba-Álava
218,1206,Araba-Álava
347,1213,Araba-Álava
573,1400,Araba-Álava
786,1196,Araba-Álava
...,...,...
41040,9290,Burgos
41372,9600,Burgos
183207,3610,Alicante
195456,8016,Barcelona


This is a lot of incorrect postcodes so I will get AI to check whether these just require a zero in front. To do this, I will create a dictionary list.

In [None]:
# Create a list of strings.
postcode_province_list = [f"{postcode}:{province}" for postcode, province in zip(unique_postcode_province['postcode'], unique_postcode_province['province'])]

# View the list.
postcode_province_list

['1139:Araba-Álava',
 '1206:Araba-Álava',
 '1213:Araba-Álava',
 '1400:Araba-Álava',
 '1196:Araba-Álava',
 '1006:Araba-Álava',
 '1007:Araba-Álava',
 '1013:Araba-Álava',
 '2005:Albacete',
 '2006:Albacete',
 '2640:Albacete',
 '2691:Albacete',
 '2110:Albacete',
 '2630:Albacete',
 '3804:Alicante',
 '3002:Alicante',
 '3009:Alicante',
 '3580:Alicante',
 '3750:Alicante',
 '3590:Alicante',
 '3560:Alicante',
 '3820:Alicante',
 '3330:Alicante',
 '3509:Alicante',
 '3690:Alicante',
 '3730:Alicante',
 '3700:Alicante',
 '3339:Alicante',
 '3200:Alicante',
 '3290:Alicante',
 '3140:Alicante',
 '3530:Alicante',
 '3010:Alicante',
 '3760:Alicante',
 '3012:Alicante',
 '3193:Alicante',
 '3130:Alicante',
 '3630:Alicante',
 '3559:Alicante',
 '3180:Alicante',
 '4006:Almería',
 '4007:Almería',
 '4008:Almería',
 '4618:Almería',
 '4740:Almería',
 '4630:Almería',
 '4230:Almería',
 '5296:Ávila',
 '6300:Badajoz',
 '6011:Badajoz',
 '6498:Badajoz',
 '6800:Badajoz',
 '6260:Badajoz',
 '7210:Balearic Islands',
 '7570:Bale

I cross checked this list with the province identifier. I can confirm that a zero needs to be added to all of these postcodes at the start.

In [None]:
# Add a leading zero to all Y-digit postcodes.

# Create function.
def add_leading_zero(postcode):
    return postcode if len(postcode) == X else '0' + postcode

# Apply the function to the postcode column.
sales['postcode'] = sales['postcode'].apply(add_leading_zero)

# View all unique postcodes.
sales['postcode'].unique()

array(['01139', '01206', '01213', '01400', '01196', '01006', '01007',
       '01013', '02005', '02006', '02640', '02691', '02110', '02630',
       '03804', '03002', '03009', '03580', '03750', '03590', '03560',
       '03820', '03330', '03509', '03690', '03730', '03700', '03339',
       '03200', '03290', '03140', '03530', '03010', '03760', '03012',
       '03193', '03130', '03630', '03559', '03180', '04006', '04007',
       '04008', '04618', '04740', '04630', '04230', '05296', '06300',
       '06011', '06498', '06800', '06260', '07210', '07570', '07590',
       '07184', '07630', '07760', '07620', '07703', '07500', '07007',
       '07011', '07611', '07460', '07650', '07510', '07009', '07120',
       '08911', '08915', '08918', '08210', '08013', '08017', '08024',
       '08028', '08038', '08040', '08880', '08349', '08140', '08370',
       '08506', '08420', '08440', '08211', '08860', '08290', '08758',
       '08940', '08027', '08031', '08110', '08232', '08304', '08460',
       '08840', '082

In [None]:
# Check that each petrol station only has one unique petrol station code assigned to it.

# Group by station name and check the number of unique station codes
station_code_check = sales.groupby('petrol_station')['petrol_station_code'].nunique()

# Find stations that have more than one unique code
stations_with_multiple_codes = station_code_check[station_code_check > 1]

# View how many petrol stations have more than one unique code.
stations_with_multiple_codes

Series([], Name: petrol_station_code, dtype: int64)

No petrol stations have multiple codes.

#### Check for duplicates.

In [None]:
# Check for duplicates.
sales.duplicated().sum()

0

In [None]:
# View sales.
sales

Unnamed: 0,province,town/city,postcode,company_code,petrol_station_code,petrol_station,date,sold_units,day,month,year,month_text,day_of_week,season
0,Araba-Álava,Altube,01139,33420,1004,Altube Dcho,2021-01-06,1,6,1,2021,Jan,Wednesday,Winter
1,Araba-Álava,Altube,01139,33420,1004,Altube Dcho,2021-02-05,1,5,2,2021,Feb,Friday,Winter
2,Araba-Álava,Altube,01139,33420,1004,Altube Dcho,2021-02-15,2,15,2,2021,Feb,Monday,Winter
3,Araba-Álava,Altube,01139,33420,1004,Altube Dcho,2021-02-17,1,17,2,2021,Feb,Wednesday,Winter
4,Araba-Álava,Altube,01139,33420,1004,Altube Dcho,2021-02-27,1,27,2,2021,Feb,Saturday,Winter
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
330979,Zaragoza,Zaragoza,50021,96995,1790,Puerto Venecia,2022-11-30,1,30,11,2022,Nov,Wednesday,Autumn
330980,Zaragoza,Zaragoza,50021,96995,1790,Puerto Venecia,2022-12-02,2,2,12,2022,Dec,Friday,Winter
330981,Zaragoza,Zaragoza,50021,96995,1790,Puerto Venecia,2022-12-03,3,3,12,2022,Dec,Saturday,Winter
330982,Zaragoza,Zaragoza,50021,96995,1790,Puerto Venecia,2022-12-06,3,6,12,2022,Dec,Tuesday,Winter


#### Download sales as a csv.

In [None]:
# Download as a CSV.
sales.to_csv('sales.csv', index=False)

### Create a filtered sales dataframe for within the 65km radius of metropolitan area

I added leading zeros to the few postcodes that were Y digits long.

I took a sample of postcodes to check whether these were in the 65km radius of the metropolitan area. ***There is a possibility that some postcodes or parts of postcodes are slightly outside of the 65km radius.***

In [None]:
# Create a list of postcodes to keep.
postcodes_to_keep = ['removed to ensure anonymity of employer']

# Create the filtered_sales dataframe.
filtered_sales = sales[sales['postcode'].isin(postcodes_to_keep)]

# View filtered sales.
filtered_sales

Unnamed: 0,province,town/city,postcode,company_code,petrol_station_code,petrol_station,date,sold_units,day,month,year,month_text,day_of_week,season
65714,Guadalajara,Alovera,19208,5561,459,Alovera,2021-01-03,1,3,1,2021,Jan,Sunday,Winter
65715,Guadalajara,Alovera,19208,5561,459,Alovera,2021-01-27,2,27,1,2021,Jan,Wednesday,Winter
65716,Guadalajara,Alovera,19208,5561,459,Alovera,2021-02-14,1,14,2,2021,Feb,Sunday,Winter
65717,Guadalajara,Alovera,19208,5561,459,Alovera,2021-02-27,1,27,2,2021,Feb,Saturday,Winter
65718,Guadalajara,Alovera,19208,5561,459,Alovera,2021-03-06,2,6,3,2021,Mar,Saturday,Spring
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
311663,Toledo,Yuncos,45210,96349,1056,Yuncos,2022-11-11,1,11,11,2022,Nov,Friday,Autumn
311664,Toledo,Yuncos,45210,96349,1056,Yuncos,2022-11-13,1,13,11,2022,Nov,Sunday,Autumn
311665,Toledo,Yuncos,45210,96349,1056,Yuncos,2022-11-20,1,20,11,2022,Nov,Sunday,Autumn
311666,Toledo,Yuncos,45210,96349,1056,Yuncos,2022-11-27,2,27,11,2022,Nov,Sunday,Autumn


In [None]:
# Check the provinces that are included. If there are provinces included far from Madrid, it hasn't been filtered properly.
filtered_sales['province'].unique()

array(['Guadalajara', 'Madrid', 'Toledo'], dtype=object)

![Screenshot 2025-04-26 154719.png](attachment:f072d902-2da9-4e0a-bd13-e47de69f27aa.png)

Yes, these provinces make sense.

In [None]:
# Download as a CSV.
filtered_sales.to_csv('filtered_sales.csv', index=False)

## Temperature data

#### Load files

In [None]:
# Load the CSV files.
temperatures_2021 = pd.read_csv("temperature-madrid-2021.csv")
temperatures_2022 = pd.read_csv("temperature-madrid-2022.csv")

#### Vertically concatenating the two data files.

In [None]:
# Concatenate the two data sets.
temperatures_2021['year'] = 2021
temperatures_2022['year'] = 2022

temperatures = pd.concat([temperatures_2021, temperatures_2022], ignore_index=True)

#### Validating the data.

In [None]:
# Validate the data.
validate_data(temperatures)

Data types:
date     object
tavg    float64
tmin    float64
tmax    float64
prcp    float64
snow    float64
wdir    float64
wspd    float64
wpgt    float64
pres    float64
tsun    float64
year      int64
dtype: object
Data shape:
(713, 12)
Unique values per column:
date    713
tavg    264
tmin    245
tmax    283
prcp     91
snow      0
wdir    270
wspd    159
wpgt      0
pres    235
tsun      0
year      2
dtype: int64

Number of duplicate rows:
0

Summary statistics:
             tavg        tmin        tmax        prcp  snow        wdir  \
count  712.000000  713.000000  713.000000  713.000000   0.0  710.000000   
mean    16.084410   11.409397   21.484572    1.486816   NaN  155.830986   
std      7.826257    6.793839    9.007184    5.350041   NaN  109.068642   
min     -3.200000   -7.400000    0.300000    0.000000   NaN    1.000000   
25%      9.800000    5.500000   13.900000    0.000000   NaN   47.000000   
50%     14.600000   10.700000   19.900000    0.000000   NaN  143.000000   
75

(date     object
 tavg    float64
 tmin    float64
 tmax    float64
 prcp    float64
 snow    float64
 wdir    float64
 wspd    float64
 wpgt    float64
 pres    float64
 tsun    float64
 year      int64
 dtype: object,
 (713, 12),
 date    713
 tavg    264
 tmin    245
 tmax    283
 prcp     91
 snow      0
 wdir    270
 wspd    159
 wpgt      0
 pres    235
 tsun      0
 year      2
 dtype: int64,
 0,
              tavg        tmin        tmax        prcp  snow        wdir  \
 count  712.000000  713.000000  713.000000  713.000000   0.0  710.000000   
 mean    16.084410   11.409397   21.484572    1.486816   NaN  155.830986   
 std      7.826257    6.793839    9.007184    5.350041   NaN  109.068642   
 min     -3.200000   -7.400000    0.300000    0.000000   NaN    1.000000   
 25%      9.800000    5.500000   13.900000    0.000000   NaN   47.000000   
 50%     14.600000   10.700000   19.900000    0.000000   NaN  143.000000   
 75%     22.325000   16.500000   28.800000    0.000000   NaN 

#### Data cleaning

In [None]:
# Rename columns for easier processing
temperatures.rename(columns={'tavg': 'avg_temp', 'tmin': 'min_temp',
                          'tmax': 'max_temp','prcp': 'precip',
                          'wdir': 'wind_dir','wspd': 'wind_speed',
                          'wpgt': 'wind_peak','pres': 'avg_pressure',
                          'tsun': 'sun_minutes'}, inplace=True)
temperatures.columns

Index(['date', 'avg_temp', 'min_temp', 'max_temp', 'precip', 'snow',
       'wind_dir', 'wind_speed', 'wind_peak', 'avg_pressure', 'sun_minutes',
       'year'],
      dtype='object')

In [None]:
# Checking for missing values (nulls).
find_nulls(temperatures)

date              0
avg_temp          1
min_temp          0
max_temp          0
precip            0
snow            713
wind_dir          3
wind_speed        3
wind_peak       713
avg_pressure      3
sun_minutes     713
year              0
dtype: int64

In [None]:
# Remove columns snow, wind_peak & sun_minutes as they do not contain data (713 out of 713 null values)
temperatures.drop(columns=['snow', 'wind_peak', 'sun_minutes'], inplace=True)

In [None]:
# Display rows with null values in avg_temp
display_nulls(temperatures, 'avg_temp')

Unnamed: 0,date,avg_temp,min_temp,max_temp,precip,wind_dir,wind_speed,avg_pressure,year
492,2022-05-26,,10.5,26.7,0.0,,,,2022


In [None]:
# Display rows with null values in wind_dir
display_nulls(temperatures, 'wind_dir')

Unnamed: 0,date,avg_temp,min_temp,max_temp,precip,wind_dir,wind_speed,avg_pressure,year
0,2021-01-01,3.5,1.6,6.5,0.0,,,,2021
492,2022-05-26,,10.5,26.7,0.0,,,,2022
633,2022-03-27,11.4,8.6,13.8,0.0,,,,2022


In [None]:
# Display rows with null values in wind_speed
display_nulls(temperatures, 'wind_speed')

Unnamed: 0,date,avg_temp,min_temp,max_temp,precip,wind_dir,wind_speed,avg_pressure,year
0,2021-01-01,3.5,1.6,6.5,0.0,,,,2021
492,2022-05-26,,10.5,26.7,0.0,,,,2022
633,2022-03-27,11.4,8.6,13.8,0.0,,,,2022


In [None]:
# Display rows with null values in avg_pressure
display_nulls(temperatures, 'avg_pressure')

Unnamed: 0,date,avg_temp,min_temp,max_temp,precip,wind_dir,wind_speed,avg_pressure,year
0,2021-01-01,3.5,1.6,6.5,0.0,,,,2021
492,2022-05-26,,10.5,26.7,0.0,,,,2022
633,2022-03-27,11.4,8.6,13.8,0.0,,,,2022


***Handling nulls***

Null values will not be removed as the columns still contain valuable data, they will not be changed to 00 or avg values to keep the
data set to avoid misrepresentation.

In [None]:
# Check if all dates are entered YYYY-MM-DD or if there are inconsistencies.
length_ok = temperatures['date'].astype(str).str.len() == 10
hyphens_ok = temperatures['date'].astype(str).str.count('-') == 2
format_ok = length_ok & hyphens_ok
invalid_dates = temperatures[~format_ok]

if invalid_dates.empty:
    print("All dates are correctly formatted as YYYY-MM-DD.")
else:
    print("The following rows have incorrectly formatted dates:")
    print(invalid_dates)

All dates are correctly formatted as YYYY-MM-DD.


In [None]:
# Check if there are any missing dates
temperatures['date'] = pd.to_datetime(temperatures['date'])
full_dates = pd.date_range(start=temperatures['date'].min(), end=temperatures['date'].max())
missing_dates = full_dates.difference(temperatures['date'])

if missing_dates.empty:
    print("There are no missing dates in the dataset.")
else:
    print(f"The following dates are missing: {missing_dates.tolist()}")

There are no missing dates in the dataset.


In [None]:
# Convert 'date' to datetime without changing format
temperatures['date'] = pd.to_datetime(temperatures['date'], format='%Y-%m-%d')

In [None]:
# Create separate date, month, and year columns & display unique values.
temperatures = split_date(temperatures, 'date')

Day: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 30 31]
Month: [ 1  2  3  4  5  6  7  8  9 10 11 12]
Year: [2021 2022]


In [None]:
# Create a "day of the week" column & display unique values
temperatures = add_day_of_week_column(temperatures, 'date')

['Friday' 'Saturday' 'Sunday' 'Monday' 'Tuesday' 'Wednesday' 'Thursday']


In [None]:
# Create a "month_text" column with the month values as text abbreviations & display unique values.
temperatures = add_month_text_column(temperatures, 'date')

['Jan' 'Feb' 'Mar' 'Apr' 'May' 'Jun' 'Jul' 'Aug' 'Sep' 'Oct' 'Nov' 'Dec']


In [None]:
# Create a "season" column & display unique values.
temperatures = add_season_column(temperatures, 'month_text')

['Winter' 'Spring' 'Summer' 'Autumn']


In [None]:
temperatures.head()

Unnamed: 0,date,avg_temp,min_temp,max_temp,precip,wind_dir,wind_speed,avg_pressure,year,day,month,day_of_week,month_text,season
0,2021-01-01,3.5,1.6,6.5,0.0,,,,2021,1,1,Friday,Jan,Winter
1,2021-01-02,1.8,-0.9,5.4,0.0,294.0,12.7,1015.5,2021,2,1,Saturday,Jan,Winter
2,2021-01-03,2.2,-1.8,7.8,0.0,267.0,11.4,1017.0,2021,3,1,Sunday,Jan,Winter
3,2021-01-04,2.6,0.0,5.4,0.0,268.0,11.2,1013.6,2021,4,1,Monday,Jan,Winter
4,2021-01-05,0.9,-1.7,4.7,0.0,320.0,5.2,1013.8,2021,5,1,Tuesday,Jan,Winter


In [None]:
# Reorder the columns for simpler overview
ordered_columns = ['date', 'day', 'day_of_week', 'month', 'month_text', 'season', 'year',
                   'avg_temp', 'min_temp', 'max_temp', 'precip', 'wind_dir', 'wind_speed', 'avg_pressure']

temperatures = temperatures[ordered_columns]

temperatures.head()

Unnamed: 0,date,day,day_of_week,month,month_text,season,year,avg_temp,min_temp,max_temp,precip,wind_dir,wind_speed,avg_pressure
0,2021-01-01,1,Friday,1,Jan,Winter,2021,3.5,1.6,6.5,0.0,,,
1,2021-01-02,2,Saturday,1,Jan,Winter,2021,1.8,-0.9,5.4,0.0,294.0,12.7,1015.5
2,2021-01-03,3,Sunday,1,Jan,Winter,2021,2.2,-1.8,7.8,0.0,267.0,11.4,1017.0
3,2021-01-04,4,Monday,1,Jan,Winter,2021,2.6,0.0,5.4,0.0,268.0,11.2,1013.6
4,2021-01-05,5,Tuesday,1,Jan,Winter,2021,0.9,-1.7,4.7,0.0,320.0,5.2,1013.8


In [None]:
# Function to identify outliers using IQR method and print Min/Max + Outliers
def identify_outliers_iqr(df, column):
    min_value = df[column].min()
    max_value = df[column].max()

    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1

    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    out_of_range = (df[column] < lower_bound) | (df[column] > upper_bound)
    outliers = df[out_of_range]

    print(f"Min value for {column}: {min_value}")
    print(f"Max value for {column}: {max_value}")

    if not outliers.empty:
        print("\nOutliers for", column)
        display(outliers)
    else:
        print("\nNo outliers found for", column)

    return outliers

# Get min, max and outliers for the 'min_temp' column.
outliers_min_temp = identify_outliers_iqr(temperatures, 'min_temp')

Min value for min_temp: -7.4
Max value for min_temp: 26.2

No outliers found for min_temp


In [None]:
# Get min, max and outliers for the 'max_temp' column
outliers_max_temp = identify_outliers_iqr(temperatures, 'max_temp')

Min value for max_temp: 0.3
Max value for max_temp: 40.7

No outliers found for max_temp


In [None]:
# Get min, max and outliers for the 'avg_temp' column
outliers_avg_temp = identify_outliers_iqr(temperatures, 'avg_temp')

Min value for avg_temp: -3.2
Max value for avg_temp: 33.7

No outliers found for avg_temp


In [None]:
# Get min, max and outliers for the 'precip' column
outliers_precip = identify_outliers_iqr(temperatures, 'precip')

Min value for precip: 0.0
Max value for precip: 67.7

Outliers for precip


Unnamed: 0,date,day,day_of_week,month,month_text,season,year,avg_temp,min_temp,max_temp,precip,wind_dir,wind_speed,avg_pressure
6,2021-01-07,7,Thursday,1,Jan,Winter,2021,0.6,-0.6,1.4,2.3,34.0,11.2,1014.2
7,2021-01-08,8,Friday,1,Jan,Winter,2021,-0.7,-1.0,0.3,32.9,36.0,18.6,1012.2
8,2021-01-09,9,Saturday,1,Jan,Winter,2021,-0.7,-1.6,0.5,17.7,51.0,18.3,1006.6
19,2021-01-20,20,Wednesday,1,Jan,Winter,2021,3.3,0.9,6.5,3.1,182.0,13.1,1013.6
20,2021-01-21,21,Thursday,1,Jan,Winter,2021,7.7,5.4,10.0,7.9,228.0,19.9,1014.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
697,2022-12-05,5,Monday,12,Dec,Winter,2022,6.4,2.3,10.1,11.7,48.0,9.6,1014.5
698,2022-02-26,26,Saturday,2,Feb,Winter,2022,8.4,6.7,9.6,3.4,76.0,10.2,1022.8
702,2022-01-05,5,Wednesday,1,Jan,Winter,2022,5.9,4.3,8.9,5.0,290.0,14.0,1019.5
707,2022-12-11,11,Sunday,12,Dec,Winter,2022,7.4,6.3,8.5,15.1,280.0,9.5,1008.0


In [None]:
# Get min, max and outliers for the 'wind_dir' column
outliers_wind_dir = identify_outliers_iqr(temperatures, 'wind_dir')

Min value for wind_dir: 1.0
Max value for wind_dir: 359.0

No outliers found for wind_dir


In [None]:
# Get min, max and outliers for the 'wind_speed' column
outliers_wind_speed = identify_outliers_iqr(temperatures, 'wind_speed')

Min value for wind_speed: 3.5
Max value for wind_speed: 23.9

Outliers for wind_speed


Unnamed: 0,date,day,day_of_week,month,month_text,season,year,avg_temp,min_temp,max_temp,precip,wind_dir,wind_speed,avg_pressure
20,2021-01-21,21,Thursday,1,Jan,Winter,2021,7.7,5.4,10.0,7.9,228.0,19.9,1014.0
21,2021-01-22,22,Friday,1,Jan,Winter,2021,7.8,4.9,10.2,0.0,244.0,23.9,1011.0
22,2021-01-23,23,Saturday,1,Jan,Winter,2021,6.4,3.6,10.6,1.7,237.0,22.2,1014.7
24,2021-01-25,25,Monday,1,Jan,Winter,2021,10.4,7.5,13.0,0.0,242.0,19.8,1014.0
29,2021-01-30,30,Saturday,1,Jan,Winter,2021,10.1,8.2,12.6,3.1,250.0,21.5,1014.1
30,2021-01-31,31,Sunday,1,Jan,Winter,2021,10.3,8.1,12.8,0.0,238.0,21.6,1014.2
38,2021-02-08,8,Monday,2,Feb,Winter,2021,7.3,5.4,10.6,2.2,239.0,20.3,1005.1
39,2021-02-09,9,Tuesday,2,Feb,Winter,2021,7.1,5.4,9.1,18.2,224.0,23.8,1006.1
75,2021-03-17,17,Wednesday,3,Mar,Spring,2021,12.4,8.0,18.8,0.0,46.0,20.7,1025.1
78,2021-03-20,20,Saturday,3,Mar,Spring,2021,6.7,2.6,11.7,0.0,34.0,19.8,1019.1


In [None]:
# Get min, max and outliers for the 'avg_pressure' column
outliers_avg_pressure = identify_outliers_iqr(temperatures, 'avg_pressure')

Min value for avg_pressure: 1001.0
Max value for avg_pressure: 1034.1

Outliers for avg_pressure


Unnamed: 0,date,day,day_of_week,month,month_text,season,year,avg_temp,min_temp,max_temp,precip,wind_dir,wind_speed,avg_pressure
608,2022-01-29,29,Saturday,1,Jan,Winter,2022,8.8,2.3,15.5,0.0,34.0,9.2,1033.8
664,2022-12-09,9,Friday,12,Dec,Winter,2022,9.2,7.1,12.2,6.3,205.0,10.6,1001.0
666,2022-04-23,23,Saturday,4,Apr,Spring,2022,8.1,5.1,12.1,1.9,241.0,22.9,1002.7
691,2022-04-22,22,Friday,4,Apr,Spring,2022,8.7,6.0,10.9,22.8,205.0,16.6,1001.8
695,2022-01-17,17,Monday,1,Jan,Winter,2022,4.8,0.4,10.3,0.0,43.0,6.2,1033.4
703,2022-01-13,13,Thursday,1,Jan,Winter,2022,5.6,2.4,8.9,0.0,60.0,8.1,1034.1
708,2022-01-14,14,Friday,1,Jan,Winter,2022,4.4,1.0,8.2,0.0,43.0,7.0,1033.9


***Handling Outliers***

The temperature values in the dataset appear realistic for the metropolitan area — plausible given the city's continental climate. Precipitation outliers above x mm/day, while relatively rare, are consistent with occasional events, particularly during autumn storms. Wind speeds peaking at x km/h are on the higher end but not extreme, likely reflecting stormy or exposed conditions. Atmospheric pressure values ranging from x to x hPa are also within normal weather variation for Madrid. Overall, the outlier values, while relatively infrequent, fall within realistic bounds for the area's climate. Therefore, the outliers will be kept in the dataset, as they represent valid and plausible meteorological extremes.

#### Download temperature CSV.

In [None]:
# Download as a CSV.
temperatures.to_csv('temperatures.csv', index=False)

## Merge filtered_sales and temperatures.

In [None]:
# Merge temperatures and filtered_sales.
filtered_sales_temp = filtered_sales.merge(temperatures, on='date', how='left')

# View filtered_sales_temp.
filtered_sales_temp

Unnamed: 0,province,town/city,postcode,company_code,petrol_station_code,petrol_station,date,sold_units,day_x,month_x,...,month_text_y,season_y,year_y,avg_temp,min_temp,max_temp,precip,wind_dir,wind_speed,avg_pressure
0,Guadalajara,Alovera,19208,5561,459,Alovera,2021-01-03,1,3,1,...,Jan,Winter,2021,2.2,-1.8,7.8,0.0,267.0,11.4,1017.0
1,Guadalajara,Alovera,19208,5561,459,Alovera,2021-01-27,2,27,1,...,Jan,Winter,2021,13.0,9.7,17.8,0.0,275.0,9.6,1025.2
2,Guadalajara,Alovera,19208,5561,459,Alovera,2021-02-14,1,14,2,...,Feb,Winter,2021,9.8,5.0,14.6,0.0,67.0,5.7,1029.6
3,Guadalajara,Alovera,19208,5561,459,Alovera,2021-02-27,1,27,2,...,Feb,Winter,2021,12.4,7.5,18.2,0.0,46.0,9.7,1025.1
4,Guadalajara,Alovera,19208,5561,459,Alovera,2021-03-06,2,6,3,...,Mar,Spring,2021,11.3,9.1,14.8,0.0,73.0,8.6,1019.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38358,Toledo,Yuncos,45210,96349,1056,Yuncos,2022-11-11,1,11,11,...,Nov,Autumn,2022,10.0,5.3,16.6,3.2,48.0,9.8,1025.3
38359,Toledo,Yuncos,45210,96349,1056,Yuncos,2022-11-13,1,13,11,...,Nov,Autumn,2022,11.7,7.8,17.6,0.0,41.0,6.1,1021.2
38360,Toledo,Yuncos,45210,96349,1056,Yuncos,2022-11-20,1,20,11,...,Nov,Autumn,2022,9.9,6.9,14.5,0.4,255.0,14.5,1023.6
38361,Toledo,Yuncos,45210,96349,1056,Yuncos,2022-11-27,2,27,11,...,Nov,Autumn,2022,6.9,1.9,13.7,0.8,3.0,6.1,1025.0


In [None]:
# View filtered_sales_temp columns.
filtered_sales_temp.columns

Index(['province', 'town/city', 'postcode', 'company_code',
       'petrol_station_code', 'petrol_station', 'date', 'sold_units', 'day_x',
       'month_x', 'year_x', 'month_text_x', 'day_of_week_x', 'season_x',
       'day_y', 'day_of_week_y', 'month_y', 'month_text_y', 'season_y',
       'year_y', 'avg_temp', 'min_temp', 'max_temp', 'precip', 'wind_dir',
       'wind_speed', 'avg_pressure'],
      dtype='object')

In [None]:
# Drop all duplicate columns.
filtered_sales_temp = filtered_sales_temp.drop(columns=['day_y', 'day_of_week_y', 'month_y', 'month_text_y', 'season_y', 'year_y'])

# Rename columns.
filtered_sales_temp = filtered_sales_temp.rename(columns={'day_x': 'day','month_x': 'month','month_text_x': 'month_text', 'year_x':'year', 'day_of_week_x':'day_of_week', 'season_x':'season'})

# View filtered_sales_temp columns.
filtered_sales_temp.columns

Index(['province', 'town/city', 'postcode', 'company_code',
       'petrol_station_code', 'petrol_station', 'date', 'sold_units', 'day',
       'month', 'year', 'month_text', 'day_of_week', 'season', 'avg_temp',
       'min_temp', 'max_temp', 'precip', 'wind_dir', 'wind_speed',
       'avg_pressure'],
      dtype='object')

#### Download filtered_sales_temp.

In [None]:
# Download as a CSV.
filtered_sales_temp.to_csv('filtered_sales_temp.csv', index=False)