In [21]:
import pandas as pd
import os

# Read in the dataset
trans_df = pd.read_csv(os.path.join("..", "data", "Final with Coords, Fuel Type and Prices.csv"))
vehicle_df = pd.read_csv(os.path.join("..", "data", "Final KMPL dataset.csv"))

In [22]:
trans_df.shape

(175933, 21)

In [23]:
vehicle_df.columns

Index(['Reg', 'Overlap_Start', 'Overlap_End', 'Total_Km', 'Total_Litres',
       'KMPL', 'REG_NUM', 'VEHICLE MAKE', 'MODEL DERIVATIVE', 'DEPARTMENT',
       'District', 'Site', 'Site Lat', 'Site Long', 'Fuel Type',
       'RATE CARD CATEGORY'],
      dtype='object')

In [24]:
trans_df.columns

Index(['Transaction Date', 'REG_NUM', 'Merchant Name', 'No. of Litres',
       'Transaction Amount', 'VEHICLE MAKE', 'MODEL DERIVATIVE', 'DEPARTMENT',
       'RATE CARD CATEGORY', 'Site', 'District', 'Site Lat', 'Site Long',
       'Merchant Lat', 'Merchant Long', 'Fuel Type',
       'Estimated Price Per Litre', 'Coastal Petrol', 'Inland Petrol',
       'Coastal Diesel', 'Inland Diesel'],
      dtype='object')

In [52]:
# Remove the word "CATEGORY" from the 'RATE CARD CATEGORY' in vehicle_df
vehicle_df['RATE CARD CATEGORY'] = vehicle_df['RATE CARD CATEGORY'].str.replace('CATEGORY', '')

# Fleet Composition

In [63]:
import matplotlib.pyplot as plt
import pandas as pd

def shorten_names(names, max_length=20):
    shortened_names = []
    for name in names:
        if len(name) > max_length:
            shortened_name = name[:max_length-3] + '...'
        else:
            shortened_name = name
        shortened_names.append(shortened_name)
    return shortened_names


def create_bar_plot(data, title, filename, threshold, max_length=50):
    # Filter the data and calculate the sum of counts for the remaining categories
    filtered_data = data[data >= threshold]
    others_count = data[data < threshold].sum()
    filtered_data['Others'] = others_count

    # Shorten the names for each category
    shortened_names = shorten_names(filtered_data.index, max_length=max_length)

    # Create a single plot
    fig, ax = plt.subplots(figsize=(8, 8))

    # Set the font size for the title and labels
    # title_font_size = 14
    label_font_size = 12
    y_label_font_size = 14

    # Plot the data
    ax.bar(shortened_names, filtered_data)
    ax.set_xticklabels(shortened_names, rotation=45, ha='right', fontsize=label_font_size)
    ax.set_yticklabels(ax.get_yticks(), fontsize=label_font_size)
    ax.set_ylabel('Count', fontsize=y_label_font_size)
    # ax.set_title(title, fontsize=title_font_size)

    # Adjust the spacing
    plt.tight_layout()

    # Save the plot as a PDF file
    plt.savefig(filename, format='pdf', bbox_inches='tight')

    # Close the plot
    plt.close(fig)

In [64]:
# Group the DataFrame by 'MODEL DERIVATIVE' and count the occurrences
fleet_data_derivative = vehicle_df['MODEL DERIVATIVE'].value_counts()

# Group the DataFrame by 'VEHICLE MAKE' and count the occurrences
fleet_data_make = vehicle_df['VEHICLE MAKE'].value_counts()

# Group the DataFrame by 'DEPARTMENT' and count the occurrences
fleet_data_department = vehicle_df['DEPARTMENT'].value_counts()

# Group the DataFrame by 'RATE CARD CATEGORY' and count the occurrences
fleet_data_rate_card = vehicle_df['RATE CARD CATEGORY'].value_counts()

# Set thresholds for the minimum count to display individually
threshold_model = 20
threshold_make = 10
threshold_department = 20
threshold_rate_card = 25

# Create and save individual plots
create_bar_plot(fleet_data_derivative, 
                'Vehicle Fleet Composition by Model Derivative', 
                '../plots/eda/fleet_composition_derivative.pdf', 
                threshold_model,
                max_length=30)

create_bar_plot(fleet_data_make, 
                'Vehicle Fleet Composition by Make', 
                '../plots/eda/fleet_composition_make.pdf', 
                threshold_make,
                max_length=30)

create_bar_plot(fleet_data_department, 
                'Vehicle Fleet Composition by Department', 
                '../plots/eda/fleet_composition_department.pdf', 
                threshold_department,
                max_length=30)

create_bar_plot(fleet_data_rate_card, 
                'Vehicle Fleet Composition by Rate Card Category', 
                '../plots/eda/fleet_composition_rate_card.pdf', 
                threshold_rate_card,
                max_length=30)

  ax.set_xticklabels(shortened_names, rotation=45, ha='right', fontsize=label_font_size)
  ax.set_yticklabels(ax.get_yticks(), fontsize=label_font_size)
  ax.set_xticklabels(shortened_names, rotation=45, ha='right', fontsize=label_font_size)
  ax.set_yticklabels(ax.get_yticks(), fontsize=label_font_size)
  ax.set_xticklabels(shortened_names, rotation=45, ha='right', fontsize=label_font_size)
  ax.set_yticklabels(ax.get_yticks(), fontsize=label_font_size)
  ax.set_xticklabels(shortened_names, rotation=45, ha='right', fontsize=label_font_size)
  ax.set_yticklabels(ax.get_yticks(), fontsize=label_font_size)


In [5]:
# Find all the unique registration numbers where the District is missing
missing_district_reg_make_model = trans_df.loc[trans_df['District'].isnull(), ['REG_NUM', 'Make and Model']].drop_duplicates() 

# Save the missing district registration numbers to a csv file
missing_district_reg_make_model.to_csv(os.path.join("..", "data", "Missing Registration Numbers.csv"), index=False)

In [170]:
# Convert 'Transaction Date' to datetime format if it's not already
trans_df['Transaction Date'] = pd.to_datetime(trans_df['Transaction Date'])

# Sort the DataFrame by 'REG_NUM' and 'Transaction Date'
trans_df = trans_df.sort_values(by=['REG_NUM', 'Transaction Date'])

# Calculate the difference in days between consecutive transactions for each vehicle
trans_df['Days Between Transactions'] = trans_df.groupby('REG_NUM')['Transaction Date'].diff().dt.days

# Show the updated DataFrame structure
trans_df.head()


Unnamed: 0,Transaction Date,REG_NUM,Merchant Name,Purchase Category,No. of Litres,Transaction Amount,Make,Model,Make and Model,Site,...,Rental,Merchant Lat,Merchant Long,Site Lat,Site Long,Fuel Type,Actual Fuel Price,Actual Fuel Price Inland,Estimated Price Per Litre,Days Between Transactions
981,2021-04-04,GGA001EC,TOTAL SAVOY MTHATHA,FUEL,44.89,713.0,FORD,RANGER,FORD RANGER,UMTATA,...,23731.68,-31.593486,28.766973,-31.59405,28.756873,DIESEL,13.5212,14.1242,15.88327,
1888,2021-04-06,GGA001EC,NTLALI MOTORS,FUEL,63.8,1029.14,FORD,RANGER,FORD RANGER,UMTATA,...,23731.68,-32.29684,26.419389,-31.59405,28.756873,DIESEL,13.5212,14.1242,16.130721,2.0
2675,2021-04-07,GGA001EC,NTLALI MOTORS,FUEL,53.9,901.66,FORD,RANGER,FORD RANGER,UMTATA,...,23731.68,-32.29684,26.419389,-31.59405,28.756873,DIESEL,14.1612,14.7762,16.728386,1.0
3521,2021-04-08,GGA001EC,ELLIOTDALE FILLING STATION,FUEL,48.12,781.5,FORD,RANGER,FORD RANGER,UMTATA,...,23731.68,-31.966873,28.681895,-31.59405,28.756873,DIESEL,14.1612,14.7762,16.240648,1.0
8968,2021-04-16,GGA001EC,ELLIOTDALE FILLING STATION,FUEL,54.46,884.56,FORD,RANGER,FORD RANGER,UMTATA,...,23731.68,-31.966873,28.681895,-31.59405,28.756873,DIESEL,14.1612,14.7762,16.24238,8.0


In [122]:
# Check how many null values there are in the 'Days Between Transactions' column
trans_df['Days Between Transactions'].isnull().sum()

4260

In [123]:
trans_df.shape

(197176, 22)

In [124]:
trans_df.head()

Unnamed: 0,Transaction Date,REG_NUM,Merchant Name,Purchase Category,No. of Litres,Transaction Amount,Make,Model,Make and Model,Site,...,Rental,Merchant Lat,Merchant Long,Site Lat,Site Long,Fuel Type,Actual Fuel Price,Actual Fuel Price Inland,Estimated Price Per Litre,Days Between Transactions
981,2021-04-04,GGA001EC,TOTAL SAVOY MTHATHA,FUEL,44.89,713.0,FORD,RANGER,FORD RANGER,UMTATA,...,23731.68,-31.593486,28.766973,-31.59405,28.756873,DIESEL,13.5212,14.1242,15.88327,
1888,2021-04-06,GGA001EC,NTLALI MOTORS,FUEL,63.8,1029.14,FORD,RANGER,FORD RANGER,UMTATA,...,23731.68,-32.29684,26.419389,-31.59405,28.756873,DIESEL,13.5212,14.1242,16.130721,2.0
2675,2021-04-07,GGA001EC,NTLALI MOTORS,FUEL,53.9,901.66,FORD,RANGER,FORD RANGER,UMTATA,...,23731.68,-32.29684,26.419389,-31.59405,28.756873,DIESEL,14.1612,14.7762,16.728386,1.0
3521,2021-04-08,GGA001EC,ELLIOTDALE FILLING STATION,FUEL,48.12,781.5,FORD,RANGER,FORD RANGER,UMTATA,...,23731.68,-31.966873,28.681895,-31.59405,28.756873,DIESEL,14.1612,14.7762,16.240648,1.0
8968,2021-04-16,GGA001EC,ELLIOTDALE FILLING STATION,FUEL,54.46,884.56,FORD,RANGER,FORD RANGER,UMTATA,...,23731.68,-31.966873,28.681895,-31.59405,28.756873,DIESEL,14.1612,14.7762,16.24238,8.0


In [171]:
# Convert the date column to a datetime object
trans_df['Transaction Date'] = pd.to_datetime(trans_df['Transaction Date'])

# Create a new column for the month name
trans_df['Month Name'] = trans_df['Transaction Date'].dt.month_name()

# Create a new column for the weekday name
trans_df['Weekday Name'] = trans_df['Transaction Date'].dt.day_name()

In [126]:
trans_df.columns

Index(['Transaction Date', 'REG_NUM', 'Merchant Name', 'Purchase Category',
       'No. of Litres', 'Transaction Amount', 'Make', 'Model',
       'Make and Model', 'Site', 'District', 'Category', 'Rental',
       'Merchant Lat', 'Merchant Long', 'Site Lat', 'Site Long', 'Fuel Type',
       'Actual Fuel Price', 'Actual Fuel Price Inland',
       'Estimated Price Per Litre', 'Days Between Transactions', 'Month Name',
       'Weekday Name'],
      dtype='object')

In [127]:
# Check how many missing values there are in the District column
trans_df['District'].isna().sum()

90772

In [128]:
# To avoid losing more data than necessary, setting missing values to unknown
trans_df['District'].fillna('Unknown', inplace=True)
trans_df['Site'].fillna('Unknown', inplace=True)
trans_df['Category'].fillna('Unknown', inplace=True)

# Optionally, verify the changes by checking for missing values again
print(trans_df[['District', 'Site', 'Category']].isnull().sum())

District    0
Site        0
Category    0
dtype: int64


In [129]:
# Save the dataset
trans_df.to_csv(os.path.join("..", "data", "Final transactions for analysis.csv"), index=False)

In [23]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Set the aesthetic style of the plots
sns.set_style("whitegrid")
plt.rcParams['axes.titlepad'] = 20 
plt.rcParams['figure.figsize'] = (8, 8)
plt.rc('axes', titlesize=16)  # Set the font size for plot titles
plt.rc('axes', labelsize=14)  # Set the font size for axis labels (x and y)
sns.set_palette('viridis')


## Univariate Exploration

In [172]:
# Read in the trans_df dataset
trans_df = pd.read_csv(os.path.join("..", "data", "Final transactions for analysis.csv"))

In [117]:
# List of columns to plot
columns_to_plot = ['Transaction Amount', 'No. of Litres', 'Days Between Transactions']

# Create histograms
for column in columns_to_plot:
    plt.figure(figsize=(8, 8), dpi=300)
    sns.histplot(data=trans_df, x=column, kde=True, bins=30, color='green')
    
    # plt.title(f'Distribution of {column}', fontsize=15)
    plt.xlabel(column, fontsize=15)
    plt.ylabel('Frequency', fontsize=15)
    
    # Save each plot as a PDF
    plt.savefig(f'../final_plots/uni_eda/{column}_histogram.pdf', bbox_inches='tight')
    plt.close()

In [31]:
# Plot histogram for "Estimated Price Per Litre"
plt.figure(dpi=300)
sns.histplot(data=trans_df, x='Estimated Price Per Litre', kde=True, bins=30, color='green')

# Calculate min and max of "Actual Fuel Price"
min_actual_price = trans_df['Actual Fuel Price'].min()
max_actual_price = trans_df['Actual Fuel Price'].max()

# Add vertical lines for min and max actual fuel price
plt.axvline(min_actual_price, color='blue', linestyle='--', label=f'Min Actual Fuel Price: {min_actual_price}')
plt.axvline(max_actual_price, color='red', linestyle='--', label=f'Max Actual Fuel Price: {max_actual_price}')

# plt.title('Distribution of Estimated Price Per Litre', fontsize=15)
plt.xlabel('Estimated Price Per Litre', fontsize=15)
plt.ylabel('Frequency', fontsize=15)

# Add legend to the plot
plt.legend()

# Save the plot as a PDF
plt.savefig('../final_plots/uni_eda/Estimated_Price_Per_Litre_histogram.pdf', bbox_inches='tight')
plt.close()

In [40]:
# List of categorical variables to plot
categorical_variables = ['District', 'Category', 'Fuel Type', 'Make and Model', 'Month Name', 'Weekday Name']

for variable in categorical_variables:
    plt.figure(figsize=(8, 8), dpi=300)
    
    if variable in ['Site', 'Make and Model']:
        # For variables with many categories, plot horizontally for better readability
        sns.countplot(y=variable, data=trans_df, order=trans_df[variable].value_counts().index)
        plt.xlabel(variable, fontsize=15)  # Set x-axis title font size
        plt.ylabel('Count', fontsize=15)
    else:
        # For variables with fewer categories, a vertical plot is sufficient
        sns.countplot(x=variable, data=trans_df, order=trans_df[variable].value_counts().index)
        plt.xlabel(variable, fontsize=15)  # Set x-axis title font size
        plt.ylabel('Count', fontsize=15)
    
    plt.xticks(rotation=45, ha='right')  # Rotate x-axis labels for readability if vertical plot
    plt.tight_layout()
    
    # Save each plot as a PDF
    plt.savefig(f'../final_plots/uni_eda/{variable}_countplot.pdf', bbox_inches='tight')
    plt.close()

In [69]:
# Handle 'Site' separately to limit to top 15 categories
plt.figure(figsize=(8, 8), dpi=300)
top_15_sites = trans_df['Site'].value_counts().index[:15]
sns.countplot(y='Site', data=trans_df, order=top_15_sites)
# plt.title('Count of Site (Top 15)', fontsize=15)
plt.xlabel('Count', fontsize=15)  # Set x-axis title font size
plt.ylabel('Site', fontsize=15)
plt.tight_layout()

# Save the 'Site' plot as a PDF in the specified folder
plt.savefig('../final_plots/uni_eda/Site_countplot_top_15.pdf', bbox_inches='tight')
plt.close()

In [70]:
plt.figure(figsize=(8, 8), dpi=300)
top_15_merchs = trans_df['Merchant Name'].value_counts().index[:15]
sns.countplot(y='Merchant Name', data=trans_df, order=top_15_merchs)
# plt.title('Count of Site (Top 15)', fontsize=15)
plt.xlabel('Count', fontsize=15)  # Set x-axis title font size
plt.ylabel('Merchant', fontsize=15)
plt.tight_layout()

# Save the 'Site' plot as a PDF in the specified folder
plt.savefig('../final_plots/uni_eda/Merchant_Name_countplot_top_15.pdf', bbox_inches='tight')
plt.close()

## Bivariate Exploration

In [162]:
trans_df = trans_df[trans_df['Transaction Amount'] <= 5000]

In [163]:
# Function to truncate category names
def truncate_category_names(categories, max_length=15):
    """Truncate category names to a maximum length, appending '...' if longer."""
    truncated = [(cat[:max_length] + '...') if len(cat) > max_length else cat for cat in categories]
    return truncated

def plot_bivariate_grid(df, numeric_var, cat_vars, plot_file_path):
    """
    Plots a 2x2 grid of box plots for a numeric variable against top 10 categories of 4 categorical variables.

    Parameters:
    - df: DataFrame containing the data.
    - numeric_var: The numeric variable for y-axis.
    - cat_vars: A list of 4 categorical variables for x-axis.
    - plot_file_path: Path to save the plot PDF file.
    """
    # Set the aesthetic style of the plots
    sns.set_style("whitegrid")
    plt.rc('axes', titlesize=16)  # Set the font size for plot titles
    plt.rc('axes', labelsize=15)  # Set the font size for axis labels (x and y)
    plt.rc('xtick', labelsize=12)  # Set the font size for x-axis tick labels
    plt.rc('ytick', labelsize=12)  # Set the font size for y-axis tick labels
    sns.set_palette('viridis')

    # Prepare the figure for the grid of plots
    fig, axs = plt.subplots(2, 2, figsize=(10, 10), dpi=150)
    plot_prefix = ['a)', 'b)', 'c)', 'd)']

    for i, variable in enumerate(cat_vars):
        # Find the top 10 categories for the current variable
        top_categories = df[variable].value_counts().nlargest(8).index
        truncated_categories = truncate_category_names(top_categories, max_length=10)
        category_mapping = dict(zip(top_categories, truncated_categories))
        filtered_df = df.loc[df[variable].isin(top_categories)].copy()
        filtered_df[variable] = filtered_df[variable].map(category_mapping)
        
        ax = axs[i // 2, i % 2]
        sns.boxplot(x=variable, y=numeric_var, data=filtered_df, ax=ax, order=truncated_categories)
        ax.set_title(f'{plot_prefix[i]} {variable} vs {numeric_var}', fontsize=16)
        ax.set_xlabel(variable, fontsize=15)
        ax.set_ylabel(numeric_var, fontsize=15)
        ax.tick_params(axis='x', labelrotation=45)

    plt.tight_layout()
    plt.savefig(plot_file_path, bbox_inches='tight')
    plt.close()

In [164]:
plot_bivariate_grid(trans_df, 'Transaction Amount', ['District', 'Model', 'Category', 'Month Name'], 
                    '../final_plots/mv_eda/TransAmount_vs_Variables_boxplots.pdf')

In [165]:
plot_bivariate_grid(trans_df, 'No. of Litres', ['District', 'Model', 'Category', 'Month Name'], 
                    '../final_plots/mv_eda/Litres_vs_Variables_boxplots.pdf')

In [166]:
plot_bivariate_grid(trans_df, 'Days Between Transactions', ['District', 'Model', 'Category', 'Month Name'], 
                    '../final_plots/mv_eda/DaysBetweenTransactions_vs_Variables_boxplots.pdf')

In [174]:
# Ensure the 'Transaction Date' is a datetime type
trans_df['Transaction Date'] = pd.to_datetime(trans_df['Transaction Date'])

# Set the aesthetic style of the plots
sns.set_style("whitegrid")
plt.rc('axes', titlesize=16)  # Set the font size for plot titles
plt.rc('axes', labelsize=15)  # Set the font size for axis labels (x and y)
plt.rc('xtick', labelsize=12)  # Set the font size for x-axis tick labels
plt.rc('ytick', labelsize=12)  # Set the font size for y-axis tick labels
sns.set_palette('viridis')

# Prepare the figure for the 1x2 grid of plots
fig, axs = plt.subplots(1, 2, figsize=(12, 7), dpi=150, sharey=True)

prefix = ['a)', 'b)']
fuel_types = ['Petrol', 'Diesel']

# Filter data by Fuel Type
for i, fuel_type in enumerate(['PETROL', 'DIESEL']):
    df_filtered = trans_df[trans_df['Fuel Type'] == fuel_type]

    # Group by 'Transaction Date' and calculate the mean for 'Estimated Price Per Litre'
    daily_avg_estimated = df_filtered.groupby('Transaction Date')['Estimated Price Per Litre'].mean()
    
    # Also, calculate the daily mean for 'Actual Fuel Price' and 'Actual Fuel Price Inland'
    daily_avg_actual = df_filtered.groupby('Transaction Date')['Actual Fuel Price'].mean()
    daily_avg_inland = df_filtered.groupby('Transaction Date')['Actual Fuel Price Inland'].mean()
    
    # Plotting
    axs[i].plot(daily_avg_estimated.index, daily_avg_estimated, label='Estimated Price Per Litre', linewidth=2)
    axs[i].plot(daily_avg_actual.index, daily_avg_actual, label='Actual Fuel Price', linestyle='--', linewidth=2)
    axs[i].plot(daily_avg_inland.index, daily_avg_inland, label='Actual Fuel Price Inland', linestyle='-.', linewidth=2)
    
    axs[i].set_title(f'{prefix[i]} {fuel_types[i]} Vehicles', fontsize=16)
    axs[i].set_xlabel('Transaction Date', fontsize=15)
    axs[i].set_ylabel('Price Per Litre (ZAR)', fontsize=15)
    # Update legend with larger font size
    axs[i].legend(fontsize=14)
    
    # Rotate x-axis labels for better visibility
    for label in axs[i].get_xticklabels():
        label.set_rotation(45)
        label.set_horizontalalignment('right')

plt.tight_layout()
plt.savefig('../final_plots/mv_eda/Fuel_Price_Comparison.pdf', bbox_inches='tight')
plt.close()

In [176]:
# Assuming trans_df is your DataFrame
trans_df['Transaction Date'] = pd.to_datetime(trans_df['Transaction Date'])

# Sort the DataFrame by 'Transaction Date'
trans_df = trans_df.sort_values(by='Transaction Date')

# Create a copy with filtered values and add 'Month-Year'
trans_df_filtered = trans_df[trans_df['Estimated Price Per Litre'] <= 25].copy()
trans_df_filtered['Month-Year'] = trans_df_filtered['Transaction Date'].dt.to_period('M').astype(str)  # Convert to string for plotting

# Set the aesthetic style of the plots
sns.set_style("whitegrid")
plt.rc('axes', titlesize=16)  # Set the font size for plot titles
plt.rc('axes', labelsize=15)  # Set the font size for axis labels (x and y)
plt.rc('xtick', labelsize=12)  # Set the font size for x-axis tick labels
plt.rc('ytick', labelsize=12)  # Set the font size for y-axis tick labels
sns.set_palette('viridis')

# Prepare the figure for the 1x2 grid of plots
fig, axs = plt.subplots(1, 2, figsize=(12, 7), dpi=150, sharey=True)

prefix = ['a)', 'b)']
fuel_types = ['Petrol', 'Diesel']

for i, fuel_type in enumerate(fuel_types):
    df_filtered = trans_df_filtered[trans_df_filtered['Fuel Type'].str.upper() == fuel_type.upper()]

    # Group by 'Month-Year' for boxplot
    sns.boxplot(x='Month-Year', y='Estimated Price Per Litre', data=df_filtered, ax=axs[i], palette='viridis')
    
    # Convert 'Transaction Date' to 'Month-Year' for actual price calculations
    df_filtered['Month-Year'] = df_filtered['Transaction Date'].dt.to_period('M').astype(str)
    
    # Calculate the monthly mean for 'Actual Fuel Price' and 'Actual Fuel Price Inland'
    monthly_avg_actual = df_filtered.groupby('Month-Year')['Actual Fuel Price'].mean()
    monthly_avg_inland = df_filtered.groupby('Month-Year')['Actual Fuel Price Inland'].mean()

    # Plotting
    axs[i].plot(monthly_avg_actual.index, monthly_avg_actual.values, label='Actual Fuel Price', linestyle='--', linewidth=2, color='red')
    axs[i].plot(monthly_avg_inland.index, monthly_avg_inland.values, label='Actual Fuel Price Inland', linestyle='-.', linewidth=2, color='blue')

    axs[i].set_title(f'{prefix[i]} {fuel_type} Vehicles', fontsize=16)
    axs[i].set_xlabel('Month-Year', fontsize=15)
    axs[i].set_ylabel('Price Per Litre (ZAR)', fontsize=15)
    axs[i].legend(fontsize=14)

    # Rotate x-axis labels for better visibility
    axs[i].tick_params(axis='x', labelrotation=45)

plt.tight_layout()
plt.savefig('../final_plots/mv_eda/Monthly_Price_Comparison_Updated.pdf', bbox_inches='tight')
plt.close()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['Month-Year'] = df_filtered['Transaction Date'].dt.to_period('M').astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['Month-Year'] = df_filtered['Transaction Date'].dt.to_period('M').astype(str)


# Aggregated Data Exploration

In [156]:
# Read in the aggregated dataset
data_agg = pd.read_csv(os.path.join("..", "data", "Final aggregated with KMPL and Imputed.csv"))

In [157]:
data_agg.columns

Index(['Reg', 'Total Transaction Amount', 'Avg Transaction Amount',
       'Sum of Litres', 'Avg Litres', 'Model', 'Make and Model', 'District',
       'Site', 'Category', 'Site Lat', 'Site Long', 'Fuel Type', 'Rental',
       'Avg Actual Fuel Price', 'Avg Actual Fuel Price Inland',
       'Avg Estimated Price Per Litre', 'Total Kilometres', 'Total Litres',
       'KMPL'],
      dtype='object')

In [158]:
# Rename column names for plots
data_agg.rename(columns={'Transaction Amount_sum': 'Total Transaction Amount', 
                         'Transaction Amount_mean': 'Avg Transaction Amount',
                         'No. of Litres_sum': 'Sum of Litres',
                         'No. of Litres_mean': 'Avg Litres',
                         'Model_first': 'Model',
                         'Make and Model_first': 'Make and Model',
                         'District_first': 'District',
                         'Site_first': 'Site',
                         'Category_first': 'Category',
                         'Site Lat_first': 'Site Lat',
                         'Site Long_first': 'Site Long',
                         'Fuel Type_first': 'Fuel Type',
                         'Rental_first': 'Rental',
                         'Actual Fuel Price_mean': 'Avg Actual Fuel Price',
                         'Actual Fuel Price Inland_mean': 'Avg Actual Fuel Price Inland',
                         'Estimated Price Per Litre_mean': 'Avg Estimated Price Per Litre',
                         'Total_Km': 'Total Kilometres',
                         'Total_Litres': 'Total Litres',
                         'KMPL': 'KMPL'
                         }, inplace=True)

In [159]:
data_agg.columns

Index(['Reg', 'Total Transaction Amount', 'Avg Transaction Amount',
       'Sum of Litres', 'Avg Litres', 'Model', 'Make and Model', 'District',
       'Site', 'Category', 'Site Lat', 'Site Long', 'Fuel Type', 'Rental',
       'Avg Actual Fuel Price', 'Avg Actual Fuel Price Inland',
       'Avg Estimated Price Per Litre', 'Total Kilometres', 'Total Litres',
       'KMPL'],
      dtype='object')

In [108]:
# Save dataset with renamed columns
data_agg.to_csv(os.path.join("..", "data", "Final aggregated with KMPL and Imputed.csv"), index=False)

In [160]:
data_agg.head()

Unnamed: 0,Reg,Total Transaction Amount,Avg Transaction Amount,Sum of Litres,Avg Litres,Model,Make and Model,District,Site,Category,Site Lat,Site Long,Fuel Type,Rental,Avg Actual Fuel Price,Avg Actual Fuel Price Inland,Avg Estimated Price Per Litre,Total Kilometres,Total Litres,KMPL
0,GGA001EC,50839.76,847.329333,2964.03,49.4005,RANGER,FORD RANGER,OR Tambo,UMTATA,CATEGORY 18: Ambulances Large p/van,-31.59405,28.756873,DIESEL,23731.68,14.710303,15.324903,17.148712,,,
1,GGA002EC,74634.78,761.579388,4219.71,43.058265,RANGER,FORD RANGER,,,,,,DIESEL,,15.228486,15.843486,17.69735,,,
2,GGA005EC,89670.4,896.704,5167.84,51.6784,RANGER,FORD RANGER,OR Tambo,TSOLO,CATEGORY 8: LDV Ambulance,-31.319447,28.754782,DIESEL,9009.78,14.952516,15.566796,17.338017,,,
3,GGA007EC,2400.0,1200.0,135.84,67.92,RANGER,FORD RANGER,,,,,,DIESEL,,15.7298,16.3448,17.576323,,,
4,GGA008EC,44332.0,836.45283,2526.3,47.666038,RANGER,FORD RANGER,Amathole,BUTTERWORTH,CATEGORY 8: LDV Ambulance,-32.33235,28.144626,DIESEL,9009.78,15.7862,16.400974,17.554233,,,


## Univariate Exploration

In [None]:
# Read in the dataset
data_agg = pd.read_csv(os.path.join("..", "data", "Final aggregated with KMPL and Imputed.csv"))

In [99]:
# List of columns to plot
columns_to_plot = ['Total Transaction Amount', 'Avg Transaction Amount',
       'Sum of Litres', 'Avg Litres', 'Rental',
       'Avg Estimated Price Per Litre', 'KMPL']

# Create histograms
for column in columns_to_plot:
    plt.figure(figsize=(8, 8), dpi=300)
    sns.histplot(data=data_agg, x=column, kde=True, bins=30, color='green')
    
    # plt.title(f'Distribution of {column}', fontsize=15)
    plt.xlabel(column, fontsize=15)
    plt.ylabel('Frequency', fontsize=15)
    
    # Save each plot as a PDF
    plt.savefig(f'../final_plots/uni_eda/Agg_{column}_histogram.pdf', bbox_inches='tight')
    plt.close()

In [101]:
# List of categorical variables to plot
categorical_variables = ['Make and Model', 'District',
       'Site', 'Category', 'Fuel Type']

for variable in categorical_variables:
    plt.figure(figsize=(8, 8), dpi=300)
    
    if variable in ['Site', 'Make and Model']:
        # For variables with many categories, plot horizontally for better readability
        sns.countplot(y=variable, data=data_agg, order=data_agg[variable].value_counts().index)
        plt.xlabel(variable, fontsize=15)  # Set x-axis title font size
        plt.ylabel('Count', fontsize=15)
    else:
        # For variables with fewer categories, a vertical plot is sufficient
        sns.countplot(x=variable, data=data_agg, order=data_agg[variable].value_counts().index)
        plt.xlabel(variable, fontsize=15)  # Set x-axis title font size
        plt.ylabel('Count', fontsize=15)
    
    plt.xticks(rotation=45, ha='right')  # Rotate x-axis labels for readability if vertical plot
    plt.tight_layout()
    
    # Save each plot as a PDF
    plt.savefig(f'../final_plots/uni_eda/Agg_{variable}_countplot.pdf', bbox_inches='tight')
    plt.close()

In [102]:
# Handle 'Site' separately to limit to top 15 categories
plt.figure(figsize=(8, 8), dpi=300)
top_15_sites = data_agg['Site'].value_counts().index[:15]
sns.countplot(y='Site', data=data_agg, order=top_15_sites)
# plt.title('Count of Site (Top 15)', fontsize=15)
plt.xlabel('Count', fontsize=15)  # Set x-axis title font size
plt.ylabel('Site', fontsize=15)
plt.tight_layout()

# Save the 'Site' plot as a PDF in the specified folder
plt.savefig('../final_plots/uni_eda/Agg_Site_countplot_top_15.pdf', bbox_inches='tight')
plt.close()

## Bivariate Exploration

In [161]:
plot_bivariate_grid(data_agg, 'Total Transaction Amount', ['District', 'Model', 'Category', 'Fuel Type'], 
                    '../final_plots/mv_eda/Agg_TotalTransAmount_vs_Variables_boxplots.pdf')

plot_bivariate_grid(data_agg, 'Avg Transaction Amount', ['District', 'Model', 'Category', 'Fuel Type'], 
                    '../final_plots/mv_eda/Agg_AvgTransAmount_vs_Variables_boxplots.pdf')

plot_bivariate_grid(data_agg, 'Rental', ['District', 'Model', 'Category', 'Fuel Type'], 
                    '../final_plots/mv_eda/Agg_Rental_vs_Variables_boxplots.pdf')

plot_bivariate_grid(data_agg, 'KMPL', ['District', 'Model', 'Category', 'Fuel Type'], 
                    '../final_plots/mv_eda/Agg_KMPL_vs_Variables_boxplots.pdf')