In [None]:
import os
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')

# Utilizing Big Data Analytics for the Assessment and Prediction of Fuel Consumption and CO2 Emissions in Automobiles

In [None]:
# Directory that is scanned for the raw CSV files; the path is relative, so it is
# resolved against the notebook's current working directory.
data_dir = './Data/'

In [None]:
def read_datasets(data_dir):
    """
    Reads every CSV file in the specified directory and concatenates them into a single DataFrame.

    Files are selected by their final extension (case-insensitive), so names without an
    extension are skipped instead of raising, and names with several dots
    (e.g. 'report.2023.csv') are still picked up. Files are read in sorted name order so
    the row order does not depend on the arbitrary order returned by os.listdir().

    Parameters:
    - data_dir (str): The directory containing CSV files.

    Returns:
    - pd.DataFrame: A DataFrame containing the concatenated data from all CSV files,
      with a fresh 0..N-1 index.

    Raises:
    - ValueError: If the directory contains no CSV files.
    """
    # Keep regular files whose last extension is '.csv'
    csv_names = sorted(
        name for name in os.listdir(data_dir)
        if os.path.splitext(name)[1].lower() == '.csv'
        and os.path.isfile(os.path.join(data_dir, name))
    )

    # Fail with an explicit message rather than pandas' generic "No objects to concatenate"
    if not csv_names:
        raise ValueError('No CSV files found in {!r}'.format(data_dir))

    # Read each file; latin-1 is kept from the original implementation
    frames = [
        pd.read_csv(os.path.join(data_dir, name), encoding='latin-1')
        for name in csv_names
    ]

    # ignore_index avoids repeated row labels (each file would otherwise restart at 0)
    df = pd.concat(frames, ignore_index=True)

    # Print the size of the concatenated DataFrame
    print('dataset size; ( rows:{}, columns:{} )'.format(df.shape[0], df.shape[1]))

    # Return the concatenated DataFrame
    return df

In [None]:
# Load every CSV found under data_dir into one DataFrame, then preview its first rows.
df = read_datasets(data_dir)
df.head()

In [None]:
# store the combined dataset
#df.to_csv('fuel_consumption.csv', index=False)

# Descriptive Statistical Analysis

## Compare various vehicles based on their fuel consumption and CO2 emissions to identify patterns and outliers.

In [None]:
# Summarize only the object-dtype (text) columns rather than the numeric ones.
df.describe(include = object)

In [None]:
# Normalize letter case so one category is not split by inconsistent capitalization:
# title case for the descriptive name columns, upper case for the code columns.
for title_cased_column in ('Make', 'Model', 'Vehicle class'):
    df[title_cased_column] = df[title_cased_column].str.title()

for upper_cased_column in ('Transmission', 'Fuel type'):
    df[upper_cased_column] = df[upper_cased_column].str.upper()

In [None]:
# Source header -> snake_case name used by the rest of the notebook.
# Columns that are not listed here keep their original header.
column_names = {
    'Model year': 'year',
    'Make': 'make',
    'Model': 'model',
    'Vehicle class': 'vehicle_class',
    'Engine size (L)': 'engine_size',
    'Cylinders': 'cylinder_count',
    'Transmission': 'transmission_type',
    'Fuel type': 'fuel_type',
    'City (L/100 km)': 'city_fuel_consumption',
    'Highway (L/100 km)': 'highway_fuel_consumption',
    'Combined (L/100 km)': 'combined_fuel_consumption',
    'Combined (mpg)': 'combined_fuel_efficiency_mpg',
    'CO2 emissions (g/km)': 'co2_emissions',
}

# Apply the mapping to the column labels, modifying df in place.
df.rename(columns=column_names, inplace=True)

In [None]:
# Numeric columns summarized in the descriptive-statistics table.
features_of_interest = [
    'engine_size', 'cylinder_count', 'city_fuel_consumption',
    'highway_fuel_consumption', 'combined_fuel_consumption', 'co2_emissions',
    'CO2 rating', 'Smog rating',
]

# Statistics to keep, in display order, paired with the headings shown in the final table.
desired_order = ['mean', 'std', 'min', 'max', 'var']
heading_for_stat = dict(zip(desired_order, ['Mean', 'Standard Deviation', 'Min', 'Max', 'Variance']))

# describe() does not report variance, so it is appended as an extra row.
descriptive_stats = df[features_of_interest].describe()
descriptive_stats.loc['var'] = df[features_of_interest].var()

# Remove the quartile rows (when present) and arrange the remaining rows in display order.
descriptive_stats = (
    descriptive_stats
    .drop(['25%', '50%', '75%'], errors='ignore')
    .reindex(desired_order)
)

# Three decimal places are enough for presentation.
descriptive_stats_rounded = descriptive_stats.round(3)

# Flip the table so each feature is a row and each statistic a column, with readable headings.
descriptive_stats_transposed = descriptive_stats_rounded.T.rename(columns=heading_for_stat)

print(descriptive_stats_transposed)

## Contrast the fuel consumption and CO2 emissions between light- and heavy-duty vehicles to understand their environmental impact.

In [None]:
# Vehicle-class labels that make_vehicle_class_group() reports as 'light duty'.
# The spelling has to match the title-cased values produced when 'Vehicle class'
# was normalized with .str.title().
light_duty_classes = ['Compact', 'Subcompact', 'Minicompact', 'Two-Seater', 'Sport Utility Vehicle: Small',
                      'Station Wagon: Small', 'Station Wagon: Mid-Size', 'Mid-Size']

# Vehicle-class labels that make_vehicle_class_group() reports as 'heavy duty'.
# NOTE(review): the assignment of classes such as 'Full-Size' and 'Minivan' to this
# group is the notebook author's choice — confirm it matches the intended definition.
heavy_duty_classes = ['Pickup Truck: Standard', 'Pickup Truck: Small', 'Van: Cargo', 'Minivan',
                      'Sport Utility Vehicle: Standard', 'Sport Utility Vehicle',
                      'Van: Passenger', 'Special Purpose Vehicle', 'Full-Size']

In [None]:
def make_vehicle_class_group(vehicle_class):
    """
    Map a vehicle class label to its duty group.

    Membership is looked up in the module-level lists light_duty_classes and
    heavy_duty_classes, with the light-duty list checked first.

    Parameters:
    - vehicle_class (str): The vehicle class to categorize.

    Returns:
    - str or None: 'light duty' or 'heavy duty'; None when the class appears in neither list.
    """
    # Light duty takes precedence, so test it first and leave early
    if vehicle_class in light_duty_classes:
        return 'light duty'

    # Anything not listed in either group yields None
    return 'heavy duty' if vehicle_class in heavy_duty_classes else None

In [None]:
# Derive the duty group of every row from its vehicle class; classes that are in
# neither list end up as missing values in the new 'vehicle_class_group' column.
df['vehicle_class_group'] = df['vehicle_class'].map(make_vehicle_class_group)

In [None]:
def plot_vehicle_class_group(col, x_label, ax):
    """
    Draw a histogram of one feature split by vehicle class group, together with a
    dashed vertical line at each group's mean.

    Reads the module-level DataFrame 'df' and its 'vehicle_class_group' column.

    Parameters:
    - col (str): The column in the DataFrame to plot.
    - x_label (str): The label for the x-axis (also used in the title).
    - ax (matplotlib.axes.Axes): The axes to plot on.

    Returns:
    None
    """
    # One colour per group, shared by the mean line and the histogram bars
    group_colors = {'light duty': 'blue', 'heavy duty': 'orange'}
    mean_line_labels = {'light duty': 'Light Duty Mean', 'heavy duty': 'Heavy Duty Mean'}

    # Dashed vertical line at the mean of each group
    for group_name, group_color in group_colors.items():
        group_mean = df.loc[df['vehicle_class_group'] == group_name, col].mean()
        ax.axvline(group_mean, color=group_color, linestyle='dashed', linewidth=2,
                   label=mean_line_labels[group_name])

    # Histogram of the feature, coloured by group
    sns.histplot(data=df, x=col, hue='vehicle_class_group', ax=ax, palette=group_colors)

    # Titles and axis labels
    ax.set_title(f'{x_label} per Vehicle Group', fontsize=14)
    ax.set_xlabel(x_label, fontsize=12)
    ax.set_ylabel('Count')

    # Rebuild the legend from the labelled artists on the axes
    ax.legend(title='Vehicle Group', fontsize=12, title_fontsize=12)

# Two panels side by side: fuel consumption on the left, CO2 emissions on the right.
fig, axes = plt.subplots(1, 2, figsize=(18, 4))

histogram_panels = [
    ('combined_fuel_consumption', 'Combined Fuel Consumption (L/100km)'),
    ('co2_emissions', 'CO2 Emissions (g/km)'),
]
for panel_ax, (panel_column, panel_label) in zip(axes, histogram_panels):
    plot_vehicle_class_group(panel_column, panel_label, panel_ax)

# Narrow the horizontal gap between the two panels
fig.subplots_adjust(wspace=0.1)

plt.show()

In [None]:
# Number of rows per make
make_counts = df['make'].value_counts()

# Makes with the largest and the smallest number of rows
most_common_make, least_common_make = make_counts.idxmax(), make_counts.idxmin()

# The corresponding row counts
num_entries_most_common = make_counts.max()
num_entries_least_common = make_counts.min()

# Report both extremes
print(f"The make with the most entries in the dataset is: {most_common_make} with {num_entries_most_common} vehicles.")
print(f"The make with the least entries in the dataset is: {least_common_make} with {num_entries_least_common} vehicles.")


In [None]:
# Order the makes alphabetically for the x-axis
sorted_make_counts = make_counts.sort_index()

# One viridis colour per bar, spread over the whole colormap
bar_colors = plt.cm.viridis(np.linspace(0, 1, len(sorted_make_counts)))

# Bar chart of the number of rows per make
plt.figure(figsize=(14, 6))
brand_ax = sorted_make_counts.plot(kind='bar', color=bar_colors)
brand_ax.set_title('Number of Vehicles by Brand')
brand_ax.set_xlabel('Brand')
brand_ax.set_ylabel('Number of Vehicles')

# Slanted, right-aligned tick labels so long brand names stay readable
plt.setp(brand_ax.get_xticklabels(), rotation=45, ha='right')
plt.show()

In [None]:
# 'total_fuel_consumption' is the row-wise sum of the city and highway figures.
# A row with a missing value in either input stays missing in the result.
# NOTE(review): both inputs are per-distance rates (renamed from the L/100 km columns),
# so the sum is a combined score rather than a physical total — confirm this is intended.
df['total_fuel_consumption'] = df['city_fuel_consumption'].add(df['highway_fuel_consumption'])

In [None]:
def compare_col_by_brand(col='total_fuel_consumption', arrange='top', n=5, ax=None):
    """
    Compare a specified column ('col') for top or bottom 'n' brands based on their average value,
    and visualize the comparison using a bar plot.

    Reads the module-level DataFrame 'df'.

    Parameters:
    - col (str): The column in the DataFrame to compare. Defaults to 'total_fuel_consumption'.
    - arrange (str): 'top' for the brands with the highest average, 'bottom' for the lowest.
      Defaults to 'top'.
    - n (int): The number of brands to compare. Defaults to 5.
    - ax (matplotlib.axes.Axes): The axes to plot on. If None, a new figure and axes will be created.

    Returns:
    None

    Raises:
    - ValueError: If 'arrange' is neither 'top' nor 'bottom'.
    """
    # Validate 'arrange' before doing any work. Previously an unknown value left the
    # sort order unassigned and surfaced later as an unrelated UnboundLocalError.
    if arrange == 'top':
        asc = False  # Sort in descending order for top 'n' brands
    elif arrange == 'bottom':
        asc = True   # Sort in ascending order for bottom 'n' brands
    else:
        raise ValueError("Invalid arrange. Use 'top' or 'bottom'.")

    # Group by brand (make) and compute the average value of the specified column for each brand
    total_col_by_brand = df.groupby('make')[col].mean().sort_values(ascending=asc).head(n)

    # If ax parameter is not provided, create a new figure and axes
    if ax is None:
        fig, ax = plt.subplots(figsize=(12, 6))

    # Define a color palette for the bars
    colors = sns.color_palette("Set2")

    # Plot the bar chart
    total_col_by_brand.plot(kind='bar', ax=ax, color=colors)

    # Format column name for better readability in the plot title
    col_str = ' '.join(col.split('_')).title()

    # Set title, labels, and tick labels for the plot
    ax.set_title('{} {} {} by Brand'.format(arrange.title(), n, col_str))
    ax.set_xlabel('Brand')
    ax.set_ylabel(col_str)
    ax.set_xticklabels(total_col_by_brand.index, rotation=0)

In [None]:
# Highest- and lowest-ranked brands, drawn next to each other.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))

# Left panel: top brands; right panel: bottom brands. No column is passed, so the
# function's default is used ('total_fuel_consumption' for the definition above).
for panel_ax, ranking in zip(axes, ('top', 'bottom')):
    compare_col_by_brand(arrange=ranking, ax=panel_ax)

In [None]:
def compare_col_by_brand(col='co2_emissions', arrange='top', n=5, ax=None):
    """
    Compare a specified column ('col') for top or bottom 'n' brands based on their average value,
    and visualize the comparison using a bar plot.

    Reads the module-level DataFrame 'df'. This definition replaces the earlier function
    of the same name in this notebook; the visible difference is the default for 'col'.

    Parameters:
    - col (str): The column in the DataFrame to compare. Defaults to 'co2_emissions'.
    - arrange (str): 'top' for the brands with the highest average, 'bottom' for the lowest.
      Defaults to 'top'.
    - n (int): The number of brands to compare. Defaults to 5.
    - ax (matplotlib.axes.Axes): The axes to plot on. If None, a new figure and axes will be created.

    Returns:
    None

    Raises:
    - ValueError: If 'arrange' is neither 'top' nor 'bottom'.
    """
    # Validate 'arrange' before doing any work. Previously an unknown value left the
    # sort order unassigned and surfaced later as an unrelated UnboundLocalError.
    if arrange == 'top':
        asc = False  # Sort in descending order for top 'n' brands
    elif arrange == 'bottom':
        asc = True   # Sort in ascending order for bottom 'n' brands
    else:
        raise ValueError("Invalid arrange. Use 'top' or 'bottom'.")

    # Group by brand (make) and compute the average value of the specified column for each brand
    total_col_by_brand = df.groupby('make')[col].mean().sort_values(ascending=asc).head(n)

    # If ax parameter is not provided, create a new figure and axes
    if ax is None:
        fig, ax = plt.subplots(figsize=(12, 6))

    # Define a color palette for the bars
    colors = sns.color_palette("Set2")

    # Plot the bar chart
    total_col_by_brand.plot(kind='bar', ax=ax, color=colors)

    # Format column name for better readability in the plot title
    col_str = ' '.join(col.split('_')).title()

    # Set title, labels, and tick labels for the plot
    ax.set_title('{} {} {} by Brand'.format(arrange.title(), n, col_str))
    ax.set_xlabel('Brand')
    ax.set_ylabel(col_str)
    ax.set_xticklabels(total_col_by_brand.index, rotation=0)

In [None]:
# Average CO2 emissions per make, highest first
total_co2_by_brand = (
    df.groupby('make')['co2_emissions']
    .mean()
    .sort_values(ascending=False)
)

In [None]:
# Brands with the highest and the lowest average CO2 emissions, drawn next to each other.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))

# Left panel: top brands; right panel: bottom brands. The column is passed explicitly
# so the chart does not depend on the function's default.
for panel_ax, ranking in zip(axes, ('top', 'bottom')):
    compare_col_by_brand(col='co2_emissions', arrange=ranking, ax=panel_ax)

In [None]:
# Keep only the rows that have a CO2 rating
filtered_df = df[df['CO2 rating'].notna()]

# Average CO2 rating per make, highest first
average_co2_rating_by_brand = (
    filtered_df.groupby('make')['CO2 rating']
    .mean()
    .sort_values(ascending=False)
)

In [None]:
# Brands with the highest and the lowest average CO2 rating, drawn next to each other.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))

# Left panel: top brands; right panel: bottom brands. The column is passed explicitly
# so the chart does not depend on the function's default.
for panel_ax, ranking in zip(axes, ('top', 'bottom')):
    compare_col_by_brand(col='CO2 rating', arrange=ranking, ax=panel_ax)

In [None]:
# Keep only the rows that have a Smog rating
filtered_df = df[df['Smog rating'].notna()]

# Average Smog rating per make, highest first.
# NOTE(review): the result is stored under 'average_co2_rating_by_brand', replacing the
# CO2-rating series of the same name computed earlier — the name looks like a copy-paste
# leftover; confirm nothing later expects CO2 ratings in this variable.
average_co2_rating_by_brand = (
    filtered_df.groupby('make')['Smog rating']
    .mean()
    .sort_values(ascending=False)
)

In [None]:
# Brands with the highest and the lowest average Smog rating, drawn next to each other.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))

# Left panel: top brands; right panel: bottom brands.
for panel_ax, ranking in zip(axes, ('top', 'bottom')):
    compare_col_by_brand(col='Smog rating', arrange=ranking, ax=panel_ax)

In [None]:
def plot_top_bottom_side_by_side_horizontal(df, agg_cols, title, ylabel, figsize=(10, 4)):
    """
    Plot the bottom and top 3 rows for each specified column side by side, one bar chart
    per column, with the bars in ascending order of the plotted value.

    The input DataFrame is not modified.

    Parameters:
    - df (pd.DataFrame): The DataFrame containing the data; must have a 'model' column.
    - agg_cols (list of str): Columns to rank and plot, one subplot each.
    - title (list of str): Title for each plot, in the same order as agg_cols.
    - ylabel (list of str): Labels for the y-axis for each plot, in the same order as agg_cols.
    - figsize (tuple): Figure size.
    """

    # squeeze=False always returns a 2-D array of axes, so a single column works too
    fig, axs = plt.subplots(1, len(agg_cols), figsize=figsize, squeeze=False)

    # Define color palette
    colors = sns.color_palette("husl", n_colors=6)

    # Cast 'model' to string once, on a copy, so the caller's frame is left untouched
    plot_df = df.assign(model=df['model'].astype(str))

    # Iterate over each column to plot
    for i, column in enumerate(agg_cols):
        # Calculate bottom and top 3 and combine
        bottom_3 = plot_df.sort_values(by=column, ascending=True).head(3)
        top_3 = plot_df.sort_values(by=column, ascending=False).head(3)

        # With fewer than six rows the two selections overlap; keep each row once
        combined = pd.concat([bottom_3, top_3]).drop_duplicates()

        # Sort combined data in ascending order
        combined = combined.sort_values(by=column, ascending=True)

        # Vertical bar chart on the subplot for this column
        ax = axs[0][i]
        sns.barplot(x='model', y=column, data=combined, ax=ax, palette=colors, orient='v')
        ax.set_title(f'{title[i]}')  # Set title
        ax.set_xlabel('Model')  # Set x-axis label
        ax.set_ylabel(ylabel[i])  # Set y-axis label

        # Restyle the labels seaborn already placed instead of passing a new list, which
        # must match the number of ticks exactly
        plt.setp(ax.get_xticklabels(), rotation=45, ha='right')

    plt.tight_layout()  # Adjust layout
    plt.show()  # Show the plot

# Average both metrics per model, then plot the three lowest and three highest models for each.
average_by_model = df.groupby('model').agg({'total_fuel_consumption': 'mean', 'co2_emissions': 'mean'}).reset_index()
plot_top_bottom_side_by_side_horizontal(
    average_by_model,
    ['total_fuel_consumption', 'co2_emissions'],  # Specify columns to plot
    title=['Bottom and Top 3 Models for Fuel Consumption', 'Bottom and Top 3 Models for CO2 Emissions'],  # Set titles
    # 'co2_emissions' was renamed from 'CO2 emissions (g/km)', so the unit is g/km (not g/m)
    ylabel=['Fuel Consumption (L/100 km)', 'CO2 Emissions (g/km)'],  # Set y-axis labels
    figsize=(10, 4)  # Set figure size
)

In [None]:
def plot_top_bottom_by_vehicle_class_side_by_side(df, agg_cols, group_col, n=3, figsize=(12, 6)):
    """
    For each column in agg_cols, draw a bar chart of the n groups with the lowest mean
    followed by the n groups with the highest mean, in ascending order, one subplot per column.

    Parameters:
    - df (pd.DataFrame): The DataFrame containing the data.
    - agg_cols (list of str): List of columns to aggregate and plot mean values for.
    - group_col (str): The column to use for grouping (vehicle class).
    - n (int): Number of top or bottom vehicle classes to display.
    - figsize (tuple): Figure size for the entire plot.
    """

    # One subplot per aggregated column
    fig, axs = plt.subplots(1, len(agg_cols), figsize=figsize)

    # First n bars use Set1 colours, last n bars use Set2 colours
    first_palette = sns.color_palette("Set1", n_colors=n)
    second_palette = sns.color_palette("Set2", n_colors=n)
    bar_palette = first_palette[:n] + second_palette[:n]

    for ax, agg_col in zip(axs, agg_cols):
        # Group means in ascending order: the head holds the lowest, the tail the highest
        means_ascending = df.groupby(group_col)[agg_col].mean().sort_values(ascending=True)
        lowest_n = means_ascending.head(n)
        highest_n = means_ascending.tail(n)

        # Lowest groups first, then highest, as a two-column frame for seaborn
        combined = pd.concat([lowest_n, highest_n]).reset_index()

        readable_name = agg_col.replace('_', ' ').title()

        sns.barplot(x=group_col, y=agg_col, data=combined, ax=ax, palette=bar_palette)
        ax.set_title(f'Bottom and Top {n} Vehicle Classes for {readable_name}')
        # Slant the class names so they do not overlap
        ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")
        ax.set_ylabel(readable_name)

    plt.tight_layout()
    plt.show()

# Lowest and highest three vehicle classes for CO2 emissions and for fuel consumption
plot_top_bottom_by_vehicle_class_side_by_side(
    df,
    agg_cols=['co2_emissions', 'total_fuel_consumption'],
    group_col='vehicle_class',
    n=3,
)

In [None]:
def compare_col_by_engine_size(df, cols, n=5):
    """
    Compare and plot the top and bottom n engine sizes based on specified columns
    (e.g., total fuel consumption and CO2 emissions) using grouped bar charts,
    one subplot per column.

    Parameters:
    - df (pandas.DataFrame): The DataFrame to analyze; must have an 'engine_size' column.
    - cols (list of str): List of columns to aggregate and compare.
    - n (int): Number of top or bottom engine sizes to display.
    """

    # One subplot per requested column; squeeze=False keeps axs 2-D even for a single column
    fig, axs = plt.subplots(1, len(cols), figsize=(15, 6), squeeze=False)
    fig.subplots_adjust(hspace=0.4, wspace=0.3)

    for ax, col in zip(axs[0], cols):
        # Mean per engine size, computed once and reused for both rankings
        mean_by_engine_size = df.groupby('engine_size')[col].mean()
        top_engine_sizes = mean_by_engine_size.nlargest(n)
        bottom_engine_sizes = mean_by_engine_size.nsmallest(n)

        # Combine and sort by engine size. With fewer than 2*n engine sizes the two
        # rankings overlap, so keep each engine size only once.
        combined = pd.concat([top_engine_sizes, bottom_engine_sizes])
        combined = combined[~combined.index.duplicated()].sort_index()
        top_bottom_flags = ['Top' if engine in top_engine_sizes.index else 'Bottom' for engine in combined.index]

        # Create grouped bar chart
        sns.barplot(x=combined.index, y=combined.values, hue=top_bottom_flags, ax=ax)
        col_str = ' '.join(col.split('_')).title()
        # The title reports the n actually used rather than a fixed "5"
        ax.set_title('Bottom and Top {} Engine Sizes for {}'.format(n, col_str))
        ax.set_xlabel('Engine Size')
        ax.set_ylabel(col_str)
        ax.legend(title='Category')

    plt.show()

# Five highest and five lowest engine sizes for fuel consumption and for CO2 emissions
compare_col_by_engine_size(
    df,
    cols=['total_fuel_consumption', 'co2_emissions'],
    n=5,
)

In [None]:
def plot_top_bottom_cylinder_counts(df, agg_cols, group_col='cylinder_count', n=3):
    """
    For each column in agg_cols, draw a bar chart of the n groups with the highest mean
    and the n groups with the lowest mean, coloured by 'Top' / 'Bottom', one subplot per column.

    Parameters:
    - df (pd.DataFrame): The DataFrame containing the data.
    - agg_cols (list of str): List of columns to aggregate and plot mean values for.
    - group_col (str): The column to use for grouping (cylinder count).
    - n (int): Number of top or bottom cylinder counts to display.
    """

    # A single row of subplots, one per aggregated column
    ncols = len(agg_cols)
    fig, axs = plt.subplots(1, ncols, figsize=(12, 6))

    # plt.subplots returns a bare Axes when there is only one subplot
    panels = axs if ncols > 1 else [axs]

    group_label = group_col.replace('_', ' ').title()

    for ax, agg_col in zip(panels, agg_cols):
        # Mean per group and the two extremes
        group_means = df.groupby(group_col)[agg_col].mean()
        top_n = group_means.nlargest(n)
        bottom_n = group_means.nsmallest(n)

        # Stack both selections and tag each row by the ranking it belongs to
        combined = pd.concat([top_n, bottom_n]).reset_index()
        combined['Type'] = combined[group_col].isin(top_n.index).map({True: 'Top', False: 'Bottom'})

        agg_label = agg_col.replace('_', ' ').title()
        # Red shades for CO2 emissions, yellow-green-blue for every other column
        palette_name = 'Reds' if agg_col == 'co2_emissions' else 'YlGnBu'

        sns.barplot(x=group_col, y=agg_col, hue='Type', data=combined, ax=ax, palette=palette_name)
        ax.set_title(f'Top and Bottom {n} for {agg_label} per {group_label}')
        ax.set_xlabel(group_label)
        ax.set_ylabel(agg_label)
        ax.tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()

# Highest and lowest cylinder counts for CO2 emissions and for fuel consumption
# (grouping column and n are left at the function's defaults)
plot_top_bottom_cylinder_counts(df, agg_cols=['co2_emissions', 'total_fuel_consumption'])


In [None]:
def plot_top_bottom_by_transmission_type(df, agg_cols, group_col='transmission_type', n=3, figsize=(12, 6)):
    """
    For each column in agg_cols, draw a bar chart of the n groups with the lowest mean
    followed by the n groups with the highest mean, in ascending order, one subplot per column.

    Parameters:
    - df (pd.DataFrame): The DataFrame containing the data.
    - agg_cols (list of str): List of columns to aggregate and plot mean values for.
    - group_col (str): The column to use for grouping (transmission type).
    - n (int): Number of top or bottom transmission types to display.
    - figsize (tuple): Figure size for the entire plot.
    """

    fig, axs = plt.subplots(1, len(agg_cols), figsize=figsize)

    # One viridis colour per bar; the same palette is used on every subplot
    colors = sns.color_palette("viridis", n_colors=2 * n)

    group_label = group_col.replace('_', ' ').title()

    for ax, agg_col in zip(axs, agg_cols):
        # Group means in ascending order: the head holds the lowest, the tail the highest
        means_ascending = df.groupby(group_col)[agg_col].mean().sort_values(ascending=True)
        bottom_n = means_ascending.head(n)
        top_n = means_ascending.tail(n)

        # Lowest groups first, then highest
        combined = pd.concat([bottom_n, top_n]).reset_index()

        agg_label = agg_col.replace('_', ' ').title()

        sns.barplot(x=group_col, y=agg_col, data=combined, ax=ax, palette=colors)
        ax.set_title(f'Bottom and Top {n} Transmission Types for {agg_label}')
        ax.set_xlabel(group_label)
        ax.set_ylabel(agg_label)
        ax.tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()

# Lowest and highest transmission types for CO2 emissions and for fuel consumption
# (grouping column, n and figure size are left at the function's defaults)
plot_top_bottom_by_transmission_type(df, agg_cols=['co2_emissions', 'total_fuel_consumption'])

In [None]:
# Single-letter fuel-type code -> readable name. plot_fuel_type_charts() uses this to
# relabel the values of its grouping column. Keys are upper case, matching the
# upper-casing applied to 'Fuel type' earlier in the notebook.
fuel_type_mapping = {
    'X': 'Regular gasoline',
    'Z': 'Premium gasoline',
    'D': 'Diesel',
    'E': 'Ethanol (E85)',
    'N': 'Natural gas'
}

def plot_fuel_type_charts(df, agg_cols, group_col='fuel_type', figsize=(18, 7), label_fontsize=12):
    """
    Plot bar charts for specified aggregated columns side by side for different fuel types,
    in ascending order and with colorful bars. Increase the font size of fuel type labels.

    Fuel-type codes are replaced by the names in the module-level fuel_type_mapping;
    a code that is not in the mapping is shown as-is instead of being dropped.

    Parameters:
    - df (pd.DataFrame): The DataFrame containing the data.
    - agg_cols (list of str): List of columns to aggregate and plot mean values for.
    - group_col (str): The column to use for grouping (fuel type).
    - figsize (tuple): Figure size for the entire plot.
    - label_fontsize (int): Font size for the fuel type labels.
    """

    ncols = len(agg_cols)
    # squeeze=False always returns a 2-D array of axes, so a single column works too
    fig, axs = plt.subplots(1, ncols, figsize=figsize, squeeze=False)

    for i, agg_col in enumerate(agg_cols):
        # Calculate mean values for each group and sort
        grouped = df.groupby(group_col)[agg_col].mean().sort_values().reset_index()

        # Replace fuel type codes with full names. map() yields NaN for codes missing
        # from the mapping, which would remove those bars; fall back to the raw code.
        grouped[group_col] = grouped[group_col].map(fuel_type_mapping).fillna(grouped[group_col])

        # Plot
        ax = axs[0][i]
        sns.barplot(x=group_col, y=agg_col, data=grouped, ax=ax, palette='viridis')
        ax.set_title(f'{agg_col.replace("_", " ").title()} by Fuel Type')
        ax.set_xlabel('Fuel Type', fontsize=label_fontsize)
        ax.set_ylabel(agg_col.replace('_', ' ').title())
        ax.set_xticklabels(ax.get_xticklabels(), rotation=45, fontsize=label_fontsize)

    plt.tight_layout()
    plt.show()

# Mean CO2 emissions and fuel consumption per fuel type, with larger fuel-type labels
plot_fuel_type_charts(
    df,
    agg_cols=['co2_emissions', 'total_fuel_consumption'],
    label_fontsize=14,
)

## Examine the trends in fuel consumption and emissions across various vehicle types over a specified period.

In [None]:
# Sum of 'total_fuel_consumption' and 'co2_emissions' over all rows of each model year,
# with 'year' returned as an ordinary column.
# NOTE(review): these sums grow with the number of rows recorded for a year, so they
# reflect dataset size as well as per-vehicle values — confirm a sum (not a mean) is intended.
yearly_totals = (
    df.groupby('year')[['total_fuel_consumption', 'co2_emissions']]
    .sum()
    .reset_index()
)

# Display the table
print(yearly_totals)

In [None]:
# Two series with different scales share one x-axis: fuel consumption on the left
# y-axis (blue), CO2 emissions on the right y-axis (red).
year_values = yearly_totals['year']
fig, ax1 = plt.subplots(figsize=(12, 6))

# Left axis: total fuel consumption
ax1.set_xlabel('Year')
ax1.set_ylabel('Total Fuel Consumption', color='tab:blue')
ax1.plot(year_values, yearly_totals['total_fuel_consumption'],
         color='tab:blue', label='Total Fuel Consumption')
ax1.tick_params(axis='y', labelcolor='tab:blue')

# Right axis: CO2 emissions, on a twin of the first axes
ax2 = ax1.twinx()
ax2.set_ylabel('CO2 Emissions', color='tab:red')
ax2.plot(year_values, yearly_totals['co2_emissions'],
         color='tab:red', label='CO2 Emissions')
ax2.tick_params(axis='y', labelcolor='tab:red')

# Title, layout and display
plt.title('Yearly Total Fuel Consumption and CO2 Emissions')
fig.tight_layout()
plt.show()

In [None]:
# Features whose per-year mean is reported; extend this list to add more.
yearly_mean_columns = [
    'engine_size',
    'cylinder_count',
    'city_fuel_consumption',
    'highway_fuel_consumption',
    'combined_fuel_consumption',
    'combined_fuel_efficiency_mpg',
]

# Mean of each listed feature per model year, with 'year' returned as an ordinary column
yearly_averages = df.groupby('year')[yearly_mean_columns].mean().reset_index()

# Display the table
print(yearly_averages)

In [None]:
# Columns of yearly_averages to draw, in legend order
features_of_interest = [
    'engine_size', 'cylinder_count', 'city_fuel_consumption',
    'highway_fuel_consumption', 'combined_fuel_consumption', 'combined_fuel_efficiency_mpg'
]

# Legend text for each column above, in the same order
custom_labels = [
    'Engine Size (L)', 'Cylinders', 'City Fuel Consumption (L/100km)',
    'Highway Fuel Consumption (L/100km)', 'Combined Fuel Consumption (L/100km)', 'Combined Fuel Efficiency (mpg)'
]

# All features share one set of axes
plt.figure(figsize=(12, 6))

# One line per feature, labelled directly so the legend needs no separate label list
for feature, legend_text in zip(features_of_interest, custom_labels):
    plt.plot(yearly_averages['year'], yearly_averages[feature], label=legend_text)

# NOTE(review): the year range in the title is fixed text — confirm it matches the data.
plt.title('Yearly Averages of Vehicle Features (1995-2023)')
plt.xlabel('Year')
plt.ylabel('Average Value')
plt.grid(True)

# A tick for every year present, slanted for readability
plt.xticks(yearly_averages['year'], rotation=45)

plt.legend(loc='upper right')
plt.tight_layout()
plt.show()

In [None]:
def top_brands(tmp, direction='low'):
    """
    Print the brands that rank in the top 3 by mean CO2 emissions or by mean fuel consumption.

    The per-brand means of 'co2_emissions' and 'total_fuel_consumption' are computed once.
    The top 3 brands of each ranking are merged (so between 3 and 6 brands are reported),
    and the merged set is printed ordered by its score.

    Note: the printed 'score' is the brand's mean CO2 emissions only; fuel consumption
    decides which brands are selected but does not contribute to the score.

    Parameters:
    - tmp (pd.DataFrame): Data with 'make', 'co2_emissions' and 'total_fuel_consumption' columns.
    - direction (str, optional): Which end of the ranking to report.
      'low' for the smallest mean CO2 emissions / fuel consumption,
      'high' for the largest mean CO2 emissions / fuel consumption.
      Default is 'low'.

    Returns:
    None

    Raises:
    ValueError: If direction is neither 'low' nor 'high'.
    """

    # Validate the direction first so bad input fails before any computation
    if direction == 'low':
        ascending_order = True
    elif direction == 'high':
        ascending_order = False
    else:
        raise ValueError("Invalid direction. Use 'low' or 'high'.")

    # Columns for CO2 emissions and total fuel consumption
    co2_column = 'co2_emissions'
    fuel_column = 'total_fuel_consumption'

    # Single pass over the data: per-brand mean of both metrics
    brand_means = tmp.groupby('make')[[co2_column, fuel_column]].mean()

    # Top 3 brands under each metric
    top_co2_brands = brand_means[co2_column].sort_values(ascending=ascending_order).head(3).index
    top_fuel_brands = brand_means[fuel_column].sort_values(ascending=ascending_order).head(3).index

    # Index.union gives a reproducible brand order, unlike iterating over a set
    combined_top_brands = top_co2_brands.union(top_fuel_brands)

    # Score each selected brand by its mean CO2 emissions; a stable sort keeps ties in brand order
    result_df = (
        brand_means.loc[combined_top_brands, co2_column]
        .rename('score')
        .rename_axis('make')
        .reset_index()
        .sort_values(by='score', ascending=ascending_order, kind='mergesort')
    )

    # Print the result
    print(f"Top Brands with Combined {'Least' if direction == 'low' else 'Most'} "
          f"CO2 Emissions and {'Lowest' if direction == 'low' else 'Highest'} Fuel Consumption:")

    print(result_df.reset_index(drop=True))


In [None]:
# Report, over the whole dataset, the brands with the lowest mean CO2 emissions / fuel consumption
top_brands(df, direction='low')

In [None]:
# Keep only the rows whose make is 'Smart'
tmp = df[df['make']=='Smart']

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Filter the data to include only years from 2008 onwards
# (tmp is whatever the preceding cell selected; the title below assumes it holds the Smart rows)
filtered_tmp = tmp[tmp['year'] >= 2008]

fig, ax1 = plt.subplots(figsize=(12, 5))

# Plotting CO2 emissions as a bar chart on the left-hand y-axis
# NOTE(review): one bar is drawn per row, so rows sharing a year are drawn on top of each other
color = 'tab:red'
ax1.set_xlabel('Year')
ax1.set_ylabel('CO2 Emissions', color=color)
ax1.bar(filtered_tmp['year'], filtered_tmp['co2_emissions'], color=color, label='CO2 Emissions', width=0.4, align='center')
ax1.tick_params(axis='y', labelcolor=color)

# Creating a second y-axis for total fuel consumption
ax2 = ax1.twinx()  # instantiate a second axes that shares the same x-axis
color = 'tab:blue'
ax2.set_ylabel('Fuel Consumption', color=color)
# Shift the second series right by one bar width (0.4) so it sits beside the CO2 bars
ax2.bar(filtered_tmp['year'] + 0.4, filtered_tmp['total_fuel_consumption'], color=color, label='Fuel Consumption', width=0.4, align='center')
ax2.tick_params(axis='y', labelcolor=color)

# Adding title and one legend per axes (each axes only knows its own series)
plt.title("CO2 Emissions and Fuel Consumption for Smart Brand")
ax1.legend(loc='upper left')
ax2.legend(loc='upper right')

fig.tight_layout()  # Adjust layout to make room for the second y-axis
plt.show()

In [None]:
# Report, over the whole dataset, the brands with the highest mean CO2 emissions / fuel consumption
top_brands(df, direction='high')

In [None]:
# Keep only the rows whose make is 'Bugatti'
tmp = df[df['make']=='Bugatti']

In [None]:
# Keep only model years 2018 and later (year > 2017); earlier years are dropped
# (tmp is whatever the preceding cell selected; the title below assumes it holds the Bugatti rows)
tmp_filtered = tmp[tmp['year'] > 2017]

fig, ax1 = plt.subplots(figsize=(12, 5))

# Plotting CO2 emissions on the left-hand y-axis, shifted half a bar width to the left of the year
color = 'tab:red'
ax1.set_xlabel('Year')
ax1.set_ylabel('CO2 Emissions', color=color)
bar_width = 0.4
ax1.bar(tmp_filtered['year'] - bar_width/2, tmp_filtered['co2_emissions'], color=color, width=bar_width, label='CO2 Emissions')
ax1.tick_params(axis='y', labelcolor=color)

# Creating a second y-axis for total fuel consumption, shifted half a bar width to the right
ax2 = ax1.twinx()
color = 'tab:blue'
ax2.set_ylabel('Fuel Consumption', color=color)
ax2.bar(tmp_filtered['year'] + bar_width/2, tmp_filtered['total_fuel_consumption'], color=color, width=bar_width, label='Fuel Consumption')
ax2.tick_params(axis='y', labelcolor=color)

# Setting x-ticks to the years in tmp_filtered so each bar pair is centred on its tick
ax1.set_xticks(tmp_filtered['year'])

# Adding title and one legend per axes
plt.title("CO2 Emissions and Fuel Consumption for Bugatti Brand")
ax1.legend(loc='upper left')
ax2.legend(loc='upper right')

fig.tight_layout()  # Adjust layout
plt.show()

In [None]:
# Print how many distinct values the 'make' column holds (nunique() does not count missing values)
print('total number of unique car brands = ', df['make'].nunique())

In [None]:
# The 20 makes with the most rows in df, most frequent first ("popular" = number of dataset entries)
top_20_popular_brands = df['make'].value_counts(ascending=False).head(20).index.values.tolist()

In [None]:
# Restrict df to rows whose make is one of the 20 most frequent brands
tmp = df[df['make'].isin(top_20_popular_brands)]

In [None]:
# Lowest-emitting / lowest-consuming brands among the 20 most frequent makes
top_brands(tmp, 'low')

In [None]:
# Highest-emitting / highest-consuming brands among the 20 most frequent makes
top_brands(tmp, 'high')

In [None]:
# Select the Honda rows explicitly. Without this filter the charts below would plot
# whatever `tmp` currently holds (the top-20-brands frame from the cells above) under a
# "for Honda" title.
honda_df = df[df['make'] == 'Honda']

# Set up the matplotlib figure: two side-by-side panels with independent y-axes
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5), sharey=False)

# CO2 Emissions Line Chart
sns.lineplot(x='year', y='co2_emissions', data=honda_df, color='darkblue', ax=ax1, marker='o')
ax1.set_title('CO2 Emissions by Year for Honda', fontsize=16)
ax1.set_xlabel('Year', fontsize=14)
ax1.set_ylabel('CO2 Emissions', fontsize=14)
ax1.tick_params(axis='y', colors='darkblue')

# Total Fuel Consumption Line Chart
sns.lineplot(x='year', y='total_fuel_consumption', data=honda_df, color='lightblue', ax=ax2, marker='o')
ax2.set_title('Total Fuel Consumption by Year for Honda', fontsize=16)
ax2.set_xlabel('Year', fontsize=14)
ax2.set_ylabel('Fuel Consumption', fontsize=14)
ax2.tick_params(axis='y', colors='lightblue')

# Display the plots
plt.tight_layout()
plt.show()


In [None]:
# Filter the DataFrame to include only entries with the car make 'Gmc'
# (the make is spelled 'Gmc' because the Make column was title-cased during cleaning)
tmp = df[df['make']=='Gmc']

In [None]:
# Two side-by-side line charts for the rows currently held in `tmp`:
# CO2 emissions on the left, total fuel consumption on the right.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5), sharey=False)

# (axes, column to plot, line/tick colour, panel title, y-axis label)
panel_specs = [
    (ax1, 'co2_emissions', 'darkblue', 'CO2 Emissions by Year for GMC', 'CO2 Emissions'),
    (ax2, 'total_fuel_consumption', 'lightblue', 'Total Fuel Consumption by Year for GMC', 'Fuel Consumption'),
]

for panel_ax, value_column, shade, panel_title, y_label in panel_specs:
    sns.lineplot(x='year', y=value_column, data=tmp, color=shade, ax=panel_ax, marker='o')
    panel_ax.set_title(panel_title, fontsize=16)
    panel_ax.set_xlabel('Year', fontsize=14)
    panel_ax.set_ylabel(y_label, fontsize=14)
    # Colour the y ticks like the line so each panel is self-explanatory
    panel_ax.tick_params(axis='y', colors=shade)

# Display the plots
plt.tight_layout()
plt.show()


## Historical consumption patterns of different fuel types.

In [None]:
# Collect the distinct fuel_type values (reused by the plot below) and show how many there are
fuel_types = df['fuel_type'].unique()
print(len(fuel_types))

In [None]:
# Full names for the fuel type codes; codes missing here are labelled "Other"
fuel_type_names = {
    'D': 'Diesel',
    'E': 'Ethanol E85',
    'X': 'Regular gasoline',
    'Z': 'Premium gasoline'
}

fig, ax = plt.subplots(figsize=(10,7))

# Rows inside the 2015-2023 window (both ends included); the same mask serves every fuel type
in_window = df['year'].between(2015, 2023)

# One line per fuel type: yearly mean of total fuel consumption
for fuel_type in fuel_types:
    if fuel_type == 'N':
        # Fuel type 'N' is deliberately left out of the chart
        continue
    tmp = df[(df['fuel_type'] == fuel_type) & in_window]
    tmp = tmp.groupby(by=['year'])['total_fuel_consumption'].mean().reset_index()
    # Legend entry shows the code followed by its full name
    ax.plot(tmp['year'], tmp['total_fuel_consumption'], label=f'{fuel_type} ({fuel_type_names.get(fuel_type, "Other")})')

ax.set(
    xlabel='Year',
    ylabel='Average Total Fuel Consumption',
    title='Fuel Consumption per Fuel type',
)

plt.legend()
plt.xticks(rotation=45, ha='right')  # Slanted year labels are easier to read
plt.tight_layout()
plt.show()

# Inferential Statistical Analysis

## Determine the correlation between city and highway fuel consumption in vehicles to infer driving efficiency.

In [None]:
# Create a pairplot of city vs. highway fuel consumption: the diagonal panels show each
# variable's distribution, the off-diagonal panels show their scatter against each other

pairplot = sns.pairplot(df[['city_fuel_consumption', 'highway_fuel_consumption']])
# y=1.05 lifts the figure-level title above the top row of panels
pairplot.fig.suptitle('Pairplot of City and Highway Fuel Consumption', y=1.05)
plt.show()

## T-test


In [None]:
from scipy.stats import ttest_ind

# Fuel type code to compare against all other fuel types in the t-test
# ('Z' is the code this notebook maps to premium gasoline)
specific_type = 'Z'

In [None]:
# Metric compared between the two groups
performance_metric = 'total_fuel_consumption'

# Split df into the rows of the chosen fuel type and every other row
is_specific = df['fuel_type'] == specific_type
specific_df, rest_df = df[is_specific], df[~is_specific]

# Independent two-sample t-test on the metric (scipy defaults)
t_stat, p_value = ttest_ind(
    specific_df[performance_metric],
    rest_df[performance_metric],
)

# Report the test statistic and its p-value
print(f"T-statistic: {t_stat}")
print(f"P-value: {p_value}")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Define a dictionary to map the fuel type codes to their actual names
fuel_type_labels = {'Z': 'Premium Gasoline', 'X': 'Regular Gasoline', 'D': 'Diesel', 'E': 'Ethanol E85', 'N': 'Natural Gas'}

# Replace fuel type codes in the dataframe with the actual names for plotting.
# replace() leaves values that are not dictionary keys untouched, so re-running this cell
# (when the column already holds names) is a no-op, and an unexpected code is kept rather
# than silently turned into NaN as Series.map would do.
# NOTE: from here on df['fuel_type'] holds names, not codes; cells that filter on a code
# (e.g. the t-test on 'Z') must run before this one.
df['fuel_type'] = df['fuel_type'].replace(fuel_type_labels)

# Visualization (Boxplot): distribution of total fuel consumption per fuel type
plt.figure(figsize=(12, 6))
sns.boxplot(x='fuel_type', y='total_fuel_consumption', data=df, palette='Set2')
plt.xlabel('Fuel Type')  # Renaming x-axis label
plt.ylabel('Total Fuel Consumption')  # Renaming y-axis label
plt.title('Total Fuel Consumption Comparison by Fuel Type (Premium Gasoline vs Others)')  # Renaming the title
plt.show()


## Chi-square


In [None]:
# Preview the categorical columns used in the chi-square tests below
df[['make', 'model', 'vehicle_class', 'transmission_type', 'fuel_type']].head()

In [None]:
from scipy.stats import chi2_contingency

#### Fuel Type and CO2 Rating

In [None]:
# Observed counts for every (fuel type, CO2 rating) combination
contingency_table = pd.crosstab(df['fuel_type'], df['CO2 rating'])

# Chi-square test of independence; only the statistic and p-value are kept
chi2, p, _, _ = chi2_contingency(contingency_table)

# Report the statistic (2 decimals) and the p-value (scientific notation)
print("Chi-Square Test Results:")
print("Chi-Square Value: {:.2f}".format(chi2))
print("P-Value: {:.3e}".format(p))

# Compare the p-value with the 5% significance level
alpha = 0.05
if p < alpha:
    interpretation = "There is a significant relationship between Fuel type and CO2 Rating."
else:
    interpretation = "There is no significant relationship between Fuel type and CO2 Rating."
print("\nInterpretation:")
print(interpretation)


#### Vehicle Class and Transmission Type

In [None]:
# Create a contingency table: observed counts for every (vehicle class, transmission type) pair
contingency_table = pd.crosstab(df['vehicle_class'], df['transmission_type'])

# Perform the chi-square test of independence (only the statistic and p-value are used)
chi2, p, _, _ = chi2_contingency(contingency_table)

# Display the chi-square test results
print("Chi-Square Test Results:")
print("Chi-Square Value: {:.2f}".format(chi2))
print("P-Value: {:.3e}".format(p))

# Interpretation at the 5% significance level; both messages name the variables actually tested
alpha = 0.05
interpretation = "There is a significant relationship between Vehicle Class and Transmission Type." if p < alpha else "There is no significant relationship between Vehicle Class and Transmission Type."
print("\nInterpretation:")
print(interpretation)


#### Transmission Type and Fuel Type

In [None]:
# Create a contingency table: observed counts for every (transmission type, fuel type) pair
contingency_table = pd.crosstab(df['transmission_type'], df['fuel_type'])

# Perform the chi-square test of independence (only the statistic and p-value are used)
chi2, p, _, _ = chi2_contingency(contingency_table)

# Display the chi-square test results
print("Chi-Square Test Results:")
print("Chi-Square Value: {:.2f}".format(chi2))
print("P-Value: {:.3e}".format(p))

# Interpretation at the 5% significance level; both messages name the variables actually tested
alpha = 0.05
interpretation = "There is a significant relationship between Transmission Type and Fuel Type." if p < alpha else "There is no significant relationship between Transmission Type and Fuel Type."
print("\nInterpretation:")
print(interpretation)


#### Vehicle Class and Transmission Type

In [None]:
# Create a contingency table: observed counts for every (transmission type, vehicle class) pair
contingency_table = pd.crosstab(df['transmission_type'], df['vehicle_class'])

# Perform the chi-square test of independence (only the statistic and p-value are used)
chi2, p, _, _ = chi2_contingency(contingency_table)

# Display the chi-square test results
print("Chi-Square Test Results:")
print("Chi-Square Value: {:.2f}".format(chi2))
print("P-Value: {:.3e}".format(p))

# Interpretation at the 5% significance level; both messages name the variables actually tested
alpha = 0.05
interpretation = "There is a significant relationship between Transmission Type and Vehicle Class." if p < alpha else "There is no significant relationship between Transmission Type and Vehicle Class."
print("\nInterpretation:")
print(interpretation)


#### Make and Fuel Type

In [None]:
# Create a contingency table: observed counts for every (make, fuel type) pair
contingency_table = pd.crosstab(df['make'], df['fuel_type'])

# Perform the chi-square test of independence (only the statistic and p-value are used)
chi2, p, _, _ = chi2_contingency(contingency_table)

# Display the chi-square test results
print("Chi-Square Test Results:")
print("Chi-Square Value: {:.2f}".format(chi2))
print("P-Value: {:.3e}".format(p))

# Interpretation at the 5% significance level; both messages name the variables actually tested
alpha = 0.05
interpretation = "There is a significant relationship between Car Make and Fuel Type." if p < alpha else "There is no significant relationship between Car Make and Fuel Type."
print("\nInterpretation:")
print(interpretation)


#### Model and Transmission Type

In [None]:
# Create a contingency table: observed counts for every (model, transmission type) pair
contingency_table = pd.crosstab(df['model'], df['transmission_type'])

# Perform the chi-square test of independence (only the statistic and p-value are used)
chi2, p, _, _ = chi2_contingency(contingency_table)

# Display the chi-square test results
print("Chi-Square Test Results:")
print("Chi-Square Value: {:.2f}".format(chi2))
print("P-Value: {:.3e}".format(p))

# Interpretation at the 5% significance level; both messages name the variables actually tested
alpha = 0.05
interpretation = "There is a significant relationship between Car Model and Transmission Type." if p < alpha else "There is no significant relationship between Car Model and Transmission Type."
print("\nInterpretation:")
print(interpretation)


In [None]:
# Keep an independent copy of df as it stands here, before the categorical columns are
# integer-encoded in place further down
data = df.copy()

## Data Preprocessing for Correlation Analysis

In [None]:
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.model_selection import train_test_split

In [None]:
# Peek at the first two rows before the categorical columns are encoded
df.head(2)

In [None]:
# Collect the names of the object-dtype (categorical/text) columns; they are encoded in a later cell
cat_features = df.select_dtypes(object).columns.tolist()

In [None]:
# Create an OrdinalEncoder, which replaces each category of a column with an integer code.
# dtype=int makes the encoded output integer-typed instead of the default float.
encoder = OrdinalEncoder(dtype=int)

In [None]:
# Encode the categorical columns listed in cat_features as integers, in place.
# fit_transform() learns the categories of each column and returns their integer codes,
# which are written back into the same columns of df.
df[cat_features] = encoder.fit_transform(df[cat_features])

# Note:
# - With the default categories='auto', codes follow the sorted order of each column's unique
#   values (0 for the first in sort order), not the order in which values appear in the data.
# - The integers are arbitrary identifiers; their numeric order carries no domain meaning.
# - This overwrites the original text values in df.
# - NOTE(review): assumes these columns contain no missing values; confirm, since an integer
#   output dtype cannot represent NaN.

In [None]:
# Confirm the categorical columns now hold integer codes
df.head(3)

## Correlation Analysis

In [None]:
# Pairwise correlation matrix of the columns of df (pandas default method)
corr = df.corr()

In [None]:
fig, ax = plt.subplots(figsize=(12, 7))

# Display names for the heatmap ticks.
# NOTE(review): these 17 labels are applied positionally, so they must match the number and
# order of the columns in `corr`; verify against corr.columns if the dataset schema changes.
custom_labels = [
    'Model year', 'Make', 'Model', 'Vehicle class', 'Engine size (L)',
    'Cylinders', 'Transmission', 'Fuel type', 'City (L/100 km)',
    'Highway (L/100 km)', 'Combined (L/100 km)', 'Combined (mpg)',
    'CO2 emissions (g/km)', 'CO2 rating', 'Smog rating', 'Vehicle class Group', 'Total fuel consumption'
]

# Create the heatmap with custom labels; annot=True prints each coefficient inside its cell
sns.heatmap(corr, annot=True, xticklabels=custom_labels, yticklabels=custom_labels, ax=ax)

plt.show()

## Data Preprocessing for ML Analysis

In [None]:
# Fraction of missing values per column, as a value between 0 and 1 (multiply by 100 for a percentage).
# df.isna().sum() counts the missing values in each column;
# df.shape[0] is the total number of rows.
(df.isna().sum() / df.shape[0])

# Note:
# - The result is a Series indexed by column name.
# - Columns with a high fraction of missing values are candidates for removal.

In [None]:
# Drop columns with too many missing "NaN" values.
# errors='ignore' makes the cell safe to re-run: once the columns are gone, a second
# execution is a no-op instead of raising KeyError.
df.drop(columns=['CO2 rating', 'Smog rating'], inplace=True, errors='ignore')

In [None]:
# Confirm the sparse rating columns are gone
df.head(2)

# Machine Learning

In [None]:
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [None]:
# Two regression targets predicted jointly; column order matters because later code
# reads CO2 emissions at position 0 and fuel consumption at position 1
targets = ['co2_emissions', 'total_fuel_consumption']
# Predictor columns; the measured fuel consumption columns are not included
features = ['year', 'make', 'model', 'vehicle_class', 'engine_size', 'cylinder_count', 'transmission_type', 'fuel_type', 'vehicle_class_group']

In [None]:
# Split the dataset into training and testing sets.
# - df[features] holds the predictor columns, df[targets] the two target columns.
# - test_size=0.2 holds out 20% of the rows for testing; the remaining 80% are used for training.
# - random_state=42 fixes the shuffle so the same split is produced on every run.
# - X_train / y_train are the training features and targets; X_test / y_test the held-out ones.
X_train, X_test, y_train, y_test = train_test_split(df[features], df[targets], test_size=0.2, random_state=42)

In [None]:
# Create a StandardScaler, which standardizes each feature by removing its mean and scaling
# it to unit variance. The same object is fitted on the training set and reused on the test set.
scaler = StandardScaler()

# Note:
# - After scaling, every feature has mean 0 and standard deviation 1 on the data it was fitted on.
# - Putting features on a common scale matters for distance- and gradient-based models
#   (e.g. k-nearest neighbours, neural networks).

In [None]:
# Standardize the training features: fit_transform() learns each feature's mean and standard
# deviation from the training set and applies the scaling. The scaler returns a bare array,
# so the result is wrapped back into a DataFrame with the original column names AND the
# original row index, keeping X_train label-aligned with y_train.
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)

# Standardize the test features with the statistics learned from the training set
# (transform(), not fit_transform(), so no information from the test set leaks into the scaling).
# The row index is preserved for the same reason as above.
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)

# Note:
# - The same fitted scaler must be used for both sets so they end up on the same scale.
# - Without index=..., the rebuilt frames would get a fresh 0..n-1 index that no longer
#   matches the index of y_train / y_test.

In [None]:
# Seed shared by every stochastic model so a fresh run reproduces the same metrics
# (same value as the random_state used for the train/test split).
SEED = 42

# Create a dictionary containing different regression models
# Each key-value pair in the dictionary represents a model name (key) and its corresponding regression model (value)
models_dict = {
    'Linear Regression': LinearRegression(),                              # Linear Regression model (deterministic)
    'Decision Tree': DecisionTreeRegressor(random_state=SEED),            # Decision Tree Regression model
    'Random Forest': RandomForestRegressor(random_state=SEED),            # Random Forest Regression model
    'KNeighbors': KNeighborsRegressor(),                                  # K-Nearest Neighbors Regression model (deterministic)
    'Multi-layer Perceptron': MLPRegressor(random_state=SEED),            # Multi-layer Perceptron Regression model
    'Xgboost': XGBRegressor(random_state=SEED)                            # XGBoost Regression model
}

# Note:
# - The models_dict dictionary contains different regression models initialized with their respective constructors.
# - Each key represents a model name, and each value represents an instance of the corresponding regression model.
# - All other hyperparameters are left at the library defaults.
# - This dictionary is useful for iterating over multiple models in a loop or for easy access to specific models by name.

In [None]:
# Fit every model in models_dict on the same training data
for model_name, model in models_dict.items():
    # y_train holds both target columns, so each model is fitted as a two-output regressor
    model.fit(X_train, y_train)
    # Progress message, one line per model
    print('Trained a {} Model'.format(model_name))

# Note:
# - fit() trains each estimator in place; models_dict holds the fitted models afterwards.
# - Re-running this cell retrains every model from scratch.

In [None]:
def plot_predictions(models_dict, X_test, y_test):
    """
    Plot predictions versus actual values for CO2 emissions and fuel consumption.

    Draws one figure with two panels (CO2 emissions on the left, fuel consumption on the
    right). Each panel overlays one scatter series per model plus a dashed red diagonal
    marking perfect predictions (predicted == actual).

    Parameters:
    - models_dict (dict): A dictionary containing model names as keys and corresponding trained models as values.
      Each model's predict() must return a 2-column array (column 0: CO2, column 1: fuel).
    - X_test (pd.DataFrame): The feature values of the test dataset.
    - y_test (pd.DataFrame): The true target values of the test dataset
      (column 0: CO2 emissions, column 1: fuel consumption).

    Returns:
    None
    """
    # Run every model once and reuse its predictions for both panels
    predictions = {model_name: model.predict(X_test) for model_name, model in models_dict.items()}

    # (subplot position, target column, panel title, x label, y label)
    panels = [
        (1, 0, 'CO2 Emissions Predictions vs. Actual', 'Actual CO2 Emissions', 'Predicted CO2 Emissions'),
        (2, 1, 'Fuel Consumption Predictions vs. Actual', 'Actual Fuel Consumption', 'Predicted Fuel Consumption'),
    ]

    plt.figure(figsize=(12, 6))
    for position, column, title, x_label, y_label in panels:
        plt.subplot(1, 2, position)
        y_true = y_test.iloc[:, column]

        for model_name, y_pred in predictions.items():
            plt.scatter(y_true, y_pred[:, column], label=model_name, alpha=0.7)

        # Diagonal reference line; Series.min()/max() skip missing values,
        # unlike the builtin min()/max()
        lower, upper = y_true.min(), y_true.max()
        plt.plot([lower, upper], [lower, upper], linestyle='--', color='red', linewidth=2)
        plt.title(title)
        plt.xlabel(x_label)
        plt.ylabel(y_label)
        plt.legend()

    # Adjust layout for better visualization
    plt.tight_layout()
    plt.show()

# Note:
# - This function plots predictions versus actual values for CO2 emissions and fuel consumption for each model in the provided dictionary.
# - It first extracts the true values for CO2 emissions and fuel consumption from the test dataset.
# - Then, it iterates over each model in the dictionary, predicts the target variables using the model, and plots the actual vs. predicted values.
# - The diagonal line in each plot represents perfect predictions (actual = predicted).
# - Each subplot corresponds to a target variable (CO2 emissions or fuel consumption), and each point represents a data point.
# - The layout of the plots is adjusted for better visualization using plt.tight_layout().

In [None]:
# Define a function to evaluate regression models using various metrics for CO2 emissions and fuel consumption
def evaluate_model(models_dict, X_test=X_test, y_test=y_test):
    """
    Score each trained model on the test set and plot its predictions.

    For both targets (column 0: CO2 emissions, column 1: fuel consumption) the R-squared,
    mean absolute error and root mean squared error are computed per model. After scoring,
    plot_predictions() is called to draw predicted vs. actual values.

    Parameters:
    - models_dict (dict): Model names mapped to trained models.
    - X_test (pd.DataFrame, optional): Test features. Defaults to the global 'X_test'
      as it was when this function was defined.
    - y_test (pd.DataFrame, optional): True test targets. Defaults to the global 'y_test'
      as it was when this function was defined.

    Returns:
    pd.DataFrame: One row per model with columns Model, R2_CO2, MAE_CO2, RMSE_CO2,
    R2_Fuel, MAE_Fuel, RMSE_Fuel.
    """

    def _score(actual, predicted):
        # (R-squared, MAE, RMSE) for a single target
        return (
            r2_score(actual, predicted),
            mean_absolute_error(actual, predicted),
            np.sqrt(mean_squared_error(actual, predicted)),
        )

    # Result column names per target, in target-column order
    metric_keys = [
        ('R2_CO2', 'MAE_CO2', 'RMSE_CO2'),
        ('R2_Fuel', 'MAE_Fuel', 'RMSE_Fuel'),
    ]

    results = []
    for model_name, model in models_dict.items():
        y_pred = model.predict(X_test)

        row = {'Model': model_name}
        for position, (r2_key, mae_key, rmse_key) in enumerate(metric_keys):
            r2_value, mae_value, rmse_value = _score(y_test.iloc[:, position], y_pred[:, position])
            row[r2_key] = r2_value
            row[mae_key] = mae_value
            row[rmse_key] = rmse_value
        results.append(row)

    # Visual check of predicted vs. actual values for all models
    plot_predictions(models_dict, X_test, y_test)

    return pd.DataFrame(results)

# Note:
# - This function evaluates regression models using various metrics for CO2 emissions and fuel consumption.
# - It iterates over each model in the models_dict dictionary, predicts target variables using the test data,
#   and calculates evaluation metrics such as R-squared, Mean Absolute Error, and Root Mean Squared Error for each model.
# - The evaluation results are stored in a DataFrame and returned.
# - Additionally, the function plots predictions versus actual values for visualization using the plot_predictions function.

In [None]:
# Score every trained model on the test set; this also draws the predicted-vs-actual figure
res_df = evaluate_model(models_dict, X_test, y_test)
res_df
# Note:
# - res_df has one row per model with R2 / MAE / RMSE for CO2 emissions and for fuel consumption.

In [None]:
# Per-model scatter plots of predicted vs. actual values, one panel per target variable
def plot_individual_predictions(models_dict, X_test, y_test):
    """
    Draw one figure per model comparing its predictions with the true test values.

    Every figure has two side-by-side panels: CO2 emissions on the left (target column 0)
    and fuel consumption on the right (target column 1). The dashed red diagonal marks
    the points where the predicted value equals the actual value.

    Parameters:
    - models_dict (dict): Maps a display name to a trained model exposing `predict`.
    - X_test (pd.DataFrame): The feature values of the test dataset.
    - y_test (pd.DataFrame): The true target values; column 0 is read as CO2 emissions
      and column 1 as fuel consumption.
    """
    # (true values, label used in the title and axis captions) for each panel, left to right
    panels = [
        (y_test.iloc[:, 0], 'CO2 Emissions'),
        (y_test.iloc[:, 1], 'Fuel Consumption'),
    ]

    for model_name, model in models_dict.items():
        # Predictions are indexed positionally, so column order must match y_test
        y_pred = model.predict(X_test)

        # One figure per model, one axes per target
        fig, axes = plt.subplots(1, 2, figsize=(12, 6))

        for col, (ax, (actual, label)) in enumerate(zip(axes, panels)):
            lowest, highest = actual.min(), actual.max()

            ax.scatter(actual, y_pred[:, col], alpha=0.7)
            # Reference line for a perfect prediction
            ax.plot([lowest, highest], [lowest, highest], linestyle='--', color='red', linewidth=2)
            ax.set_title(f'{model_name} - {label} Predictions vs. Actual')
            ax.set_xlabel(f'Actual {label}')
            ax.set_ylabel(f'Predicted {label}')

        # Keep titles and axis labels from overlapping
        fig.tight_layout()
        plt.show()

# Plot individual predictions versus actual values for each model
plot_individual_predictions(models_dict, X_test, y_test)

In [None]:
# Select the row corresponding to the best model based on the sum of R-squared values for CO2 emissions and fuel consumption
best_model = res_df.loc[res_df[['R2_CO2', 'R2_Fuel']].sum(axis=1).idxmax()]
best_model
# Note:
# - 'res_df[['R2_CO2', 'R2_Fuel']].sum(axis=1)' returns a Series containing the sum of R-squared values for each model.
# - 'idxmax()' returns the index LABEL of the first row holding the maximum sum.
# - Because 'idxmax()' yields a label (not a position), the row is looked up with label-based 'res_df.loc[]';
#   positional 'iloc' would only pick the right row while res_df keeps its default 0..n-1 index.
# - The selected row represents the best model based on the sum of R-squared values for CO2 emissions and fuel consumption.


### Which vehicle characteristics have the most significant influence on CO2 emissions and fuel consumption?

In [None]:
# Importance scores exposed by the fitted Random Forest model
feature_importances = models_dict['Random Forest'].feature_importances_

# Display labels for the scores
# NOTE(review): assumed to be listed in the same order as the columns the model was trained on — confirm
features = ['Model year', 'Make', 'Model', 'Vehicle Class', 'Engine size', 'Cylinders',
            'Transmission', 'Fuel Type', 'Vehicle Class Group']

# Pair each label with its score and rank them from most to least important
feature_importance_df = pd.DataFrame(
    {'Feature': features, 'Importance Score': feature_importances}
).sort_values(by='Importance Score', ascending=False)

# Bar chart of the ranked importances
plt.figure(figsize=(12, 5))
plt.bar(feature_importance_df['Feature'], feature_importance_df['Importance Score'], color='skyblue')
plt.xlabel('Vehicle Specification')
plt.ylabel('Importance Score')
plt.title('Random Forest Feature Importance')
plt.xticks(rotation=45, ha='right')  # Slanted labels stay readable
plt.tight_layout()  # Prevent labels from being clipped or overlapping
plt.show()

# Note:
# - This code extracts feature importances from the Random Forest model and visualizes them using a bar plot.
# - The feature importances are stored in a DataFrame for better visualization and sorting.
# - Features are sorted in descending order based on their importance scores to show the most important features first.
# - A bar plot is generated to display the feature importances, with the x-axis representing vehicle specifications
#   and the y-axis representing the importance scores.
# - The layout is adjusted to ensure readability and to prevent overlap of plot elements.

## Employing time series analysis to forecast future CO2 emissions and fuel efficiency for Toyota

In [None]:
# Yearly averages for Toyota: keep only Toyota rows, then take the mean CO2 emissions and
# combined fuel efficiency (MPG) per year, with 'year' restored as a regular column
filtered_df = (
    data[data['make'] == 'Toyota']
    .groupby(by='year')[['co2_emissions', 'combined_fuel_efficiency_mpg']]
    .mean()
    .reset_index()
)

# Note:
# - This code filters the dataset to include only cars manufactured by Toyota.
# - The filtered data is then grouped by year using the 'groupby' method.
# - For each year, the mean CO2 emissions and combined fuel efficiency (MPG) of Toyota cars are calculated.
# - The 'reset_index()' method is used to reset the index of the resulting DataFrame.
# - The resulting DataFrame contains the mean CO2 emissions and combined fuel efficiency (MPG) for Toyota cars
#   in each year, making it suitable for further analysis or visualization.

In [None]:
# Display the yearly mean CO2 emissions for Toyota (the 'year' and 'co2_emissions' columns of filtered_df)
filtered_df[['year', 'co2_emissions']]

# Note:
# - This code selects the 'year' and 'co2_emissions' columns from the filtered DataFrame 'filtered_df'.
# - The resulting DataFrame contains only these two columns, which represent the year and the corresponding CO2 emissions
#   for Toyota cars.
# - This selection might be used for further analysis or visualization specifically focusing on CO2 emissions over the years
#   for Toyota cars.

In [None]:
from prophet import Prophet
from prophet.plot import plot

In [None]:
def prepare_data(df, target_column='co2_emissions'):
    """
    Build the two-column frame used for Prophet modeling from a yearly summary table.

    Args:
    - df: DataFrame containing 'year' and the target column
    - target_column: Name of the column to forecast

    Returns:
    - DataFrame with exactly two columns: 'ds' (the 'year' values, unchanged) and
      'y' (the target column values). The input frame is not modified.
    """
    # Prophet's expected column names: 'ds' for the time axis, 'y' for the value to model
    prophet_names = {'year': 'ds', target_column: 'y'}

    # Keep only the two relevant columns before renaming them
    selected = df[['year', target_column]]
    return selected.rename(columns=prophet_names)

# Note:
# - This function prepares data for Prophet forecasting by selecting the 'year' and the target column from the input DataFrame.
# - It renames the columns as 'ds' (representing time) and 'y' (representing the target variable) to comply with Prophet's
#   data format requirements.
# - The resulting DataFrame 'target_df' is suitable for use in Prophet modeling for time series forecasting.

In [None]:
def train_prophet_model(data):
    """
    Fit a Facebook Prophet model with default settings.

    Args:
    - data: DataFrame with 'ds' and 'y' columns.

    Returns:
    - model: The Prophet model after fitting on `data`.
    """
    # Prophet.fit returns the fitted model itself, so construction and fitting can be chained
    return Prophet().fit(data)

# Note:
# - This function trains a Facebook Prophet model on the provided DataFrame 'data' containing 'ds' (time) and 'y' (target) columns.
# - It initializes a Prophet model object and fits it to the provided data.
# - The trained Prophet model is returned for further use in forecasting.

In [None]:
def generate_forecast(df, model, end_year=2050):
    """
    Generates a forecast for the target variable using the fitted Prophet model.

    Args:
    - df: DataFrame with a 'year' column; only its maximum value is used, as the
      first year of the forecast range
    - model: Fitted Prophet model (any object exposing `predict(DataFrame)`)
    - end_year: Last year (inclusive) for which a forecast is generated

    Returns:
    - forecast: The result of `model.predict` for the years last_year..end_year

    Raises:
    - ValueError: If `df` has no 'year' values, or `end_year` is earlier than the
      last year present in `df`
    """
    # Find the last year in the dataset
    last_year = df['year'].max()
    if pd.isna(last_year):
        raise ValueError("df contains no 'year' values to forecast from")

    # range() needs true integers; a float-typed year column (e.g. 2021.0) would otherwise fail
    last_year = int(last_year)
    end_year = int(end_year)

    if end_year < last_year:
        raise ValueError(
            'end_year ({}) is earlier than the last year in the data ({})'.format(end_year, last_year)
        )

    # Years from the last observed year to end_year (both inclusive), kept as integer
    # years so that 'ds' has the same form as the 'ds' column built by prepare_data
    future_df = pd.DataFrame({'ds': list(range(last_year, end_year + 1))})

    # Generate the forecast using the fitted Prophet model
    return model.predict(future_df)

# Note:
# - This function generates a forecast for the target variable using the fitted Prophet model.
# - It first finds the last year in the provided DataFrame 'df'.
# - Then, it creates a DataFrame 'future_df' containing future dates up to the specified 'end_year'.
# - Finally, it generates the forecast using the fitted Prophet model for the future dates in 'future_df'.
# - The forecasted values are returned as a DataFrame.

In [None]:
def plot_forecast(model, forecast, target_column):
    """
    Draw the Prophet forecast chart for one target variable and display it.

    Args:
    - model: Fitted Prophet model
    - forecast: DataFrame containing forecasted values
    - target_column: Name of the target column; shown as-is on the y-axis and in
      title-cased, space-separated form in the chart title
    """
    # Prophet's own plotting helper creates the figure that the pyplot calls below decorate
    plot(model, forecast, figsize=(9, 5), include_legend=True)

    # 'co2_emissions' -> 'Co2 Emissions' for the title
    readable_name = target_column.replace('_', ' ').title()

    plt.title(f'Forecast - {readable_name}')
    plt.xlabel('Year')
    plt.ylabel(target_column)

    plt.show()

# Note:
# - This function plots the Prophet forecast for the target variable using the provided model and forecast DataFrame.
# - It uses the 'plot' function from Prophet to generate the forecast plot.
# - The target column name is formatted for better readability in the plot title.
# - The plot title, x-axis label, and y-axis label are set accordingly.
# - The generated plot is displayed.

In [None]:
def get_forecast_results(target_variable, forecast):
    """
    Reduce a forecast DataFrame to a two-column year/value table.

    Args:
    - target_variable: Name given to the column that holds the forecasted values
    - forecast: DataFrame with a datetime 'ds' column and a 'yhat' column

    Returns:
    - res: DataFrame with a 'year' column (calendar year of each 'ds' entry) and a
      `target_variable` column (the 'yhat' values), indexed like `forecast`
    """
    yearly_values = {
        # Calendar year taken from each forecast timestamp
        'year': forecast['ds'].dt.year,
        # Predicted values, stored under the caller-supplied name
        target_variable: forecast['yhat'],
    }
    return pd.DataFrame(yearly_values)

# Note:
# - This function retrieves the forecasted values from the Prophet forecast DataFrame.
# - The 'year' column holds the calendar year extracted from the forecast's 'ds' column.
# - The forecasted values ('yhat') are stored under the column name given by 'target_variable'.
# - All other forecast columns are left out of the result.

### Forecast CO2 Emissions

In [None]:
# Column of filtered_df that the following cells forecast
target_column = 'co2_emissions'

In [None]:
# Reduce the yearly Toyota averages to the 'ds' / 'y' columns used for Prophet modeling
target_data = prepare_data(filtered_df, target_column)

# Fit a Facebook Prophet model on the historical yearly series
model = train_prophet_model(target_data)

# Forecast from the last year in filtered_df through generate_forecast's default end_year (2050)
forecast = generate_forecast(filtered_df, model)

# Plot the model forecast
plot_forecast(model, forecast, target_column)

In [None]:
# Keep only the year and the predicted value of the CO2 forecast, save them to CSV, and preview the first rows.
# NOTE: `forecast` is a shared variable that the fuel-efficiency section reassigns later,
# so this cell must run right after the CO2 forecast cell above.
co2_forecast_results = get_forecast_results('co2_emissions', forecast)
co2_forecast_results.to_csv('co2_forecast_results.csv', index=False)
co2_forecast_results.head()

In [None]:
# Bar chart of the forecasted CO2 emissions, one bar per year
fig, ax = plt.subplots(figsize=(12, 5))
sns.barplot(data=co2_forecast_results, x='year', y='co2_emissions', ax=ax)
ax.set_title('Forecast of CO2 Emissions')

# Slant the year labels so they remain readable
plt.xticks(rotation=45, ha='right')

# Render the figure
plt.show()

### Forecast Fuel Efficiency

In [None]:
# Column of filtered_df that the following cells forecast (replaces the CO2 target set earlier)
target_column = 'combined_fuel_efficiency_mpg'

In [None]:
# Reduce the yearly Toyota averages to the 'ds' / 'y' columns used for Prophet modeling
# NOTE: target_data, model and forecast are reassigned here, replacing the CO2 versions
target_data = prepare_data(filtered_df, target_column)

# Fit a Facebook Prophet model on the historical yearly series
model = train_prophet_model(target_data)

# Forecast from the last year in filtered_df through generate_forecast's default end_year (2050)
forecast = generate_forecast(filtered_df, model)

# Plot the model forecast
plot_forecast(model, forecast, target_column)

In [None]:
# Keep only the year and the predicted value of the fuel-efficiency forecast, save them to CSV,
# and preview the first rows. Relies on `forecast` holding the fuel-efficiency run from the previous cell.
fuel_eff_forecast_results = get_forecast_results('combined_fuel_efficiency_mpg', forecast)
fuel_eff_forecast_results.to_csv('fuel_efficiency_forecast_results.csv', index=False)
fuel_eff_forecast_results.head()

In [None]:
# Bar chart of the forecasted combined fuel efficiency, one bar per year
fig, ax = plt.subplots(figsize=(12, 5))
sns.barplot(data=fuel_eff_forecast_results, x='year', y='combined_fuel_efficiency_mpg', ax=ax)
ax.set_title('Forecast of Fuel Efficiency')

# Slant the year labels so they remain readable
plt.xticks(rotation=45, ha='right')

# Render the figure
plt.show()