In [None]:
import pandas as pd

# Lift pandas' display limits so printed frames are not truncated
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Read the CSV file into a DataFrame
df = pd.read_csv('data/cars.csv')

# Display the first 5 rows
print(df.head().to_markdown(index=False, numalign="left", stralign="left"))

# Print the column names and their data types.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
df.info()

In [None]:
import altair as alt
import numpy as np

# The price columns are handled as currency strings in this notebook (they are
# compared against '$0' and stripped of '$' / ','), so they have to be cleaned
# BEFORE describe() runs: on a frame with mixed dtypes describe() only
# summarises numeric columns and would silently leave MSRP and Invoice out.

# Filter out rows where MSRP or Invoice is '$0'
df_filtered = df[(df['MSRP'] != '$0') & (df['Invoice'] != '$0')].copy()

# Remove '$', ',' and spaces from the price columns and convert to numeric
for col in ['MSRP', 'Invoice']:
    df_filtered[col] = pd.to_numeric(
        df_filtered[col].astype(str).str.replace(r'[$, ]', '', regex=True)
    )

# Calculate descriptive statistics for the specified columns.
# NOTE: computed on the cleaned frame, so '$0' rows are excluded and the
# statistics describe the same rows that the charts below are built from.
descriptive_stats = df_filtered[['MSRP', 'Invoice', 'Horsepower', 'MPG_City', 'MPG_Highway', 'Weight', 'Wheelbase', 'Length']].describe()

# Print the descriptive statistics
print("Descriptive Statistics:")
print(descriptive_stats.to_markdown(numalign="left", stralign="left"))

# Build and save one interactive, binned count histogram per numeric column
histogram_columns = [
    'MSRP', 'Invoice', 'Horsepower', 'MPG_City',
    'MPG_Highway', 'Weight', 'Wheelbase', 'Length',
]
for column in histogram_columns:
    # The same binned field drives both the x axis and the tooltip
    binned_axis = alt.X(column, bin=True)
    binned_tooltip = alt.Tooltip(column, bin=True)

    histogram = alt.Chart(df_filtered).mark_bar()
    histogram = histogram.encode(
        x=binned_axis,
        y='count()',
        tooltip=[binned_tooltip, 'count()'],
    )
    histogram = histogram.properties(title=f'Histogram of {column}').interactive()

    # Written next to the notebook as a JSON chart specification
    histogram.save(f'{column}_histogram.json')

def _boxplot_by_group(data, value_col, group_col):
    """Build an interactive min-max boxplot of one column per category.

    Args:
        data: DataFrame holding both ``value_col`` and ``group_col``.
        value_col: Name of the column plotted on the y axis.
        group_col: Name of the categorical column used for the x axis and
            colour (encoded as nominal, ``:N``).

    Returns:
        The Altair chart titled ``'Boxplot of <value_col> by <group_col>'``.
    """
    group_field = f'{group_col}:N'
    return alt.Chart(data).mark_boxplot(extent='min-max').encode(
        x=group_field,
        y=value_col,
        color=group_field,
        tooltip=[group_field, value_col]
    ).properties(
        title=f'Boxplot of {value_col} by {group_col}'
    ).interactive()


# Create boxplots for the specified columns against 'Make', 'Type', 'Origin'
# and 'DriveTrain'. The outer loop runs over the grouping column so charts are
# saved in the same order as before (all 'Make' charts first, then 'Type', ...).
for group_col in ['Make', 'Type', 'Origin', 'DriveTrain']:
    for col in ['MSRP', 'Invoice', 'Horsepower', 'MPG_City', 'MPG_Highway', 'Weight', 'Wheelbase', 'Length']:
        chart = _boxplot_by_group(df_filtered, col, group_col)

        # File names use the lower-cased grouping name, e.g. 'MSRP_by_drivetrain_boxplot.json'
        chart.save(f'{col}_by_{group_col.lower()}_boxplot.json')

# Count rows per make and keep the ten most frequent ones
make_counts = df_filtered['Make'].value_counts()
top_10_makes = make_counts.head(10)

# Show the ranking
print("\nTop 10 Car Makes by Number of Models:")
print(top_10_makes.to_markdown(numalign="left", stralign="left"))

# Mean MSRP per make, restricted to the ten makes selected above
in_top_10 = df_filtered['Make'].isin(top_10_makes.index)
top_make_rows = df_filtered[in_top_10]
avg_msrp_top_makes = top_make_rows.groupby('Make')['MSRP'].mean()

# Show the averages
print("\nAverage MSRP for Top 10 Car Makes:")
print(avg_msrp_top_makes.to_markdown(numalign="left", stralign="left"))

# Create a pivot table to calculate the average MPG_City and MPG_Highway by Type.
# The aggregation is given as the string 'mean' rather than np.mean: recent
# pandas versions emit a FutureWarning when a NumPy callable is passed here,
# while the string alias yields the same averages without the warning.
pivot_table_type = df_filtered.pivot_table(index='Type', values=['MPG_City', 'MPG_Highway'], aggfunc='mean')

# Print the pivot table
print("\nAverage MPG by Type:")
print(pivot_table_type.to_markdown(numalign="left", stralign="left"))

# Create a pivot table to calculate the average MPG_City and MPG_Highway by Origin
pivot_table_origin = df_filtered.pivot_table(index='Origin', values=['MPG_City', 'MPG_Highway'], aggfunc='mean')

# Print the pivot table
print("\nAverage MPG by Origin:")
print(pivot_table_origin.to_markdown(numalign="left", stralign="left"))