In [35]:
# Modules imported
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency, t, f, f_oneway, pearsonr
from statsmodels.stats.multicomp import pairwise_tukeyhsd
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

In [17]:
# Load the dataset analysed by every cell below. The path is relative, so the
# CSV is resolved against the kernel's current working directory.
data = pd.read_csv('FinalCleanedDataset.csv')

In [None]:
# Chi-Square test of independence for a single pair of columns
def chi_square_test(df, var1, var2):
    """Test two columns of ``df`` for independence with a Chi-Square test.

    A contingency table of ``df[var1]`` against ``df[var2]`` is built with
    ``pd.crosstab`` and handed to ``scipy.stats.chi2_contingency`` using its
    default settings (scipy applies Yates' continuity correction by default
    when the table has a single degree of freedom).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both columns.
    var1, var2 : str
        Names of the two columns to cross-tabulate.

    Returns
    -------
    dict
        The two column names, the statistic and p-value (each rounded to
        4 decimals), the degrees of freedom, and a flag that is true when the
        unrounded p-value is below 0.05.

    Raises
    ------
    ValueError
        Propagated from ``chi2_contingency`` when the table cannot be tested.
    """
    observed = pd.crosstab(df[var1], df[var2])
    # Only the first three fields are needed; the expected-frequency table
    # returned in fourth position is ignored.
    statistic, p_value, degrees_of_freedom = chi2_contingency(observed)[:3]

    summary = {}
    summary["Variable1"] = var1
    summary["Variable2"] = var2
    summary["Chi-Square"] = round(statistic, 4)
    summary["p-value"] = round(p_value, 4)
    summary["Degrees of Freedom"] = degrees_of_freedom
    # Significance is decided on the raw p-value, not the rounded one.
    summary["Significant (α=0.05)"] = p_value < 0.05
    return summary

In [None]:
categorical_vars_all = data.select_dtypes(include='object').columns.tolist()

# Perform Chi-Square tests for all pairs of categorical variables.
# Pairs whose contingency table cannot be tested are recorded (with the reason)
# instead of being dropped silently, so the reader can see what is missing
# from the results table.
chi_square_results_all = []
chi_square_skipped_pairs = []
for i, var1 in enumerate(categorical_vars_all):
    for var2 in categorical_vars_all[i+1:]:  # Avoid duplicate pairs and self-pairs
        try:
            chi_square_results_all.append(chi_square_test(data, var1, var2))
        except ValueError as err:  # Raised when the contingency table is invalid
            chi_square_skipped_pairs.append(
                {"Variable1": var1, "Variable2": var2, "Reason": str(err)}
            )

if chi_square_skipped_pairs:
    print(f"Skipped {len(chi_square_skipped_pairs)} pair(s) with an invalid contingency table; "
          "see chi_square_skipped_pairs for details.")

chi_square_results_all_df = pd.DataFrame(chi_square_results_all)
chi_square_results_all_df

Unnamed: 0,Variable1,Variable2,Chi-Square,p-value,Degrees of Freedom,Significant (α=0.05)
0,Brand,Model,732967.4343,0.0,17136,True
1,Brand,UsedOrNew,5010.3961,0.0,96,True
2,Brand,Transmission,728.0241,0.0,48,True
3,Brand,DriveType,11362.1997,0.0,192,True
4,Brand,FuelType,14205.3475,0.0,336,True
5,Brand,BodyType,11050.1768,0.0,432,True
6,Brand,ColourExt,4982.0285,0.0,816,True
7,Brand,City,63850.7809,0.0,24480,True
8,Brand,State,4116.1968,0.0,384,True
9,Brand,YearRanges,3227.0351,0.0,144,True


In [19]:
# Function to calculate confidence intervals for the difference in means
def confidence_interval_difference_means(group1, group2, alpha=0.05):
    """Two-sided confidence interval for ``mean(group1) - mean(group2)``.

    The standard error is the unpooled (Welch-style) one,
    ``sqrt(s1^2/n1 + s2^2/n2)``, and the t critical value uses the
    conservative degrees of freedom ``min(n1 - 1, n2 - 1)``.

    Missing values (NaN) are removed from each group before anything is
    computed, so the sample sizes, variances and degrees of freedom all refer
    to the same observations. Without this, ``len()`` would count NaN entries
    that pandas' mean/variance silently skip.

    Parameters
    ----------
    group1, group2 : array-like of numbers
        The two samples (pandas Series, NumPy array or list).
    alpha : float, default 0.05
        Significance level; the interval has confidence ``1 - alpha``.

    Returns
    -------
    dict
        Keys ``"Mean Difference"``, ``"Lower Bound"``, ``"Upper Bound"``,
        ``"Confidence Level"`` and ``"Degrees of Freedom"``.
    """
    sample1 = np.asarray(group1, dtype=float)
    sample2 = np.asarray(group2, dtype=float)
    sample1 = sample1[~np.isnan(sample1)]
    sample2 = sample2[~np.isnan(sample2)]

    n1, n2 = sample1.size, sample2.size
    mean_difference = sample1.mean() - sample2.mean()
    var1, var2 = sample1.var(ddof=1), sample2.var(ddof=1)

    # Unpooled standard error of the difference (no equal-variance assumption).
    standard_error = np.sqrt((var1 / n1) + (var2 / n2))
    df = min(n1 - 1, n2 - 1)  # Conservative degrees-of-freedom approximation
    t_critical = t.ppf(1 - alpha / 2, df)
    margin_of_error = t_critical * standard_error

    return {
        "Mean Difference": mean_difference,
        "Lower Bound": mean_difference - margin_of_error,
        "Upper Bound": mean_difference + margin_of_error,
        "Confidence Level": 1 - alpha,
        "Degrees of Freedom": df,
    }

In [21]:
# Price by UsedOrNew (New vs Used)
# Missing prices are dropped here as well, matching the other two comparisons.
group1_usedOrNew = data.loc[data['UsedOrNew'] == 'NEW', 'Price'].dropna()
group2_usedOrNew = data.loc[data['UsedOrNew'] == 'USED', 'Price'].dropna()
ci_usedOrNew = confidence_interval_difference_means(group1_usedOrNew, group2_usedOrNew)

# Price by Transmission (Automatic vs Manual)
group1_trans = data.loc[data['Transmission'] == 'Automatic', 'Price'].dropna()
group2_trans = data.loc[data['Transmission'] == 'Manual', 'Price'].dropna()
ci_transmission = confidence_interval_difference_means(group1_trans, group2_trans)

# Price by FuelType (Unleaded vs Diesel)
group1_fuel = data.loc[data['FuelType'] == 'Unleaded', 'Price'].dropna()
group2_fuel = data.loc[data['FuelType'] == 'Diesel', 'Price'].dropna()
ci_fueltype = confidence_interval_difference_means(group1_fuel, group2_fuel)

# Summarize the results for all three comparisons
ci_results_summary = pd.DataFrame([
    {**ci_usedOrNew, "Comparison": "New vs Used (UsedOrNew)"},
    {**ci_transmission, "Comparison": "Automatic vs Manual (Transmission)"},
    {**ci_fueltype, "Comparison": "Unleaded vs Diesel (FuelType)"}
])

ci_results_summary

Unnamed: 0,Mean Difference,Lower Bound,Upper Bound,Confidence Level,Degrees of Freedom,Comparison
0,31547.009052,28288.474502,34805.543603,0.95,1105,New vs Used (UsedOrNew)
1,8639.984768,7363.289442,9916.680095,0.95,1827,Automatic vs Manual (Transmission)
2,-15742.563997,-16644.403625,-14840.724368,0.95,4819,Unleaded vs Diesel (FuelType)


In [23]:
# Function to compute confidence interval for ratio of variances
def confidence_interval_ratio_variances(group1, group2, alpha=0.05):
    """Two-sided F-based confidence interval for ``var(group1) / var(group2)``.

    With sample variances ``s1^2`` and ``s2^2`` (``ddof=1``) the interval is

        [ (s1^2/s2^2) / F(1-alpha/2; n1-1, n2-1),
          (s1^2/s2^2) * F(1-alpha/2; n2-1, n1-1) ]

    Missing values (NaN) are removed from each group first so that the sample
    sizes used for the F degrees of freedom match the observations that
    actually enter the variances.

    Parameters
    ----------
    group1, group2 : array-like of numbers
        The two samples (pandas Series, NumPy array or list).
    alpha : float, default 0.05
        Significance level; the interval has confidence ``1 - alpha``.

    Returns
    -------
    dict
        Keys ``"Variance Ratio (F)"``, ``"Lower Bound"``, ``"Upper Bound"``,
        ``"Confidence Level"``, ``"Degrees of Freedom Group1"`` and
        ``"Degrees of Freedom Group2"``.
    """
    sample1 = np.asarray(group1, dtype=float)
    sample2 = np.asarray(group2, dtype=float)
    sample1 = sample1[~np.isnan(sample1)]
    sample2 = sample2[~np.isnan(sample2)]

    df1, df2 = sample1.size - 1, sample2.size - 1
    f_ratio = sample1.var(ddof=1) / sample2.var(ddof=1)

    # Note the swapped degrees of freedom in the upper bound: multiplying by
    # F(1-alpha/2; df2, df1) is the same as dividing by F(alpha/2; df1, df2).
    lower_bound = f_ratio / f.ppf(1 - alpha / 2, df1, df2)
    upper_bound = f_ratio * f.ppf(1 - alpha / 2, df2, df1)

    return {
        "Variance Ratio (F)": f_ratio,
        "Lower Bound": lower_bound,
        "Upper Bound": upper_bound,
        "Confidence Level": 1 - alpha,
        "Degrees of Freedom Group1": df1,
        "Degrees of Freedom Group2": df2,
    }

In [24]:
def _price_for_level(column, level):
    """Return the non-missing 'Price' values of rows where ``column`` equals ``level``."""
    return data[data[column] == level]['Price'].dropna()


# Price variance ratio: New vs Used
group1_new = _price_for_level('UsedOrNew', 'NEW')
group2_used = _price_for_level('UsedOrNew', 'USED')
ci_variance_new_used = confidence_interval_ratio_variances(group1_new, group2_used)

# Price variance ratio: Automatic vs Manual transmission
group1_auto = _price_for_level('Transmission', 'Automatic')
group2_manual = _price_for_level('Transmission', 'Manual')
ci_variance_auto_manual = confidence_interval_ratio_variances(group1_auto, group2_manual)

# Price variance ratio: Unleaded vs Diesel fuel
group1_unleaded = _price_for_level('FuelType', 'Unleaded')
group2_diesel = _price_for_level('FuelType', 'Diesel')
ci_variance_unleaded_diesel = confidence_interval_ratio_variances(group1_unleaded, group2_diesel)

# One row per comparison, labelled in the trailing "Comparison" column
variance_ci_summary = pd.DataFrame([
    dict(ci_variance_new_used, Comparison="New vs Used (Price)"),
    dict(ci_variance_auto_manual, Comparison="Automatic vs Manual (Transmission)"),
    dict(ci_variance_unleaded_diesel, Comparison="Unleaded vs Diesel (FuelType)"),
])
variance_ci_summary

Unnamed: 0,Variance Ratio (F),Lower Bound,Upper Bound,Confidence Level,Degrees of Freedom Group1,Degrees of Freedom Group2,Comparison
0,5.101604,4.685971,5.57219,0.95,1105,14185,New vs Used (Price)
1,1.370025,1.277521,1.466507,0.95,13950,1827,Automatic vs Manual (Transmission)
2,0.284365,0.269981,0.299438,0.95,7074,4819,Unleaded vs Diesel (FuelType)


In [30]:
# Perform one-way ANOVA for 'Price' grouped by 'Transmission'
# Use one complete-case frame for the F-test, Tukey's HSD and the degrees of
# freedom, so all three describe the same observations. Rows missing either
# the price or the transmission are excluded; a missing transmission would
# otherwise show up as an extra (empty) group.
anova_data = data.dropna(subset=['Price', 'Transmission'])
transmission_groups = [prices for _, prices in anova_data.groupby('Transmission')['Price']]
anova_result = f_oneway(*transmission_groups)

# Perform Tukey's HSD for pairwise comparisons
tukey_result = pairwise_tukeyhsd(
    anova_data['Price'],
    anova_data['Transmission'],
    alpha=0.05
)

n_transmission_groups = len(transmission_groups)
anova_summary = {
    "ANOVA F-Statistic": anova_result.statistic,
    "ANOVA p-value": anova_result.pvalue,
    "Degrees of Freedom (Between)": n_transmission_groups - 1,
    "Degrees of Freedom (Within)": len(anova_data) - n_transmission_groups
}

# Display ANOVA summary
anova_summary_df = pd.DataFrame([anova_summary])
anova_summary_df

Unnamed: 0,ANOVA F-Statistic,ANOVA p-value,Degrees of Freedom (Between),Degrees of Freedom (Within)
0,138.428033,7.979483e-32,1,15777


In [31]:
# Tabulate the Tukey HSD pairwise comparisons.
# The summary's `.data` is a list of rows whose first entry is the header.
tukey_table_rows = tukey_result.summary().data
tukey_summary_df = pd.DataFrame(data=tukey_table_rows[1:], columns=tukey_table_rows[0])
tukey_summary_df


Unnamed: 0,group1,group2,meandiff,p-adj,lower,upper,reject
0,Automatic,Manual,-8639.9848,0.0,-10079.3876,-7200.5819,True


In [34]:
# Two-way ANOVA of 'Price' on 'Transmission' and 'FuelType', including their interaction
formula = 'Price ~ C(Transmission) + C(FuelType) + C(Transmission):C(FuelType)'

# Fit on complete cases for the three columns the formula references.
two_way_anova_data = data.dropna(subset=['Price', 'Transmission', 'FuelType'])
model = ols(formula, data=two_way_anova_data).fit()

# typ=1 is anova_lm's default (sequential sums of squares), spelled out here
# because the table then depends on the order of terms in the formula.
anova_results = anova_lm(model, typ=1)
anova_results

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
C(Transmission),1.0,120650200000.0,120650200000.0,155.411103,1.673834e-35
C(FuelType),7.0,1508866000000.0,215552200000.0,277.655657,0.0
C(Transmission):C(FuelType),7.0,2507695000.0,358242100.0,0.461456,0.8629042
Residual,15767.0,12240380000000.0,776329300.0,,


In [36]:
# Perform correlation analysis with hypothesis testing
numerical_vars = ['Price', 'Kilometres', 'EngineCapacity', 'FuelConsumptionPer100km']
correlation_results = []

for i, var1 in enumerate(numerical_vars):
    for var2 in numerical_vars[i+1:]:  # Avoid duplicate pairs and self-correlation
        # Drop rows missing EITHER variable so the two series stay row-aligned.
        # Dropping NaN from each column separately would pair values from
        # different rows (or fail outright when the lengths end up different).
        complete_pairs = data[[var1, var2]].dropna()
        corr_coef, p_value = pearsonr(complete_pairs[var1], complete_pairs[var2])
        correlation_results.append({
            "Variable 1": var1,
            "Variable 2": var2,
            "Correlation Coefficient": corr_coef,
            "p-value": p_value,
            "Significant": p_value < 0.05
        })

correlation_results_df = pd.DataFrame(correlation_results)
correlation_results_df

Unnamed: 0,Variable 1,Variable 2,Correlation Coefficient,p-value,Significant
0,Price,Kilometres,-0.443446,0.0,True
1,Price,EngineCapacity,0.347329,0.0,True
2,Price,FuelConsumptionPer100km,0.107666,6.656307e-42,True
3,Kilometres,EngineCapacity,0.218354,1.2507230000000001e-169,True
4,Kilometres,FuelConsumptionPer100km,0.325385,0.0,True
5,EngineCapacity,FuelConsumptionPer100km,0.716208,0.0,True
