<a href="https://colab.research.google.com/github/Isna-gif/ST1_Assessment/blob/main/Australian_Vehicle_Price.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns  # For advanced visualization styles
from scipy.stats import skew, kurtosis  # For skewness and kurtosis
import plotly.express as px
import matplotlib.ticker as mtick
from scipy import stats
from scipy.stats import f_oneway
from sklearn.feature_selection import f_classif
from sklearn.preprocessing import LabelEncoder
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, median_absolute_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
import warnings
import joblib
import tkinter as tk
from tkinter import messagebox



# Placeholder constant; nothing in the visible script reads it.
ticker = 'USIX'

# Load the raw vehicle listings.
# NOTE(review): the path looks like a Google Drive mount inside Colab, so it will
# presumably only resolve when Drive is mounted - confirm before running elsewhere.
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Australian Vehicle Prices(Australian Vehicle Prices).csv')

# Show every column when a DataFrame is printed instead of eliding the middle ones.
pd.set_option('display.max_columns', None)

# Step 2: Problem Definition
print("\n### Defining the Problem ###")
print("In this project, we aim to predict vehicle prices based on various factors.")

# Step 3: Target variable
print("\n### Target Variable ###")
print("Dependent Variable (Target): Price of vehicles")
print("Independent Variables: Kilometres, FuelConsumption, Year, Brand, Model, Car/Suv, Title, \
UsedOrNew, Transmission, Engine, DriveType, FuelType, ColourExtInt, Location, BodyType, \
CylindersinEngine, Doors, Seats")

# Coerce 'Price' to numbers; anything that cannot be parsed becomes NaN and the
# corresponding rows are then discarded so the plots below only see real prices.
df["Price"] = pd.to_numeric(df["Price"], errors='coerce')  # unparseable entries become NaN
df = df.dropna(subset=["Price"])  # keep only rows that have a price

# Figure 1: histogram of prices, limited to the 0-200,000 range with tick labels
# rendered in thousands ("10K", "20K", ...).
plt.figure(figsize=(12, 7))
plt.hist(df["Price"], bins=40, color='steelblue', edgecolor='black', alpha=0.7)
# NOTE(review): kdeplot draws a density curve while the histogram above is in raw
# counts, so the curve is likely squashed against the x-axis - confirm this is intended.
sns.kdeplot(df["Price"], color='darkorange', lw=2)
plt.title("Figure 1: Distribution of Vehicle Prices", fontsize=16)
plt.xlabel("Price (in thousands)", fontsize=14)
plt.ylabel("Frequency", fontsize=14)
plt.xlim(0, 200000)
plt.ylim(0, 14000)
# One tick every 10,000, labelled as 0K, 10K, ..., 200K.
plt.xticks(ticks=range(0, 200001, 10000), labels=[f'{i // 1000}K' for i in range(0, 200001, 10000)])
plt.grid(axis='y', alpha=0.75)
plt.show()

# Analyze skewness and kurtosis of Price
print("\n### Analyzing Distribution for Class Imbalance ###")

# Shape statistics of the cleaned 'Price' column.
price_skew = df["Price"].skew()
price_kurtosis = df["Price"].kurt()
print(f'Skewness = {round(price_skew, 2)}')
print(f'Kurtosis = {round(price_kurtosis, 2)}')

# Interpretation of skewness: the sign tells which tail is longer.
if price_skew > 0:
    print(f"A skewness of {round(price_skew, 2)} indicates positive skew (longer tail on the right).")
elif price_skew < 0:
    print(f"A skewness of {round(price_skew, 2)} indicates negative skew (longer tail on the left).")
else:
    print(f"A skewness of {round(price_skew, 2)} indicates a symmetric distribution.")

# Series.kurt() reports *excess* kurtosis (Fisher's definition, where a normal
# distribution scores 0), so heavy tails start above 0 rather than above 3.
if price_kurtosis > 0:
    print(f"A kurtosis of {round(price_kurtosis, 2)} indicates a high degree of peakedness, suggesting many outliers.")
else:
    print(f"A kurtosis of {round(price_kurtosis, 2)} is within normal range.")

# Step 4: Data exploration at basic level
# Prints an overview of the frame, then de-duplicates and cleans it, and finally
# derives `df_cleaned`, which keeps only the non-qualitative columns.
print("\n### Data Exploration At Basic Level ###")

# First five rows of the frame as loaded (after the earlier Price clean-up).
print("\nOriginal DataFrame:")
print(df.head())

# Dataset size (rows * columns) and shape
print("\nOriginal Dataset")
print(f"Size: {df.size} elements")
print(f"Shape: {df.shape}")

# Column names and their dtypes before any further conversion
print(f"Attributes (Column Headers): {df.columns.tolist()}")
print("\nData types before conversion:")
print(df.dtypes)

# Per-column count of missing values
print("\nMissing values before conversion:")
print(df.isnull().sum())

# Data cleaning: remove exact duplicate rows, force Price and Kilometres to be
# numeric (unparseable values become NaN), drop rows missing either of them,
# and discard listings with a non-positive price.
df = df.drop_duplicates()
df["Price"] = pd.to_numeric(df["Price"], errors="coerce")
df["Kilometres"] = pd.to_numeric(df["Kilometres"], errors="coerce")
df = df.dropna(subset=["Price", "Kilometres"])
df = df[df["Price"] > 0]
print(f"\nAfter cleaning, DataFrame shape: {df.shape}")
print(df.head())

# Column groupings used by the rest of the script.
# NOTE(review): "quantitative" here only names the columns; nothing in this block
# converts them to numbers, so they may still hold text - verify against the data.
quantitative_cols = ["CylindersinEngine", "Doors", "Seats"]
qualitative_cols = ["Brand", "Model", "Car/Suv", "Title", "UsedOrNew", "Transmission", "Engine", "DriveType",
                    "FuelType", "ColourExtInt", "Location", "BodyType"]
continuous_cols = ["FuelConsumption", "Kilometres", "Price", "Year"]

print(f"Quantitative Columns: {quantitative_cols}")
print(f"Qualitative Columns: {qualitative_cols}")
print(f"Continuous Columns: {continuous_cols}")

# `df_cleaned` is `df` without the qualitative columns; `df` itself keeps them
# because the categorical plots and tests further down still read them.
df_cleaned = df.drop(columns=qualitative_cols)
print("\nDataFrame after removing unwanted columns:")
print(df_cleaned.head())
print(f"\nShape after removing unwanted columns: {df_cleaned.shape}")

# Dtypes and missing-value counts of the reduced frame
print("\nData types after cleaning:")
print(df_cleaned.dtypes)
print("\nMissing values after cleaning:")
print(df_cleaned.isnull().sum())

# Step 5: Visual Exploratory Data Analysis (EDA) of data (with histograms and bar charts)
print("\n### Visual Exploratory Data Analysis ###")

# Figure 2: number of listings per Brand, bars ordered from most to least frequent.
plt.figure(figsize=(14, 8))
sns.countplot(x='Brand', data=df, order=df['Brand'].value_counts().index)
plt.title('Figure 2: Vehicle Count by Brand', fontsize=16)
plt.xlabel('Brand', fontsize=14)
plt.ylabel('Count', fontsize=14)
plt.xticks(rotation=45, ha='right')  # slant the labels so brand names do not overlap
plt.tight_layout()
plt.show()

# Columns plotted by the helper functions that follow: the categorical ones become
# bar charts numbered from Figure 3, the continuous ones become histograms.
categorical_variables = ['UsedOrNew', 'Transmission', 'Engine', 'DriveType', 'FuelType', 'ColourExtInt', 'Location',
                         'BodyType']
continuous_variables = ["FuelConsumption", "Kilometres", "Price", "Year"]


# Function for categorical variable plots
def create_bar_plot(dataframe, column_name, figure_number, threshold=5):
    """Show an interactive bar chart of how often each category occurs.

    Categories seen fewer than ``threshold`` times are folded into a single
    'Other' bar so a long tail of rare values does not swamp the chart.

    Args:
        dataframe: DataFrame holding the column to summarise.
        column_name: Name of the categorical column to count.
        figure_number: Number used in the "Figure N" chart title.
        threshold: Minimum count a category needs to keep its own bar.
    """
    counts = dataframe[column_name].dropna().value_counts()
    frequent = counts[counts >= threshold]
    rare_total = counts[counts < threshold].sum()

    # Only append the 'Other' bucket when something was actually folded into it;
    # otherwise the chart would carry a spurious zero-height bar.
    if rare_total > 0:
        combined_counts = pd.concat([frequent, pd.Series({'Other': rare_total})])
    else:
        combined_counts = frequent

    fig = px.bar(x=combined_counts.index, y=combined_counts.values, labels={'x': column_name, 'y': 'Count'},
                 title=f'Figure {figure_number}: Count of Vehicles by {column_name}')
    fig.update_layout(xaxis_tickangle=-45, height=600)
    fig.update_traces(marker_line_color='black', marker_line_width=1.5)
    fig.show()

# Create bar plots for each categorical variable (figure numbering starts at 3)
for idx, var in enumerate(categorical_variables, start=3):
    create_bar_plot(df, var, idx)

# Histogram helper for the continuous variables
def create_histogram(dataframe, column_name, figure_number, bins=30):
    """Display an interactive histogram of one column, ignoring missing values.

    Args:
        dataframe: DataFrame that contains ``column_name``.
        column_name: Column whose distribution is drawn.
        figure_number: Number used in the "Figure N" chart title.
        bins: Requested number of histogram bins.
    """
    present_values = dataframe[column_name].dropna()
    chart_title = f'Figure {figure_number}: Distribution of {column_name}'

    figure = px.histogram(present_values, x=column_name, nbins=bins, title=chart_title)
    figure.update_layout(height=600, xaxis_title=column_name, yaxis_title='Count')
    figure.update_traces(marker_line_width=1.5, marker_line_color='black')
    figure.show()

# One histogram per continuous variable; figure numbering starts at 11
for idx, var in enumerate(continuous_variables, start=11):
    create_histogram(dataframe=df, column_name=var, figure_number=idx)

# Step 6: Outlier Analysis
print("\n### Outlier Analysis ###")

# Function to detect and remove outliers using the IQR method
def remove_outliers_iqr(dataframe, columns):
    """Drop rows that hold an IQR outlier in any of the given numeric columns.

    A value is an outlier when it lies below ``Q1 - 1.5 * IQR`` or above
    ``Q3 + 1.5 * IQR`` of its column. Missing values are *not* outliers: rows
    containing NaN are kept so that a later missing-value step can decide what
    to do with them, and so that the summary agrees with the rows removed.

    Args:
        dataframe: Frame to filter; it is not modified.
        columns: DataFrame with the numeric columns used to compute the bounds.
            Each of its column names must also exist in ``dataframe``.

    Returns:
        A tuple ``(summary, cleaned)`` where ``summary`` has the columns
        "Column Name" and "Number of Outliers", and ``cleaned`` is a copy of
        ``dataframe`` without the outlier rows.
    """
    # Quartiles and the resulting fences, one entry per column.
    q1 = columns.quantile(0.25)
    q3 = columns.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    # Per-column outlier counts. Comparisons against NaN are False, so missing
    # values never count as outliers here.
    outliers = (columns < lower_bound) | (columns > upper_bound)
    outlier_count = outliers.sum()

    # Use the same "strictly outside the fences" test to pick the rows to drop,
    # which keeps NaN rows instead of discarding them as a side effect.
    checked_values = dataframe[list(columns.columns)]
    row_has_outlier = ((checked_values < lower_bound) | (checked_values > upper_bound)).any(axis=1)
    dataframe_cleaned = dataframe.loc[~row_has_outlier].copy()

    # Summary table: one row per inspected column.
    dtype_dataframe_outlier = pd.DataFrame({
        "Column Name": outlier_count.index,
        "Number of Outliers": outlier_count.to_numpy(),
    })

    return dtype_dataframe_outlier, dataframe_cleaned

# Apply the outlier removal function to the numeric columns of df_cleaned
selected_numeric_columns = df_cleaned.select_dtypes(include=['float64', 'int64'])
df_outlier_summary, df_cleaned_without_outliers = remove_outliers_iqr(df_cleaned, selected_numeric_columns)

# Print the outlier summary
print(f"\nOutlier Summary:\n{df_outlier_summary}")

# Print the remaining rows after outlier removal
print(f"\nDataFrame after outlier removal (first 5 rows):\n{df_cleaned_without_outliers.head()}")
print(f"Remaining rows after outlier removal: {df_cleaned_without_outliers.shape}")


# Visualize each continuous variable before and after outlier removal
for var in continuous_variables:
    plt.figure(figsize=(12, 5))

    # Left panel: data before outlier removal
    plt.subplot(1, 2, 1)
    plt.hist(df[var], bins=40, color='steelblue', edgecolor='black', alpha=0.7)
    plt.title(f'Original Distribution of {var}')
    plt.xlabel(var)
    plt.ylabel('Frequency')

    # Right panel: the frame returned by remove_outliers_iqr. Plotting df_cleaned
    # here would show the same rows as the left panel, because df_cleaned still
    # contains the outliers.
    plt.subplot(1, 2, 2)
    plt.hist(df_cleaned_without_outliers[var], bins=40, color='darkorange', edgecolor='black', alpha=0.7)
    plt.title(f'Cleaned Distribution of {var} (Outliers Removed)')
    plt.xlabel(var)
    plt.ylabel('Frequency')
    plt.tight_layout()
    plt.show()

# Step 7: Missing Value Analysis and Treatment Options
print("\n### Missing Value Analysis ###")

# Number of missing values for each column in df_cleaned_without_outliers
missing_values = df_cleaned_without_outliers.isnull().sum()

# Tabulate the missing values per column
dtype_df_missing_values = pd.DataFrame(missing_values, columns=["Missing Values"]).reset_index()
dtype_df_missing_values = dtype_df_missing_values.rename(columns={"index": "Column Name"})

# Display the missing values
print(dtype_df_missing_values)

# Option 1: delete rows with missing values (kept in a separate frame for comparison)
df_cleaned_without_missing = df_cleaned_without_outliers.dropna()
print(f"\nDataFrame shape after deleting rows with missing values: {df_cleaned_without_missing.shape}")

# Option 2: impute missing values with the MEDIAN for continuous variables
continuous_cols = ["FuelConsumption", "Kilometres", "Price", "Year"]

for col in continuous_cols:
    if col not in df_cleaned_without_outliers.columns:
        continue
    column_values = df_cleaned_without_outliers[col]
    # A median only exists for numeric data. Columns that still hold text at this
    # point are skipped (FuelConsumption is only parsed into numbers later in
    # this script) instead of raising a TypeError.
    if not pd.api.types.is_numeric_dtype(column_values):
        continue
    if column_values.isnull().any():
        # Assign the result back: calling fillna(inplace=True) on a selected
        # column is a chained assignment that does not reliably update the frame.
        df_cleaned_without_outliers[col] = column_values.fillna(column_values.median())

print(f"\nDataFrame after imputing continuous variables with MEDIAN (first 5 rows):\n{df_cleaned_without_outliers.head()}")

# Impute missing values with the MODE for categorical variables
qualitative_cols = ["Brand", "Model", "Car/Suv", "Title", "UsedOrNew", "Transmission", "Engine", "DriveType",
                    "FuelType", "ColourExtInt", "Location", "BodyType"]

for col in qualitative_cols:
    if col not in df_cleaned_without_outliers.columns:
        continue
    column_values = df_cleaned_without_outliers[col]
    if column_values.isnull().any():
        modes = column_values.mode()
        # An all-missing column has no mode; leave it untouched in that case.
        if not modes.empty:
            df_cleaned_without_outliers[col] = column_values.fillna(modes[0])

print(f"\nDataFrame after imputing categorical variables with MODE (first 5 rows):\n{df_cleaned_without_outliers.head()}")
print("\nFinal DataFrame after Missing Value Treatment (first 5 rows)")
print(df_cleaned_without_outliers.head())
print(f"Remaining rows after missing value treatment: {df_cleaned_without_outliers.shape}")

# Step 8: Feature selection - Visual and statistical correlation analysis
print("\n## Feature selection - Visual and statistical correlation analysis ##")

def analyze_correlation(data, x_col, y_col, title_prefix):
    """Report and plot the Pearson correlation between two columns.

    Rows where either column is missing are ignored. For the remaining rows the
    function prints the 2x2 correlation matrix, shows it as a heatmap, prints a
    strength/direction label for the coefficient, and draws a scatter plot with
    a fitted regression line.

    Args:
        data: DataFrame containing both columns.
        x_col: Column plotted on the x-axis.
        y_col: Column plotted on the y-axis.
        title_prefix: Text inserted into the heatmap title before the column names.

    Returns:
        None. All results are printed or plotted.
    """
    numeric_columns = data[[x_col, y_col]].dropna()
    if numeric_columns.empty:
        print(f"No continuous numeric columns available for correlation analysis of {y_col} and {x_col}.")
        return

    # Correlation matrix, printed and shown as a heatmap
    correlation_matrix = numeric_columns.corr()
    print(f"\nCorrelation Matrix between {y_col} and {x_col}:")
    print(correlation_matrix)

    fig, ax = plt.subplots(figsize=(6, 4))
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=ax)
    plt.title(f"Correlation Matrix: {title_prefix} {x_col} and {y_col}")
    plt.show()

    # The off-diagonal entry is the coefficient between the two columns
    correlation_value = correlation_matrix.loc[y_col, x_col]
    print(f"\nPearson Correlation between {y_col} and {x_col}: {correlation_value:.4f}")

    # Label strength and direction the same way for every strength band, so a
    # strong correlation also says whether it is positive or negative.
    magnitude = abs(correlation_value)
    if magnitude >= 0.7:
        strength = "strong"
    elif magnitude >= 0.3:
        strength = "moderate"
    else:
        strength = "weak"
    direction = "negative" if correlation_value < 0 else "positive"
    print(f"This is a {strength} {direction} correlation.")

    # Scatter plot with a regression line to visualize the relationship
    plt.figure(figsize=(10, 6))
    sns.regplot(x=x_col, y=y_col, data=numeric_columns, scatter_kws={'alpha': 0.5}, line_kws={'color': 'red'})
    plt.title(f'Scatter Plot of {y_col} vs {x_col}\nPearson Correlation: {correlation_value:.4f}', fontsize=16)
    plt.xlabel(x_col, fontsize=14)
    plt.ylabel(y_col, fontsize=14)

    # Years are discrete, so put one tick on every year that occurs
    if x_col == 'Year':
        plt.xticks(ticks=numeric_columns[x_col].unique(), rotation=45)

    plt.tight_layout()
    plt.show()

# Clean and prepare the DataFrame
# Parse FuelConsumption into a number. Deleting every non-numeric character
# would glue separate numbers together: text such as '10 L / 100 km' would turn
# into 10100. Taking only the first number in the text avoids that.
# NOTE(review): assumes the consumption figure is the first number in the text - confirm against the data.
fuel_consumption = df_cleaned_without_outliers['FuelConsumption']
if not pd.api.types.is_numeric_dtype(fuel_consumption):
    first_number = fuel_consumption.astype(str).str.extract(r'(\d+(?:\.\d+)?)', expand=False)
    # Entries without any digits (for example a placeholder dash) become NaN
    df_cleaned_without_outliers['FuelConsumption'] = pd.to_numeric(first_number, errors='coerce')
df_cleaned_without_outliers.dropna(subset=['FuelConsumption', 'Year'], inplace=True)

# Analyze Price vs Kilometres
analyze_correlation(df_cleaned_without_outliers, 'Kilometres', 'Price', 'Price and')

# Analyze Price vs Fuel Consumption
analyze_correlation(df_cleaned_without_outliers, 'FuelConsumption', 'Price', 'Price and')

# Analyze Price vs Year
analyze_correlation(df_cleaned_without_outliers, 'Year', 'Price', 'Price and')

# Box plots of Price against each categorical variable
def plot_price_boxplot(data, column, title, xlabel, figsize, top_n=None, rotate_labels=True, tick_fontsize=None):
    """Draw one box plot of 'Price' grouped by a categorical column.

    Args:
        data: DataFrame containing ``column`` and 'Price'.
        column: Categorical column shown on the x-axis.
        title: Plot title.
        xlabel: Label for the x-axis.
        figsize: Figure size passed to ``plt.figure``.
        top_n: When given, only the ``top_n`` categories with the highest median
            price are drawn, ordered from highest to lowest median.
        rotate_labels: Whether to slant the x tick labels by 45 degrees.
        tick_fontsize: Optional font size for the x tick labels.
    """
    plot_data = data
    category_order = None
    if top_n is not None:
        # Rank the categories by median price and keep the most expensive ones
        category_order = data.groupby(column)['Price'].median().nlargest(top_n).index
        plot_data = data[data[column].isin(category_order)]

    plt.figure(figsize=figsize)
    sns.boxplot(x=column, y='Price', data=plot_data, order=category_order)
    plt.title(title, fontsize=16)
    plt.xlabel(xlabel, fontsize=14)
    plt.ylabel('Price (In Thousands)', fontsize=14)

    # Slanted labels prevent long category names from overlapping
    if rotate_labels:
        if tick_fontsize is None:
            plt.xticks(rotation=45, ha='right')
        else:
            plt.xticks(rotation=45, ha='right', fontsize=tick_fontsize)

    # Ensure the layout is not cut off
    plt.tight_layout()
    plt.show()


# One entry per figure, in display order. High-cardinality columns are limited
# to their 30 most expensive categories. Transmission is plotted once; the
# original repeated the identical Transmission figure a second time.
price_boxplot_specs = [
    dict(column='Brand', title='Box Plot of Vehicle Prices by Brand', xlabel='Brand', figsize=(14, 8)),
    dict(column='Car/Suv', title='Box Plot of Vehicle Prices by Top 30 Car/SUV Types', xlabel='Car/SUV Type',
         figsize=(16, 10), top_n=30, tick_fontsize=12),
    dict(column='UsedOrNew', title='Box Plot of Vehicle Prices by Used or New', xlabel='Used or New',
         figsize=(8, 6), rotate_labels=False),
    dict(column='Transmission', title='Box Plot of Vehicle Prices by Transmission Type',
         xlabel='Transmission Type', figsize=(10, 6)),
    dict(column='Engine', title='Box Plot of Vehicle Prices by Top 30 Engine Types', xlabel='Engine Type',
         figsize=(14, 8), top_n=30, tick_fontsize=12),
    dict(column='DriveType', title='Box Plot of Vehicle Prices by Drive Type', xlabel='Drive Type',
         figsize=(10, 6)),
    dict(column='FuelType', title='Box Plot of Vehicle Prices by Fuel Type', xlabel='Fuel Type',
         figsize=(10, 6)),
    dict(column='ColourExtInt',
         title='Box Plot of Vehicle Prices by Top 30 Colour Combinations (Exterior/Interior)',
         xlabel='Colour (Exterior/Interior)', figsize=(16, 8), top_n=30, tick_fontsize=12),
    dict(column='Location', title='Box Plot of Vehicle Prices by Top 30 Locations', xlabel='Location',
         figsize=(16, 8), top_n=30, tick_fontsize=12),
    dict(column='BodyType', title='Box Plot of Vehicle Prices by Body Type', xlabel='Body Type',
         figsize=(14, 6)),
]

for boxplot_spec in price_boxplot_specs:
    plot_price_boxplot(df, **boxplot_spec)

# Step 9 - Statistical feature selection (categorical vs. continuous) using ANOVA test
print("\n## Statistical feature selection (categorical vs. continuous) using ANOVA test ##")

# Define the target variable and categorical columns
target_variable = 'Price'
categorical_columns = ['Brand', 'Model', 'Car/Suv', 'Title', 'UsedOrNew', 'Transmission', 'Engine', 'DriveType', 'FuelType', 'ColourExtInt', 'Location', 'BodyType']

# Initialize a list to store ANOVA results
anova_results = []

# Define a function to perform ANOVA for each categorical variable
def perform_anova(df, categorical_column, continuous_column):
    """Run a one-way ANOVA of a continuous column across the levels of a categorical one.

    H0 (null hypothesis): the categorical variable has no effect on the continuous variable.
    H1 (alternative hypothesis): the categorical variable does have an effect.

    Rows where either column is missing are ignored. A missing category would
    otherwise form an empty group and turn the whole result into NaN.

    Args:
        df: DataFrame holding both columns.
        categorical_column: Name of the grouping column.
        continuous_column: Name of the numeric column being compared.

    Returns:
        dict with the keys 'Feature', 'F-Statistic' and 'P-Value'.

    Raises:
        ValueError: If fewer than two non-empty groups remain, since the test
            needs at least two groups to compare.
    """
    complete_rows = df[[categorical_column, continuous_column]].dropna()

    # A single groupby pass replaces one full-column comparison per category,
    # which matters for columns with thousands of distinct values.
    grouped_values = complete_rows.groupby(categorical_column, sort=False, observed=True)[continuous_column]
    groups = [values.to_numpy() for _, values in grouped_values]

    if len(groups) < 2:
        raise ValueError(
            f"ANOVA needs at least two groups, but '{categorical_column}' has {len(groups)}"
        )

    # Perform ANOVA
    anova_result = stats.f_oneway(*groups)

    return {
        'Feature': categorical_column,
        'F-Statistic': anova_result.statistic,
        'P-Value': anova_result.pvalue
    }

# State the hypotheses being tested
print("Statistical Feature selection using ANOVA test")
print("Null Hypothesis (H₀): The independent variable does not have any effect on the dependent variable (Price)")
print("Alternative Hypothesis (H₁): The independent variable does have an effect on the dependent variable (Price)\n")

# Significance bands, checked from strictest to loosest; the first band whose
# limit exceeds the p-value supplies the verdict.
significance_bands = (
    (0.01, "Strong evidence to reject H0: Significant relationship."),
    (0.05, "Moderate evidence to reject H0: Likely significant relationship."),
    (0.10, "Weak evidence to reject H0: Possible relationship."),
)
no_evidence_verdict = "Fail to reject H0: No significant relationship."

# Test every categorical column against the target and record the outcome
for column in categorical_columns:
    try:
        result = perform_anova(df, column, target_variable)
        p_value = result['P-Value']

        # Default verdict applies when no band matches
        interpretation = no_evidence_verdict
        for band_limit, band_verdict in significance_bands:
            if p_value < band_limit:
                interpretation = band_verdict
                break

        result['Interpretation'] = interpretation
        anova_results.append(result)

    except ValueError as e:
        print(f"Could not perform ANOVA for {column}: {e}")

# Collect all results in one table
anova_results_df = pd.DataFrame(anova_results)

# Render both statistics as text with six decimal places
for statistic_column in ('F-Statistic', 'P-Value'):
    anova_results_df[statistic_column] = anova_results_df[statistic_column].map("{:.6f}".format)

# Show the table, followed by a reminder of the dataset's structure
print("\nFormatted ANOVA Results:")
print(anova_results_df[['Feature', 'F-Statistic', 'P-Value', 'Interpretation']])
print("First few rows of the dataset:")
print(df.head())

print("Columns in the dataset:")
print(df.columns)

# Final predictors chosen for modelling
Sel_Columns = ['Brand', 'Model', 'UsedOrNew', 'Transmission', 'Engine', 'DriveType', 'FuelType','Kilometres']

# Frame restricted to the selected predictors
DataFor_ML = df.loc[:, Sel_Columns]

# Quick look at the predictor frame for verification
print("Data for Machine Learning (Predictors):")
print(DataFor_ML.head())

# Target variable: report an error if the column is absent, otherwise extract it
target_variable = 'Price'
if target_variable not in df.columns:
    print(f"Error: Target variable '{target_variable}' not found in the dataset.")
else:
    y = df[target_variable]
    print("\nTarget Variable:")
    print(y.head())

# Persist the predictor frame for model training
DataFor_ML.to_pickle('DataForML.pkl')


# Step 11 : Data conversion to numeric values for machine learning/predictive analysis

print("Original DataFrame:")  # First few rows before conversion
print(df.head())

print("\nColumn Names:")  # Column names, to identify the categorical ones
print(df.columns)

# Categorical variables that are replaced by dummy/indicator columns
categorical_columns = [
    'Brand', 'Model', 'Car/Suv', 'Title', 'UsedOrNew',
    'Transmission', 'Engine', 'DriveType', 'FuelType',
    'ColourExtInt', 'Location', 'BodyType'
]

# One indicator column per category level; the first level of each variable is
# dropped so the indicators are not perfectly collinear.
df_with_dummies = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

# Print the column names after get_dummies
print("\nColumn Names after get_dummies:")
print(df_with_dummies.columns)

# 'Engine' is one of the encoded columns, so get_dummies has already replaced it
# with 'Engine_*' indicators. The former "clean the Engine column" step could
# therefore never run and has been removed.

# Handle NaN values: fill with 0
df_with_dummies.fillna(0, inplace=True)

# Convert all boolean indicator columns to 0 and 1
bool_columns = df_with_dummies.select_dtypes(include='bool').columns
df_with_dummies[bool_columns] = df_with_dummies[bool_columns].astype(int)

# Convert any remaining text column to numbers. Values that are plain numbers
# are parsed as they are; for anything else the first number found in the text
# is used, so an entry with a unit suffix keeps its value instead of collapsing to 0.
# NOTE(review): presumably columns such as FuelConsumption, CylindersinEngine,
# Doors and Seats hold text like '8.7 L / 100 km' - verify against the data.
for col in df_with_dummies.columns:
    if not pd.api.types.is_numeric_dtype(df_with_dummies[col]):
        raw_values = df_with_dummies[col]
        parsed_whole = pd.to_numeric(raw_values, errors='coerce')
        first_number = pd.to_numeric(
            raw_values.astype(str).str.extract(r'(\d+(?:\.\d+)?)', expand=False),
            errors='coerce',
        )
        # Text without any number still falls back to 0, as before
        df_with_dummies[col] = parsed_whole.fillna(first_number).fillna(0)

# Convert every column to int dtype (fractional values are truncated)
df_with_dummies = df_with_dummies.astype(int)

# Display the first few rows of the modified DataFrame
print("\nDataFrame with Dummy Variables (0 and 1):")
print(df_with_dummies.head())

# Save the modified DataFrame to a new CSV file
df_with_dummies.to_csv('modified_file.csv', index=False)



# Step 12 : Train/test data split and standardisation/normalisation of data

# Predictors and target come from the fully numeric frame built in Step 11.
# Without this, X would be read before anything has assigned it.
X = df_with_dummies.drop(columns=['Price'])
y = df_with_dummies['Price']

PredictorScaler = MinMaxScaler()  # Min-Max normalization

# Split first, so the scaler never sees the test rows
X_train_raw, X_test_raw, y_training, y_testing = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit the scaler on the training predictors only and keep the fitted object
PredictorScalerFit = PredictorScaler.fit(X_train_raw)

# Apply the training-set scaling to both partitions. Test values outside the
# training range can therefore fall slightly outside [0, 1].
X_training = PredictorScalerFit.transform(X_train_raw)
X_testing = PredictorScalerFit.transform(X_test_raw)

# Display the resulting shapes
print("Training set shapes:")
print(f'X_train shape: {X_training.shape}')
print(f'y_train shape: {y_training.shape}')
print("Testing set shapes:")
print(f'X_test shape: {X_testing.shape}')
print(f'y_test shape: {y_testing.shape}')



# Step 13 : Investigating multiple regression algorithms
warnings.simplefilter(action='ignore', category=FutureWarning)  # Suppress FutureWarnings

# Sample dataset
# NOTE(review): the models below are trained on this synthetic data, not on the
# vehicle data prepared in the earlier steps - confirm that this is intended.
np.random.seed(42)
X = np.random.rand(100, 5)  # Five random features
y = 10000 + (500 * X[:, 0]) + (300 * X[:, 1]) + np.random.randn(100) * 100  # Linear signal plus noise

# Scaling/normalizing data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Candidate models whose performance is compared
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(random_state=42, max_depth=5),
    "Random Forest": RandomForestRegressor(random_state=42, n_estimators=100, max_depth=10),
    "AdaBoost": AdaBoostRegressor(random_state=42, n_estimators=100),
    "XGBoost": XGBRegressor(random_state=42, n_estimators=100, max_depth=5)
}

# Train, evaluate and plot each model
for name, model in models.items():
    model.fit(X_train, y_train)  # Train the model
    y_pred = model.predict(X_test)  # Predict on the held-out test set

    # Hold-out metrics
    r2 = r2_score(y_test, y_pred)
    # Root mean squared error. The square root is taken explicitly because the
    # `squared=False` argument of mean_squared_error is not accepted by recent
    # scikit-learn releases.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mean_accuracy = 100 - (rmse / np.mean(y_test) * 100)
    median_accuracy = 100 - (median_absolute_error(y_test, y_pred) / np.median(y_test) * 100)

    # 10-fold cross-validated R², expressed as a percentage and averaged
    cross_val_accuracies = cross_val_score(model, X_scaled, y, cv=10, scoring='r2')
    accuracy_values = (100 * cross_val_accuracies)
    final_avg_accuracy = np.mean(accuracy_values)

    # Print results for each model
    print(f"{name}:")
    print(f"  R² Value: {r2:.3f}")
    print(f"  Mean Accuracy on test data: {mean_accuracy:.2f}%")
    print(f"  Median Accuracy on test data: {median_accuracy:.2f}%")
    print(f"  Final Average Accuracy: {final_avg_accuracy:.2f}%")
    print("-" * 30)

    # Histogram of the predicted values
    plt.figure()
    plt.hist(y_pred, bins=20, alpha=0.7, color='blue', edgecolor='black')
    plt.title(f'Histogram of Predicted Values: {name}')
    plt.xlabel('Predicted Values')
    plt.ylabel('Frequency')
    plt.grid(axis='y', alpha=0.75)
    plt.show()

# Step 14 : Selection of Best Model

# Recorded accuracy of each candidate, listed in evaluation order
recorded_accuracies = {
    "Linear Regression": 0.50,
    "Decision Tree Regressor": -0.34,
    "Random Forest Regressor": 0.41,
    "AdaBoost Regressor": 0.48,
    "XGBoost Regressor": 0.31,
}

# Start with no winner and a baseline of 0.0; a candidate only takes the lead
# when its accuracy is strictly higher than the best seen so far.
Best_Model = ""
Best_Accuracy = 0.0

for candidate_name, candidate_accuracy in recorded_accuracies.items():
    if candidate_accuracy > Best_Accuracy:
        Best_Model, Best_Accuracy = candidate_name, candidate_accuracy

# Display the best model and its accuracy
print("The best model is:", Best_Model, "with an average accuracy of:", f"{Best_Accuracy:.2f}")


# Step 15 : Tkinter
# Load the complete dataset used by the desktop predictor
data = pd.read_csv('AustralianVehiclePrices.csv')

# Coerce the three modelling columns to numbers; unparseable entries become NaN
for numeric_field in ('Price', 'Kilometres', 'Year'):
    data[numeric_field] = pd.to_numeric(data[numeric_field], errors='coerce')

# Keep only rows where price, kilometres and year are all present
data = data.dropna(subset=['Price', 'Kilometres', 'Year'])

# Two predictors, one target
X = data[['Kilometres', 'Year']]
y = data['Price']

# Fit a linear regression on all available rows
model = LinearRegression()
model.fit(X, y)

# Serialize the fitted model so the GUI callbacks can load it
joblib.dump(model, 'linear_regression_model.pkl')
print("Model saved as 'linear_regression_model.pkl'")

def predict_vehicle_price(kilometres, year):
    """Return the price predicted by the saved model for one vehicle.

    Args:
        kilometres: Odometer reading of the vehicle.
        year: Year of the vehicle.

    Returns:
        The first (and only) element of the model's prediction.
    """
    # Single-row frame with the same column names the model was trained on
    features = pd.DataFrame({'Kilometres': [kilometres], 'Year': [year]})
    estimator = joblib.load('linear_regression_model.pkl')
    return estimator.predict(features)[0]

def on_predict():
    """Read both entry fields, predict a price and show it in a dialog.

    If either field cannot be converted to a number, an error dialog is shown
    instead of a prediction.
    """
    try:
        km_value = float(kilometres_entry.get())
        year_value = int(year_entry.get())
        price = predict_vehicle_price(km_value, year_value)
        messagebox.showinfo("Prediction", f'The predicted vehicle price is: ${price:.2f}')
    except ValueError:
        messagebox.showerror("Input Error", "Please enter valid numeric values for Kilometres and Year.")

# Build the predictor window. Widgets are packed top to bottom in the order
# they appear here, so the statement order defines the layout.
root = tk.Tk()
root.title("Vehicle Price Predictor")

# Kilometres input: a label followed by its entry field
tk.Label(root, text="Kilometres:").pack(pady=5)
kilometres_entry = tk.Entry(root)
kilometres_entry.pack(pady=5)

# Year input: a label followed by its entry field
tk.Label(root, text="Year:").pack(pady=5)
year_entry = tk.Entry(root)
year_entry.pack(pady=5)

# Predict button: runs on_predict, which reads the two entry fields above
predict_button = tk.Button(root, text="Predict", command=on_predict)
predict_button.pack(pady=20)

# Quit button: stops the event loop started by mainloop() below
quit_button = tk.Button(root, text="Quit", command=root.quit)
quit_button.pack(pady=5)

# Start the Tk event loop; this call blocks until the loop is stopped
root.mainloop()

FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/Colab Notebooks/Australian Vehicle Prices(Australian Vehicle Prices).csv'