# ITU MINDSET DATA SCIENCE PROJECT

   #### Team Member 1: Emrullah DAĞ
   #### Team Member 2: Metin ÖKTEM
   #### Team Member 3: Ömer Faruk SOY
   
   #### DATE: 29.01.2024

# 1. Import Packages

In [None]:
# Used for scientific computations.
import numpy as np 

# Used for data analysis and manipulation.
import pandas as pd 

# Used to control warning messages in a Python program.
import warnings 
warnings.filterwarnings('ignore')

# Used for data visualization.
import seaborn as sns
import matplotlib.pyplot as plt  
import IPython
import plotly.express as px
import plotly.graph_objects as go
from ipywidgets import VBox
from scipy import stats

# Splits the dataset into training and testing sets.
from sklearn.model_selection import train_test_split

# Modelling methods.
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor as randomForestRegressor
from xgboost import XGBRegressor

# Evaluate regression model performance.
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Standardization for the deep learning model
from sklearn.preprocessing import StandardScaler

# Build deep learning models.
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

# These libraries are used for various tasks in data processing, machine learning and deep learning.

# 2.  Description of Data

## 2.1 Feature Dictionary

* Brand: The brand or manufacturer of the vehicle.

* Year: The year the vehicle was manufactured.

* Model: The specific model name or identifier of the vehicle.

* Car/Suv: Indicates whether the vehicle is a car or an SUV.

* Title: The title status of the vehicle (e.g., clean title, salvage title).

* UsedOrNew: Indicates whether the vehicle is used or new.

* Transmission: The type of transmission the vehicle has (e.g., automatic, manual).

* Engine: Details about the vehicle's engine, which may include information like engine type, size, or configuration.

* DriveType: The type of drive the vehicle uses (e.g., front-wheel drive, rear-wheel drive, all-wheel drive).

* FuelType: The type of fuel the vehicle uses (e.g., gasoline, diesel, electric).

* FuelConsumption: Information about the vehicle's fuel consumption.

* Kilometres: The number of kilometers the vehicle has been driven.

* ColourExtInt: The exterior and interior colors of the vehicle.

* Location: The geographical location or region where the vehicle is located.

* CylindersinEngine: The number of cylinders in the vehicle's engine.

* BodyType: The body type of the vehicle (e.g., sedan, coupe, SUV).

* Doors: The number of doors on the vehicle.

* Seats: The number of seats in the vehicle.

* Price: The price of the vehicle.

## 2.2  Get Data
### This data has been imported from the Kaggle dataset available at: https://www.kaggle.com/datasets/nelgiriyewithana/australian-vehicle-prices.


In [None]:
# NOTE(review): hard-coded, machine-specific absolute path; point it at your
# local copy of the Kaggle CSV before running the notebook.
path = "/Users/apple/Downloads/Australian Vehicle Prices.csv"

In [None]:
# Load the CSV at `path` into `df`, the raw DataFrame the later cells copy from.
df = pd.read_csv(path)

In [None]:
# Display the first rows of the loaded data.
df.head()

# 3. Data Processing

## 3.1 Structure of the data

In [None]:
def get_know_data(data):
    """Print a quick overview of a pandas DataFrame or Series.

    Prints, in order: the ``info()`` summary, the ``describe()`` statistics,
    the number of duplicated rows/values, the NaN counts and the size of the
    object.  Anything that is not a DataFrame or Series only produces an
    error message.

    Parameters
    ----------
    data : pandas.DataFrame or pandas.Series
        The object to summarise.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    if not isinstance(data, (pd.DataFrame, pd.Series)):
        print("\033[1m"+"\nError: Out of use data type. Expected Dataframe or Series.\n"+ "\033[0m")
        return

    print("-"*70)
    print("\033[1m"+"\nDataset overall informations:\n"+ "\033[0m")
    # info() writes its report itself and returns None, so wrapping it in
    # print() only added a stray "None" line.
    data.info()
    print("-"*70)
    print("\033[1m"+"\nTake a statistical look:\n"+ "\033[0m")
    print(data.describe())
    print("-"*70)
    # Every report below uses the `data` argument; reading a global frame
    # here would describe the wrong object (or fail if none exists).
    print("\033[1m"+"\n Duplicated values count:\n"+ "\033[0m")
    print(data.duplicated().sum())
    print("\033[1m"+"\n NaN values:\n"+ "\033[0m")
    print(data.isnull().sum())
    print("-"*70)
    print("\033[1m"+"\nSize of dataset: \n "+ "\033[0m")
    # A Series is one-dimensional and has no shape[1]; report it as one column.
    column_count = data.shape[1] if data.ndim > 1 else 1
    print("Columns:\t{}".format(column_count))
    print("Rows:\t\t{}".format(data.shape[0]))
        
# Print the overview for the loaded dataset.
get_know_data(df)

In [None]:
# Work on a copy so the raw frame stays untouched.
df_box = df.copy()
# Keep the text in front of the ' L / 100 km' unit suffix and coerce it to a
# number in one step; entries that cannot be parsed become NaN.
df_box["FuelConsumption"] = pd.to_numeric(
    df_box["FuelConsumption"].str.split(' L / 100 km', expand=True)[0],
    errors='coerce',
)
sns.boxplot(df_box["FuelConsumption"])

In [None]:
# Box plot of the Year column of the raw data.
sns.boxplot(df["Year"])

In [None]:
# Convert Price to numbers BEFORE filtering. Comparing a text column with the
# integer 1500000 never matches, so the outlier would silently stay in.
# NOTE(review): Price appears to be loaded as text (it needs errors='coerce') — confirm.
df_box["Price"] = pd.to_numeric(df_box["Price"], errors='coerce')
# Drop the single 1,500,000 outlier; .copy() keeps df_box an independent frame
# so later column assignments do not act on a slice.
df_box = df_box[df_box["Price"] != 1500000].copy()
sns.boxplot(df_box["Price"])

In [None]:
# Make sure Price is numeric before comparing it: a `<` comparison on text
# values raises TypeError, so the conversion has to come first.
df_box["Price"] = pd.to_numeric(df_box["Price"], errors='coerce')
# Keep only vehicles cheaper than 200,000 (rows with NaN prices are dropped by
# the comparison as well); .copy() keeps df_box an independent frame.
df_box = df_box[df_box["Price"] < 200000].copy()
sns.boxplot(df_box["Price"])

In [None]:
# Build a numeric view of the data and plot the correlation heatmap.
df_visual = df.copy()
df_visual.drop(['Title','ColourExtInt','Model','Location','Car/Suv',"Engine"],axis=1,inplace=True)


# Keep only the text in front of the unit suffix of each column.
df_visual["Seats"] = df_visual['Seats'].str.split(' Seats', expand=True)[0]
df_visual["Doors"] = df_visual['Doors'].str.split(' Doors', expand=True)[0]
df_visual["CylindersinEngine"] = df_visual['CylindersinEngine'].str.split(' cyl', expand=True)[0]

# Coerce the listed columns to numbers; unparseable entries become NaN.
columns = ["Kilometres","Seats","Price","Doors","CylindersinEngine"]
for column in columns:
    df_visual[column] = pd.to_numeric(df_visual[column], errors='coerce')
# dropna() returns a new frame; without the assignment the result was thrown
# away and no row was actually removed.
df_visual = df_visual.dropna()


numeric_columns = df_visual.select_dtypes(include=['float64', 'int64']).columns
corr = df_visual[numeric_columns].corr()


plt.figure(figsize=(10, 8))
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Heatmap')
plt.show()

In [None]:
columns = [
    'UsedOrNew',
    'Transmission',
    'DriveType',
    'FuelType',
    'CylindersinEngine',
    'BodyType',
    'Doors',
    'Seats',
]

# Every pie chart uses the same fixed canvas size.
_pie_layout = dict(autosize=False, width=1000, height=600)

# One pie chart per listed column of the raw data.
for column in columns:
    # update_layout() returns the figure it was called on, so it can be chained.
    fig = px.pie(df, names=column, title="Distribution of "+column).update_layout(**_pie_layout)
    fig.show()


##### Percentage of NaN values

In [None]:
# Share of missing values per column, in percent, highest first.
percentage_nulls = (df.isnull().mean() * 100).sort_values(ascending=False)
# Filter on the exact values first: rounding to whole percents before the
# `!= 0` test made every column with fewer than 0.5 % missing values vanish.
percentage_nulls = percentage_nulls[percentage_nulls != 0]
# Two decimals keep small but non-zero shares visible.
percentage_nulls = percentage_nulls.round(2)
percentage_nulls

In [None]:
# Numeric copy of the two columns needed for the regression plots.
df_price = df.copy()
for _numeric_col in ("Kilometres", "Price"):
    # Entries that cannot be parsed as numbers become NaN.
    df_price[_numeric_col] = pd.to_numeric(df_price[_numeric_col], errors='coerce')
df_price = df_price.dropna(subset=["Kilometres", "Price"])

column_sels = ['Year', 'Kilometres']
target = df_price['Price']
fig, axs = plt.subplots(nrows=2, ncols=1, figsize=(12, 6))

# One regression plot of Price against each selected column, stacked vertically.
for i, col in enumerate(column_sels):
    sns.regplot(x=col, y=target, data=df_price, ax=axs[i])

plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=5.0)
plt.show()

## 3.2 Manipulation of the Data

##### Data manipulation refers to the process of making changes, transformations, or adjustments to a dataset, often involving tasks such as 
* cleaning 
* filtering
* reformatting 
* aggregating
##### to prepare it for analysis or modeling.

In [None]:
# Display an illustrative data-cleaning image referenced by URL, 750 px wide.
url = 'https://uploads-ssl.webflow.com/5a00e7aa079aa40001b3c4fb/5d5c22e040c6beab16860e8e_data-cleaning-thumb.png'
IPython.display.Image(url,  width= 750)

In [None]:
# Work on a copy to preserve the original data.
df2 = df.copy()

In [None]:
# Helper class for in-place column edits on a single DataFrame.
class DataFrameManipulator:
    """Thin wrapper that renames or drops columns of a DataFrame in place.

    The wrapped frame is stored by reference (no copy), so every change is
    also visible through the object that was passed to the constructor.
    """

    def __init__(self, dataframe):
        """Keep a reference to ``dataframe``."""
        self.dataframe = dataframe

    def rename_columns(self, new_names):
        """Rename columns in place using the ``{old: new}`` mapping ``new_names``."""
        self.dataframe.rename(new_names, axis="columns", inplace=True)

    def drop_columns(self, columns_to_drop):
        """Remove the column label(s) ``columns_to_drop`` in place.

        Raises ``KeyError`` (from pandas) when a label does not exist.
        """
        self.dataframe.drop(columns_to_drop, axis="columns", inplace=True)

# Wrap df2; edits made through `manipulator` modify df2 itself.
manipulator = DataFrameManipulator(df2)

In [None]:
# Renaming: put the unit of each quantity into its column name.
manipulator.rename_columns({
    "Engine": "VolumeEngine(L)",
    "Price": "Price(AUD)",
    "FuelConsumption": "FuelConsumption(L/100km)",
})

In [None]:
# (column, separator, piece to keep), applied top to bottom.  The engine
# column is split twice: first on ',' (keeping the second piece), later on
# ' L' (keeping the first piece).
_split_steps = (
    ("VolumeEngine(L)", ',', 1),
    ("Seats", ' Seats', 0),
    ("Doors", ' Doors', 0),
    ("VolumeEngine(L)", ' L', 0),
    ("CylindersinEngine", ' cyl', 0),
    ("FuelConsumption(L/100km)", ' L', 0),
)
for _name, _sep, _piece in _split_steps:
    df2[_name] = df2[_name].str.split(_sep, expand=True)[_piece]

In [None]:
# Cleaning: remove the listed descriptive columns, then drop every row that
# still contains a missing value (both steps modify df2 in place).
manipulator.drop_columns(['Brand', 'Model', 'Car/Suv', 'Title',  'Location',"ColourExtInt"])
df2.dropna(inplace=True)

In [None]:
# Exploring unique values: print the distinct values of each column, followed
# by a line of 50 asterisks and a blank line.
for i in df2.columns:
    print(df2[i].unique(), "*"*50+"\n", sep="\n")

In [None]:
# Getting dummies: one-hot encode the categorical columns and append the
# resulting indicator columns to the cleaned frame.
_categorical = ["BodyType","UsedOrNew","Transmission","DriveType","FuelType"]
df_dummy = pd.get_dummies(df2[_categorical])
df_reel = pd.concat([df2, df_dummy], axis="columns")
df_reel.head()

In [None]:
# Drop the original categorical columns; their dummy columns remain in df_reel.
df_reel.drop(["UsedOrNew","Transmission","DriveType","FuelType","BodyType"], axis =1, inplace = True)

In [None]:
# Print column names, non-null counts and dtypes of the encoded frame.
df_reel.info()

In [None]:
# Object to numeric: values that cannot be parsed become NaN (errors='coerce');
# downcast='integer' asks pandas for the smallest integer dtype when possible.
columns = ["Year","VolumeEngine(L)","FuelConsumption(L/100km)","Kilometres","CylindersinEngine","Doors","Seats","Price(AUD)"]
for column in columns:
    df_reel[column] = pd.to_numeric(df_reel[column], errors='coerce', downcast='integer')

In [None]:
# Remove, in place, every row that contains a NaN (e.g. values that failed
# the numeric conversion).
df_reel.dropna(inplace=True)

In [None]:
# List the column names, including the generated dummy columns.
df_reel.columns

In [None]:
# Remove the selected dummy columns in place.
# NOTE(review): presumably the '-' placeholder levels plus one reference level
# per encoded feature — confirm.
df_reel.drop(
    columns=[
        "Transmission_-",
        "FuelType_-",
        "FuelType_Other",
        "DriveType_Other",
        "Transmission_Manual",
        "UsedOrNew_DEMO",
        "BodyType_Other",
    ],
    inplace=True,
)

# 4. Modelling
## 4.1 Split Train and Test Data


In [None]:
# Independent copy of the prepared data for the linear-regression model.
df3 = df_reel.copy()

In [None]:
# Features are every column except the target; the target is the price.
y = df3["Price(AUD)"]
X = df3.drop(columns="Price(AUD)")
# Hold out 20 % of the rows for testing, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=41,
)

## 4.2 Implementation Models
### 4.2.1 Linear Regression

In [None]:
# Fit a linear regression with default settings on the training split.
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

In [None]:
ypred = linear_model.predict(X_test)
# The MSE is needed twice (as-is and under the square root), so compute it once.
_mse_value = mean_squared_error(y_test, ypred)
_report = (
    ('MAE:', mean_absolute_error(y_test, ypred)),
    ('MSE:', _mse_value),
    ('RMSE:', np.sqrt(_mse_value)),
    ("r2 score: ", r2_score(y_test, ypred)),
)
for _label, _value in _report:
    print(_label, _value)

In [None]:
# Plot actual test prices (x axis) against the linear model's predictions
# (y axis) using star markers.
ypred = linear_model.predict(X_test)
plt.plot(y_test, ypred, "*")

### 4.2.2 Random Forest Regression

##### Trying to improve the R² score with a different method


In [None]:
# Independent copy of the prepared data for the random-forest model.
df4 = df_reel.copy()

In [None]:
# Display the first rows of the copy.
df4.head()

In [None]:
# Target is the price; every other column is used as a feature.
y = df4["Price(AUD)"]
X = df4.drop(columns="Price(AUD)")

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=41,
)

In [None]:
# Seed the forest so the reported score can be reproduced from run to run;
# 41 matches the seed already used for the train/test split.
rfr=randomForestRegressor(random_state=41)
rfr.fit(X_train,y_train)

In [None]:
y_pred = rfr.predict(X_test)

# Compute all four test-set metrics first ...
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# ... then report them in the usual order.
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("Mean Absolute Error (MAE):", mae)
print("R-squared (R2) Score:", r2)

In [None]:
# Plot actual test prices (x axis) against the random forest's predictions
# (y axis) using star markers.
ypred = rfr.predict(X_test)
plt.plot(y_test, ypred, "*")

### 4.2.3 XGBoost Regression


In [None]:
# Independent copy of the prepared data for the XGBoost model.
df5 = df_reel.copy()

In [None]:
# Target is the price; every other column is used as a feature.
y = df5["Price(AUD)"]
X = df5.drop(columns="Price(AUD)")

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=41,
)

In [None]:
# XGBoost regressor created without arguments (library defaults), fitted on
# the training split.
my_model = XGBRegressor()

my_model.fit(X_train, y_train, verbose=True)

In [None]:
y_pred = my_model.predict(X_test)

# Test-set error metrics; RMSE is derived from the MSE.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report each metric on its own line.
for _title, _score in (
    ("Mean Squared Error (MSE):", mse),
    ("Root Mean Squared Error (RMSE):", rmse),
    ("Mean Absolute Error (MAE):", mae),
    ("R-squared (R2) Score:", r2),
):
    print(_title, _score)

In [None]:
# Plot actual test prices (x axis) against the XGBoost predictions (y axis)
# using star markers.
ypred = my_model.predict(X_test)
plt.plot(y_test, ypred, "*")

# 5. Evaluation Results
##### R² scores on the test set:

>1. Linear Regression = 0.5952889703737149
>2. Random Forest Regression = 0.8513690157792952
>3. XGBoost Regression = 0.8525739759646642

##### Therefore, XGBoost Regression was the best performing model for this analysis.