In [None]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

%matplotlib inline

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import r2_score, mean_squared_error

from sklearn.preprocessing import LabelEncoder
lab = LabelEncoder()

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Location of the raw CSV inside the Kaggle input mount.
csv_path = '/kaggle/input/car-price-prediction-challenge/car_price_prediction.csv'
df = pd.read_csv(csv_path)


In [None]:
# Preview the first five rows of the freshly loaded frame.
df.head(5)

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Show the row index of the loaded frame.
df.index

In [None]:
# Show the dtype pandas assigned to each column.
df.dtypes

In [None]:
# Count fully duplicated rows. A cell only displays its last expression, so the
# separate `.any()` call that used to precede this line produced no visible
# output; the count answers both questions (a value > 0 means duplicates exist).
df.duplicated().sum()

In [None]:
# Summary statistics of the numeric columns. The method must be *called*:
# a bare `df.describe` only displays the bound method, not the statistics.
df.describe()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Fill missing cylinder counts with the column mean. The result is assigned
# back to the column instead of calling fillna(..., inplace=True) on the
# selected Series: the in-place form acts on an intermediate object, is
# deprecated in recent pandas and does not update `df` under copy-on-write.
df['Cylinders'] = df['Cylinders'].fillna(df['Cylinders'].mean())

In [None]:
# Print the distinct values of every column, followed by a row of 50 backslashes
# that visually separates one column's listing from the next.
separator = '\\' * 50
for column_name in df.columns:
    distinct_values = df[column_name].unique()
    print(f'Category in {column_name} is :\n {distinct_values}\n')
    print(separator)


In [None]:
# Remove the 'ID' and 'Doors' columns from the frame.
df = df.drop(columns=['ID', 'Doors'])

In [None]:
# Levy uses '-' as a placeholder; map it to '0' first, then parse the whole
# column as float64.
levy_text = df['Levy'].replace('-', '0')
df['Levy'] = levy_text.astype('float64')

In [None]:
# Derive each car's age in years from its production year, relative to the
# calendar year in which the notebook is executed.
dtime = dt.datetime.now()
current_year = dtime.year
df['Age'] = current_year - df['Prod. year']


In [None]:
# Optional (left disabled): drop 'Prod. year' now that 'Age' has been derived from it.
# df = df.drop('Prod. year', axis=1)


In [None]:
# Preview the first rows, now including the derived Age column.
df.head()

In [None]:
# Strip the 'km' unit suffix from Mileage together with the whitespace it
# leaves behind. regex=False makes the literal match explicit instead of
# relying on the version-dependent default of Series.str.replace.
mileage_text = df['Mileage'].str.replace('km', '', regex=False).str.strip()

# Parse the cleaned text as numbers and store it as nullable Int64.
df['Mileage'] = pd.to_numeric(mileage_text).astype('Int64')

In [None]:
# Check the first Mileage values after the cleaning step above.
df.Mileage.head()

Engine volume

In [None]:
engine_col = 'Engine volume'

# Show a few raw values before touching the column.
print(df[engine_col].head())

# Remove the 'Turbo' marker from the text values.
without_turbo = df[engine_col].str.replace('Turbo','')
df[engine_col] = without_turbo

# Show the same sample again to confirm the replacement.
print(df[engine_col].head())

# Parse what is left as float64.
df[engine_col] = df[engine_col].astype('float64')


In [None]:
# List the distinct engine volumes of the column.
df['Engine volume'].unique()

**Let's check the last version of our dataset after our processing**

In [None]:
# Preview the frame after the cleaning steps above.
df.head()

**Analysis📝 & Visualization📊**

In [None]:
# Histogram grid (25 bins per panel) of the frame's columns.
hist_options = dict(bins=25, figsize=(15, 10), color='peru')
df.hist(**hist_options)
plt.show()

THE RESULTS
1- Levy column: most values are greater than 0 and less than 2000

2- Engine volume column: most values are in the range 1 to 5

3- Mileage column: most cars have a mileage of 0.0 (most cars are new)

4- Airbags column: most cars have 3 to 5 airbags, or 13 airbags

5- Age column: most cars are about 13 years old, but there are some very old cars

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Bar chart of the number of cars per colour, drawn on a single wide axes.
fig, ax = plt.subplots(figsize=(15, 5), dpi=120)

# One bar per distinct value of the 'Color' column.
sns.countplot(data=df, x='Color', palette='hot', ax=ax)

ax.set_title("Distribution of Colors", fontsize=20)

plt.show()


In [None]:
# Frequency of each manufacturer, sorted from most to least common; keep the first ten.
manufacturer_counts = df['Manufacturer'].value_counts()
top_10_cars = manufacturer_counts.sort_values(ascending=False).head(10)
top_10_cars

In [None]:
# Horizontal bar chart: manufacturer names on the y axis, their frequency on the x axis.
fig, ax = plt.subplots(figsize=(15, 10))
sns.barplot(x=top_10_cars, y=top_10_cars.index, palette='hot', linewidth=4, ax=ax)
ax.set_title('Top10 The Most Frequent Cars', loc='center', fontweight='bold', fontsize=18)
ax.set_xlabel('Frequency', fontsize=20)
ax.set_ylabel('Cars', fontsize=20)
fig.tight_layout()
plt.show()

In [None]:
# Numeric subset of the frame: every column whose dtype is not `object`.
numeric_data = df.select_dtypes(exclude=[object])

In [None]:
# Print the column labels currently present in df.
print(df.columns)


In [None]:
# Preview the numeric subset. A bare `.head()` as the cell's last expression
# gives a short, richly rendered table instead of a plain-text print of the
# whole frame.
numeric_data.head()

In [None]:
# Same selection as the earlier numeric_data cell (all non-object columns);
# running it again simply refreshes the variable from the current df.
numeric_data = df.select_dtypes(exclude=object)

In [None]:
# Distribution and box plot of every numeric feature before outlier handling.
for col in numeric_data:
    # Plain float64 values keep both plot types away from pandas' nullable dtypes.
    values = df[col].astype('float64')

    fig, axes = plt.subplots(1, 2, constrained_layout=True, figsize=(20, 6))

    # histplot(kde=True) replaces the deprecated sns.distplot; stat='density'
    # keeps the density-scaled y axis and bins=50 caps the bin count so that
    # extreme values cannot blow up the automatic binning.
    sns.histplot(values, kde=True, stat='density', bins=50, ax=axes[0]).set(title="Distplot")

    # Pass the series as `x` explicitly: a bare positional argument is read as
    # `data` by current seaborn releases.
    sns.boxplot(x=values, ax=axes[1]).set(title="Boxplot")

    fig.suptitle(f'{col.title()} (Before handling outliers)', weight='bold')

    # plt.show() renders on the inline backend (fig.show() needs a GUI backend);
    # closing the figure afterwards keeps memory flat across the loop.
    plt.show()
    plt.close(fig)

In [None]:
# Count the IQR-rule outliers of every numeric column and report them as a
# share of all rows.
for col in numeric_data:
    lower_q = df[col].quantile(0.25)
    upper_q = df[col].quantile(0.75)
    iq = upper_q - lower_q

    # Fences sit 1.5 * IQR outside the quartiles. The upper fence used to be
    # computed as (75th percentile - 1.5 * IQR), which lies *below* the upper
    # quartile and flagged ordinary values as outliers.
    low = lower_q - 1.5 * iq
    high = upper_q + 1.5 * iq
    outlier = ((numeric_data[col] > high) | (numeric_data[col] < low)).sum()

    total = numeric_data[col].shape[0]
    print(f"Total Outliers in {col} are :{outlier}---{round(100*(outlier)/total,2)}%")

In [None]:
# Build `data`, the outlier-filtered copy of `df` used by the following cells.
# The fences are recomputed here for every numeric column, so this cell no
# longer depends on `col`, `low`, `high` and `outlier` leaking out of the
# counting loop (they only described the last column), and `data` is always
# defined, even when no column contains an outlier.
# NOTE(review): a column whose IQR is 0 keeps only the rows equal to its
# quartile value — confirm this is acceptable for every numeric column.
within_fences = pd.Series(True, index=df.index)
for col in df.select_dtypes(exclude=object).columns:
    lower_q = df[col].quantile(0.25)
    upper_q = df[col].quantile(0.75)
    iqr = upper_q - lower_q
    in_range = df[col].between(lower_q - 1.5 * iqr, upper_q + 1.5 * iqr)
    # A missing value counts as out of range, matching a plain <= / >= comparison.
    within_fences &= in_range.fillna(False).astype(bool)

# .copy() gives later cells an independent frame rather than a slice of df.
data = df.loc[within_fences].copy()

**Transform Data**

In [None]:
# Partition the filtered frame by dtype: object (text) columns on one side,
# everything else on the other.
obdata, numdata = (
    data.select_dtypes(include=[object]),
    data.select_dtypes(exclude=[object]),
)


In [None]:
# Label-encode every text column into integer codes.
# A new frame is built instead of assigning through `iloc` into `obdata`:
# that wrote into a slice of another frame and left the columns with object
# dtype. Each column gets its own encoder, kept in `encoders`, so the fitted
# classes of every column stay available (a single shared encoder only
# remembers the last column it was fitted on).
encoders = {}
encoded_columns = {}
for name in obdata.columns:
    encoders[name] = LabelEncoder()
    encoded_columns[name] = encoders[name].fit_transform(obdata[name])

# Reuse the original row index so the concat below aligns row for row.
obdata = pd.DataFrame(encoded_columns, index=obdata.index)
data = pd.concat([obdata, numdata], axis=1)

In [None]:
# Display the combined frame: the encoded text columns followed by the numeric columns.
data

**MODEL**

In [None]:
# 'Price' is the prediction target; every other column is a feature.
target_column = 'Price'
y = data[target_column]
x = data.drop(columns=[target_column])

In [None]:
# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable.
split_parts = train_test_split(x, y, test_size=0.25, random_state=5)
x_train, x_test, y_train, y_test = split_parts

In [None]:
# Display names of the evaluated estimators, in the order they are fitted.
# The tree and forest entries are regressors (DecisionTreeRegressor /
# RandomForestRegressor); they were previously mislabelled as classifiers.
algorithm = [
    'LinearRegression',
    'DecisionTreeRegressor',
    'RandomForestRegressor',
    'GradientBoostingRegressor',
    'SVR',
]

# Per-model test metrics; one value is appended to each list per evaluated model.
R2 = []
RMSE = []

In [None]:
def models(model):
    """Fit ``model`` on the training split and record its test-set metrics.

    The model is fitted on ``x_train``/``y_train`` and used to predict
    ``x_test``. The resulting R2 is appended to the global ``R2`` list and the
    root mean squared error to the global ``RMSE`` list. Finally the value of
    ``model.score(x_test, y_test)`` is printed.
    """
    model.fit(x_train, y_train)
    predictions = model.predict(x_test)

    # Compute both metrics before recording either of them.
    test_r2 = r2_score(y_test, predictions)
    test_rmse = np.sqrt(mean_squared_error(y_test, predictions))
    R2.append(test_r2)
    RMSE.append(test_rmse)

    print(f'The Score of Model is :{model.score(x_test, y_test)}')

In [None]:
# Estimators to compare, in the same order as the `algorithm` labels.
# The tree-based models draw random numbers while fitting; giving them a fixed
# random_state (the same seed as the train/test split) makes the reported
# scores reproducible from run to run.
model1 = LinearRegression()
model2 = DecisionTreeRegressor(random_state=5)
model3 = RandomForestRegressor(random_state=5)
model4 = GradientBoostingRegressor(random_state=5)
model5 = SVR()

In [None]:
# Start from empty metric lists so that re-running this cell does not append a
# second set of scores (which would leave R2/RMSE longer than `algorithm`).
R2.clear()
RMSE.clear()

# Evaluate the estimators in the same order as the `algorithm` labels.
for estimator in (model1, model2, model3, model4, model5):
    models(estimator)

In [None]:
import pandas as pd

# Collect the per-model metrics into one table (one row per algorithm);
# 'algorithm', 'R2' and 'RMSE' must be lists of equal length.
# The frame is built from the metrics dict: the constructor was previously
# given `df` (the car data) itself, so the metrics were never tabulated.
# The names `data` and `df` are kept because the cells below read them.
data = {'Algorithm': algorithm, 'R2_score': R2, 'RMSE': RMSE}
df = pd.DataFrame(data)

df.head()


In [None]:
# Line chart of the R2 score per algorithm; `data` supplies the 'Algorithm'
# labels for the x axis and the 'R2_score' values for the y axis.
fig, ax = plt.subplots(figsize=(20, 8))

ax.plot(
    data['Algorithm'],
    data['R2_score'],
    label='R2_score',
    lw=5,
    color='peru',
    marker='v',
    markersize=15,
)

ax.legend(fontsize=15)
plt.show()


In [None]:
# Print the column labels of df as left by the preceding cells.
print(df.columns)


In [None]:
# Print the first rows of df as left by the preceding cells.
print(df.head())


THANKS