In [None]:
# %pip install scikit-learn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge, Lasso
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

%matplotlib inline

In [None]:
df = pd.read_csv('unclean_data.csv')

In [None]:
df.head()

In [None]:
df = df.dropna(how="all")

In [None]:
df=df.reset_index(drop=True)

In [None]:
df

In [None]:
df['price']=df['price'].str.replace(',','').str.replace('£','')
df['price']=pd.to_numeric(df['price'],errors='coerce')

In [None]:
df.head()

In [None]:
df.rename(columns={'fuel type':'fuel_type', 'fuel type2':'fuel_type2',
                   'engine size':'engine_size', 'engine size2':'engine_size2'}, inplace=True)
df.head()

In [None]:
df['fuel_type2']=df.fuel_type2.fillna(df.fuel_type)
df.head()

In [None]:
df=df.drop(columns=['fuel_type'])
df.head()

In [None]:
df.rename(columns={'fuel_type2':'fuel_type'},inplace=True)
df.head()

In [None]:
df['mileage']=df.mileage.fillna(df.mileage2)
df.head()

In [None]:
df['mileage']=df['mileage'].astype(str)

In [None]:
df['mileage']=df['mileage'].str.replace(',','')
df.head()

In [None]:
df['mileage']=pd.to_numeric(df['mileage'],errors='coerce')
df.head()

In [None]:
df=df.drop(columns=['mileage2'])
df

In [None]:
df['engine_size2']=df.engine_size2.fillna(df.engine_size)
df.head()

In [None]:
df['engine_size2']=pd.to_numeric(df['engine_size2'],errors='coerce')
df.head()

In [None]:
df['engine_size2']=df['engine_size2'].apply(lambda x:round(x/1000,1) if x>1000 else round(x,1))
df.head()

In [None]:
df=df.drop(columns=['engine_size'])
df.rename(columns={'engine_size2':'engine_size'}, inplace=True)
df

In [None]:
df.describe()

In [None]:
# df[df['engine_size'].isna()]

In [None]:
df=df.dropna(how='any')
df=df.reset_index(drop=True)
df[df['engine_size'].isna()]

In [None]:
df['year']=df['year'].astype('int64')
df.head()

In [None]:
df=df.drop(columns=['model','reference'])
df.head()

In [None]:
dumTransmission=pd.get_dummies(df['transmission'])
dumTransmission

In [None]:
df=df.join(dumTransmission)

In [None]:
df

In [None]:
dumFueltype=pd.get_dummies(df['fuel_type'])
dumFueltype

In [None]:
dumFueltype.rename(columns={'Other':'OtherFuelType'}, inplace=True)

In [None]:
df=df.join(dumFueltype)

In [None]:
df

In [None]:
df.drop(columns=['transmission','fuel_type'],inplace=True)
df.head()

In [None]:
df.to_csv('my_cleaned_data.csv')

# Start EDA Below

# Starting feature observation and analysis

In [None]:
df.describe()

In [None]:
# df.corr()

In [None]:
mileage_mean = df['mileage'].mean()
mileage_stdev = df['mileage'].std()
# By empirical rule, anything with zscore more 3 or less than -3, is an outlier
upper_bound = mileage_mean + 3*mileage_stdev
lower_bound = mileage_mean - 3*mileage_stdev

In [None]:
# df.drop(df.loc[df["price"]>(df['price'].mean()+3*df['price'].std())].index, inplace=True)
# df.drop(df.loc[df["price"]<(df['price'].mean()-3*df['price'].std())].index, inplace=True)

# Drop mileage outliers using the 3-sigma bounds computed in the previous cell
# (`upper_bound` / `lower_bound`). Reusing the stored bounds keeps both tests
# based on the same mean/std and makes the cell safe to re-run: recomputing the
# statistics after rows were removed would tighten the bounds each time.
is_mileage_outlier = (df["mileage"] > upper_bound) | (df["mileage"] < lower_bound)
df.drop(df.index[is_mileage_outlier], inplace=True)

# Keep only cars priced between 5,000 and 50,000 (both limits inclusive).
df.drop(df.loc[df["price"] < 5000].index, inplace=True)
df.drop(df.loc[df["price"] > 50000].index, inplace=True)

In [None]:
# sns.pairplot(df, height=2.5)
# plt.tight_layout()

In [None]:
# Correlation heatmap of every column in the cleaned frame.
# seaborn (`sns`) and the inline backend are already set up in the first cell,
# so the duplicate import / magic that used to sit here is not repeated.

# Use DataFrame.corr() rather than np.corrcoef(df.values.T): when the indicator
# columns are boolean, df.values is an object array and np.corrcoef fails,
# whereas corr() converts the columns to float itself. The result is the same
# Pearson correlation matrix.
corr = df.corr()
cm = corr.to_numpy()

# A single sns.set() call: every call resets the options it is not given, so a
# separate sns.set(rc=...) after sns.set(font_scale=1.5) would silently put
# font_scale back to its default.
sns.set(font_scale=1.5, rc={'figure.figsize': (25, 12)})

hm = sns.heatmap(cm,
                 cbar=True,
                 annot=True,
                 square=True,
                 fmt='.2f',
                 annot_kws={"size": 15},
                 # take the labels from the matrix itself so they always line up
                 yticklabels=corr.columns,
                 xticklabels=corr.columns)

## Split the dataset into training and test sets
We will split the data 75:25 (train:test), using `random_state=42` to make the result repeatable.


In [None]:
response = df['price']
features = df[['year', 'mileage', 'engine_size', 'Manual', 'Semi-Auto','Diesel','Petrol']]
# features = df.drop('price', axis=1)

In [None]:

X_train, X_test, Y_train, Y_test = train_test_split(features, 
                                                    response, 
                                                    test_size=0.25,  
                                                    random_state=42)

print("Training and testing split was successful")

# Training and Testing

In [None]:

# build the model
# Step 1 import the libraries

# Step 2 call the linear regression model
model = LinearRegression()
# Step 3 create the model
model.fit(X_train, Y_train)
print(model)

In [None]:
# get the predictions for the test dataset
predictions = model.predict(X_test)

## Evaluate the model performance

In [None]:
# R-squared
print("R^2 in training dataset: ", round(model.score(X_train, Y_train),3))
print("R^2 in testing dataset: ", round(model.score(X_test, Y_test),3))

In [None]:
# RMSE

# Now finding the mean squared error
mse = mean_squared_error(Y_test, predictions)

print('RMSE in Test set: ',mse**0.5)

In [None]:
# plot the prediction vs actual
actual_values = Y_test
plt.scatter(predictions, actual_values, alpha=0.9, color='b')
plt.xlabel("predicted price")
plt.ylabel("Actual price")
plt.title("Linear regression model (Predicted vs Actual)")
plt.show()

In [None]:
# the coefficient list
feat = list(X_train.columns)
coef = model.coef_.transpose()

coef_table = np.vstack((feat, coef)).T
new_df = pd.DataFrame(coef_table, columns=["Features", "Coeffficients"])
print(new_df)