In [28]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Load the car-price dataset from the Kaggle input directory into a DataFrame.
df = pd.read_csv('/kaggle/input/car-price-prediction-challenge/car_price_prediction.csv')

In [3]:
# Preview the first rows of the raw data.
df.head()

In [4]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df.describe()

In [5]:
def basic_stats(col, data=None):
    """Return summary statistics for one column.

    Parameters
    ----------
    col : str
        Name of the column to summarise.
    data : pandas.DataFrame, optional
        Frame to read the column from. Defaults to the notebook-level ``df``,
        so existing ``basic_stats(col)`` calls keep working; passing a frame
        explicitly lets the helper be reused on filtered copies without
        depending on whatever ``df`` currently holds.

    Returns
    -------
    tuple
        ``(mean, std, median, max, min)`` of the column, in that order.
    """
    frame = df if data is None else data
    column = frame[col]
    return column.mean(), column.std(), column.median(), column.max(), column.min()

# Summary statistics of the target column, in the order (mean, std, median, max, min).
print(basic_stats('Price'))

In [6]:
# Inspect the listings priced at 200,000 or more to spot extreme values.
df[df['Price']>=200000]

In [7]:
# Drop the listing with ID 45812886 (presumably the extreme-price row surfaced by the
# high-price inspection -- confirm against that cell's output). `.copy()` makes `df` an
# independent frame, so later column assignments (e.g. rewriting 'Doors') do not trigger
# pandas' SettingWithCopyWarning.
df = df[df['ID'] != 45812886].copy()

In [8]:
# Split the column names by numeric dtype, then print each column's summary
# statistics (integer columns first, float columns second).
is_int = (df.dtypes == 'int64').to_numpy()
is_float = (df.dtypes == 'float64').to_numpy()
int_cols = df.columns[is_int]
flot_cols = df.columns[is_float]

for col in [*int_cols, *flot_cols]:
    print(col, basic_stats(col))
    

In [9]:
# Bivariate histogram: listing counts binned by manufacturer (x) and price (y).
sns.histplot(data=df,x='Manufacturer',y="Price")

In [10]:
# Bar chart of Price per Manufacturer; seaborn's barplot aggregates with the mean by default.
sns.barplot(data=df,x='Manufacturer',y="Price")

In [11]:
# Average listing price per manufacturer (a Series indexed by manufacturer name).
df_price = df['Price'].groupby(df['Manufacturer']).mean()

In [12]:
# Manufacturer names that appear in the per-manufacturer mean.
df_price.index

In [13]:
# Inspect the LAMBORGHINI listings, which a later cell excludes from the chart.
df[df['Manufacturer']=="LAMBORGHINI"]

In [14]:
# Interactive bar chart of the mean price per manufacturer (all manufacturers).
px.bar(x=df_price.index,y=df_price.values)

In [15]:
# Same data without the LAMBORGHINI listings, used to redraw the chart without that manufacturer.
df_n = df[df['Manufacturer']!="LAMBORGHINI"]

In [16]:
# Average listing price per manufacturer, computed on the frame without LAMBORGHINI.
df_price_n = df_n['Price'].groupby(df_n['Manufacturer']).mean()

In [17]:
# Interactive bar chart of the mean price per manufacturer, LAMBORGHINI excluded.
px.bar(x=df_price_n.index,y=df_price_n.values)

In [18]:
# Manufacturers with at most 10 listings, and how many such manufacturers there are.
# The listing counts are computed once and reused for both the mask and the lookup.
manufacturer_counts = df['Manufacturer'].value_counts()
rare_manufacturers = manufacturer_counts[manufacturer_counts <= 10].index
rare_manufacturers, len(rare_manufacturers)

As we can see, there are 24 manufacturers with 10 or fewer listings, and they skew our charts.

In [19]:
# Names of the manufacturers that have at most 10 listings (counts computed once).
listing_counts = df['Manufacturer'].value_counts()
less_than_10 = list(listing_counts[listing_counts <= 10].index)

In [20]:
# Display the rare-manufacturer names (less_than_10 is already a list, so list() only copies it).
list(less_than_10)

In [21]:
# Keep only the listings whose manufacturer is not in the rare (<= 10 listings) group.
df_more_than_10 = df[~df['Manufacturer'].isin(less_than_10)]

In [22]:
# Average listing price per manufacturer, restricted to manufacturers with more than 10 listings.
df_price_more_than_10 = df_more_than_10['Price'].groupby(df_more_than_10['Manufacturer']).mean()

In [23]:
# Labelled bar chart of the mean price for manufacturers with more than 10 listings.
mean_price_fig = px.bar(
    x=df_price_more_than_10.index,
    y=df_price_more_than_10.values,
    title="Mean car manufacture price",
    template="simple_white",
    labels=dict(x="Manufacturer", y="mean price"),
)
mean_price_fig


Compared to the previous charts, there are no longer any large outliers distorting the overall view of the chart.

## First Inspection

As we can see from the descriptions and overviews above, the dataset consists of 18 columns:
- 13 are Strings
- 5 are Numeric (4 Int, 1 float)

In [24]:
# List the column names of the dataset.
df.columns

In [25]:
# Number of listings per fuel type.
df['Fuel type'].value_counts()

In [26]:
def value_count(col, data=None):
    """Return the frequency of each distinct value in one column.

    Parameters
    ----------
    col : str
        Name of the column to count.
    data : pandas.DataFrame, optional
        Frame to read the column from. Defaults to the notebook-level ``df``,
        so existing ``value_count(col)`` calls keep working; passing a frame
        explicitly lets the helper be reused on filtered copies.

    Returns
    -------
    pandas.Series
        Result of ``value_counts()`` on the column (most frequent value first).
    """
    frame = df if data is None else data
    return frame[col].value_counts()


# Print the value frequencies of each categorical column of interest.
categorical_cols = [
    'Gear box type', 'Drive wheels', 'Doors', 'Wheel',
    'Color', 'Category', 'Leather interior', 'Fuel type',
]
for col_name in categorical_cols:
    print(value_count(col_name))

The door values are wrong: they appear as the date-like strings "04-May" and "02-Mar", and need to be changed to 4.5 and 2.5.

In [27]:
def change_door(val):
    """Map the date-like door labels to numbers; return any other value unchanged.

    "04-May" becomes 4.5 and "02-Mar" becomes 2.5.
    """
    if val == "04-May":
        return 4.5
    if val == "02-Mar":
        return 2.5
    return val
# Rewrite the column element-wise, then show the resulting value frequencies.
df['Doors'] = df['Doors'].apply(change_door)
df['Doors'].value_counts()

In [29]:
# Column names again, after the Doors values were rewritten.
df.columns

#### Starting Feature Engineering & Modeling

In [31]:
# Sample 80% of the rows for training (fixed seed); every row not sampled becomes test data.
train_data = df.sample(frac=0.8, random_state=25)
in_train = df.index.isin(train_data.index)
test_data = df[~in_train]

In [32]:
# Rows without a target value cannot be used for training.
train_data = train_data.dropna(axis=0, subset=['Price'])

target = train_data['Price']

# Any column with a missing value in the training split is removed from both splits.
has_missing = train_data.isnull().any().to_numpy()
cols_with_missing = train_data.columns[has_missing].tolist()
candidate_train_predictors = train_data.drop(columns=['ID', 'Price'] + cols_with_missing)
candidate_test_predictors = test_data.drop(columns=['ID'] + cols_with_missing)

# "Cardinality" is the number of unique values in a column. It is the only criterion
# used to pick categorical columns here: object columns with fewer than 10 distinct
# values are kept. Convenient, though a little arbitrary.
low_cardinality_cols = []
numeric_cols = []
for cname in candidate_train_predictors.columns:
    column = candidate_train_predictors[cname]
    if column.dtype == "object" and column.nunique() < 10:
        low_cardinality_cols.append(cname)
    if column.dtype in ['int64', 'float64']:
        numeric_cols.append(cname)

# Final predictor set: low-cardinality categoricals followed by the numeric columns.
my_cols = low_cardinality_cols + numeric_cols
train_predictors = candidate_train_predictors[my_cols]
test_predictors = candidate_test_predictors[my_cols]

In [33]:
# One-hot encode the categorical predictors; pandas leaves the numeric columns as they are.
one_hot_encoded_training_predictors = pd.get_dummies(train_predictors)

In [34]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor

def get_mae(X, y, n_estimators=50, random_state=0):
    """Return the cross-validated mean absolute error of a random forest on (X, y).

    Parameters
    ----------
    X : array-like or DataFrame
        Predictor matrix passed to ``cross_val_score``.
    y : array-like or Series
        Target values.
    n_estimators : int, default 50
        Number of trees in the forest.
    random_state : int or None, default 0
        Seed for the forest. A fixed seed makes repeated calls comparable, so a
        difference between two feature sets is not just run-to-run noise. Pass
        ``None`` for an unseeded forest.

    Returns
    -------
    float
        Mean MAE over the folds of ``cross_val_score`` (default fold setup),
        reported as a positive number.
    """
    forest = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    # scikit-learn scorers follow "higher is better", so MAE comes back negated.
    neg_mae_per_fold = cross_val_score(forest, X, y,
                                       scoring='neg_mean_absolute_error')
    return -1 * neg_mae_per_fold.mean()

# Compare a numeric-only predictor set against the one-hot encoded predictor set.
predictors_without_categoricals = train_predictors.select_dtypes(exclude=['object'])

mae_without_categoricals = get_mae(predictors_without_categoricals, target)

mae_one_hot_encoded = get_mae(one_hot_encoded_training_predictors, target)

# Both labels now read "Mean Absolute Error" (the second one was misspelled).
print('Mean Absolute Error when Dropping Categoricals: ' + str(int(mae_without_categoricals)))
print('Mean Absolute Error with One-Hot Encoding: ' + str(int(mae_one_hot_encoded)))


In [35]:

# Rank the one-hot encoded features by their univariate relationship with Price.
# Price is a continuous target, so a regression score (f_regression) is used; chi2 is a
# classification test and would treat every distinct price as its own class label.
from sklearn.feature_selection import f_regression

bestfeatures = SelectKBest(score_func=f_regression, k=10)
fit = bestfeatures.fit(one_hot_encoded_training_predictors, target)
# One row per feature: its name ('Specs') and its score ('Score').
featureScores = pd.DataFrame({
    'Specs': one_hot_encoded_training_predictors.columns,
    'Score': fit.scores_,
})
print(featureScores.nlargest(10,'Score'))

In [38]:
from sklearn.ensemble import ExtraTreesRegressor

# Price is a continuous target, so the regressor variant is used; a classifier would
# treat every distinct price as a separate class. The seed keeps the ranking
# reproducible across runs.
model = ExtraTreesRegressor(random_state=0)
model.fit(one_hot_encoded_training_predictors, target)
print(model.feature_importances_)  # importances exposed by the fitted tree ensemble
# Plot the ten largest importances for easier comparison.
feat_importances = pd.Series(model.feature_importances_, index=one_hot_encoded_training_predictors.columns)
feat_importances.nlargest(10).plot(kind='barh')
plt.show()

In [45]:
# Rebuild the training frame with the target ('Price') kept next to the predictors, so
# both are one-hot encoded together and separated afterwards.
# Fresh names are used instead of overwriting `candidate_train_predictors` and `my_cols`:
# those hold the target-free predictor set, and clobbering them would put 'Price' into
# the predictors if the earlier cells were re-run.
train_with_target = train_data.drop(['ID'] + cols_with_missing, axis=1)
cols_with_target = low_cardinality_cols + numeric_cols + ['Price']
train_predictors_n = train_with_target[cols_with_target]
one_hot_encoded_pred = pd.get_dummies(train_predictors_n)

In [47]:
# Separate the target from the encoded features. Only 'Price' has to be dropped: the
# columns in `cols_with_missing` were already removed before encoding, so dropping them
# again would raise KeyError whenever that list is non-empty.
y = one_hot_encoded_pred['Price']
X = one_hot_encoded_pred.drop(columns=['Price'])

In [49]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a validation set (fixed seed so the split is repeatable).
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [50]:
from sklearn.ensemble import RandomForestRegressor

# Fit a 100-tree random forest (all cores, fixed seed) with out-of-bag scoring enabled.
forest_params = dict(
    n_estimators=100,
    n_jobs=-1,
    oob_score=True,
    bootstrap=True,
    random_state=42,
)
rf = RandomForestRegressor(**forest_params)
rf.fit(X_train, y_train)

# Report R^2 on the training data, the out-of-bag estimate, and R^2 on the validation split.
train_r2 = rf.score(X_train, y_train)
oob_r2 = rf.oob_score_
valid_r2 = rf.score(X_valid, y_valid)
report = 'R^2 Training Score: {:.2f} \nOOB Score: {:.2f} \nR^2 Validation Score: {:.2f}'
print(report.format(train_r2, oob_r2, valid_r2))


In [52]:
# rfpimp is a third-party package; uncomment the line below to install it if it is missing.
#!pip install rfpimp

In [53]:
from sklearn.metrics import r2_score
from rfpimp import permutation_importances

def r2(rf, X_train, y_train):
    """Return the R^2 of the model's predictions on X_train against y_train.

    Used as the metric callback handed to permutation_importances.
    """
    predictions = rf.predict(X_train)
    return r2_score(y_train, predictions)

# Permutation importances of the fitted forest on the training data, scored with the r2 callback.
perm_imp_rfpimp = permutation_importances(rf, X_train, y_train, r2)

In [56]:
# Display the permutation-importance table.
perm_imp_rfpimp

In [58]:
# Raw array behind the importance table (one row per table entry).
perm_imp_rfpimp.values

In [67]:
# Flatten the importance table into a plain list holding the first column of every row.
test = [row[0] for row in perm_imp_rfpimp.to_numpy()]
test

In [71]:
# Horizontal bar chart of the permutation importances, one bar per row of the importance table.
sns.set(rc={'figure.figsize':(12,9)})
# x and y are passed by keyword: current seaborn releases reject them as positional
# arguments (the first positional slot is `data`).
sns.barplot(x=test, y=perm_imp_rfpimp.index)

##### Credit

Sources used for creating this notebook

https://machinelearningmastery.com/feature-selection-with-categorical-data/
https://www.kaggle.com/questions-and-answers/55494
https://towardsdatascience.com/explaining-feature-importance-by-example-of-a-random-forest-d9166011959e