In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import seaborn as sns
import  matplotlib.pyplot  as plt

import warnings

%matplotlib inline 
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
#
pd.options.mode.use_inf_as_na = True
warnings.filterwarnings('ignore')
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# <font size='+3' color=blue> <b>Problem Statement </b></font> 

This project works with a used-car price dataset:
- 4009 sample points with 11 features and 1 target column (`price`); 3 of the features are numerical.

- There are some missing values in `fuel_type`, `accident` and `clean_title`.
- The values in `price` and `milage` need to be formatted into numerical values.
- The object-type features, e.g. `[brand, model, engine, transmission, ext_col, int_col]`, have many distinct values and need to be cleaned before modelling.

# Load/Check Dataset

In [None]:
df_test=pd.read_csv('/kaggle/input/kagglextest/test.csv')
print('The Full dataset is {}. with {} samples and {} columns'.format(df_test.shape,df_test.shape[0],df_test.shape[1]))
print('===')
print(df_test.info())
id_test=df_test['id']

In [None]:
df_train=pd.read_csv('/kaggle/input/kagglex-train/train.csv')
print('The Full dataset is {}. with {} samples and {} columns'.format(df_train.shape,df_train.shape[0],df_train.shape[1]))
print('===')
print(df_train.info())
print('===')
print(df_train.describe())

In [None]:
y_train=df_train['price']
df_train.drop(['price'],axis=1,inplace=True)

df=pd.concat([df_train,df_test],ignore_index=True,axis=0)
print(f"The train set is from 0 to {df_train.shape[0]-1}\n\
the test set is from {df_train.shape[0]} to end")
print(f" The df shape is {df.shape}")
df.drop(['id'],axis=1,inplace=True)

In [None]:
df_org=pd.read_csv('/kaggle/input/used-car-price-prediction-dataset/used_cars.csv')
print('The Full dataset is {}. with {} samples and {} columns'.format(df_org.shape,df_org.shape[0],df_org.shape[1]))
print('===')
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly (printing the un-called method only shows its repr).
df_org.info()

In [None]:
df.head()

In [None]:
# check duplication
df.duplicated().sum()

In [None]:
# check missing values
def check_miss_value(df):
    '''
    Report which columns of the dataset contain missing values.

    Prints the per-column count of nulls (columns without nulls are omitted,
    the rest sorted ascending) and draws the counts as a horizontal bar chart.
    When no column has missing values, prints "No value missing" instead.

    args:
        df pd dataframe
    return:
        None
    '''
    miss=df.isnull().sum()
    miss=miss[miss>0].sort_values()

    # An empty Series cannot be plotted. Handle that case explicitly rather
    # than with a bare `except`, which would also hide genuine plotting errors.
    if miss.empty:
        print("No value missing")
        return

    print(miss)
    miss.plot.barh()
        

In [None]:
# check missing values
check_miss_value(df)

In [None]:
# check the number of unique values for each feature
df.nunique()

In [None]:
# drop clean_title
df.drop(['clean_title'],axis=1,inplace=True)

In [None]:
# check numerical variables
df.select_dtypes(include=[np.number]).describe()

In [None]:
# check categorical variables
df.select_dtypes(exclude=[np.number]).describe()

In [None]:
# check it there are some patten in the value
# of each categrical value
for col in df.select_dtypes('O').columns:
    print()
    print(col)
    print(df[col].nunique())
    if df[col].nunique() <50:
        print(df[col].unique())
    else:
        print(df[col].unique()[:50])

In [None]:
features_cols=['fuel_type','accident']
for i in np.arange( len(features_cols)):
    print(df[features_cols[i]].value_counts())
    fig =plt.figure()
    df[features_cols[i]].value_counts().plot.barh()

<b>[comments]</b>

- Obviously, the dtype of `milage` should be int and the dtype of `price` should be float.

- The object-type features, e.g. `[brand, model, engine, transmission, ext_col, int_col]`, have a wide range of values; it is better to focus only on the top 5 levels and group the others.
- For the numerical features, `milage_format` and `milage` are continuous while `model_year` is discrete.

# Preprocessing
 
- drop unimportant features `['model','ext_col','int_col','clean_title']`
- reduce/format cardinality for categorical variables, i.e. `['fuel_type','transmission','engine','accident']`

In [None]:
# format dtypes for org_dataset
# df['price']=df['price'].str.replace('$', '').str.replace(',', '').astype(float)
# df['milage']=df['milage'].str.replace(',','').str.replace(' mi.', '').astype(int)

In [None]:
# accident
df['accident']=df['accident'].apply(lambda x:0 if 'None' in str(x) else 1)

In [None]:
# clean/formate
df['fuel_type']=df['fuel_type'].apply(lambda x:
                                      np.nan if str(x).strip()=='–' else 
                                      'electric' if str(x).strip()=='not supported' else x)

features_cols=['fuel_type']
for i in np.arange( len(features_cols)):
    print(i)
    fig =plt.figure()
    df[features_cols[i]].value_counts(normalize=True).plot.barh()

In [None]:
#trannsmission
df['transmission'].value_counts().head(20)

In [None]:
# trannsmission: A/T or not 
df['transmission']=df['transmission'].str.contains('A/T|Automatic',case=False).astype(int)

In [None]:
# process engine
df['engine'].value_counts().head(20)

In [None]:
# process engine: pull horsepower and displacement out of the free-text engine
# description, e.g. "375.0HP 3.5L V6 Cylinder Engine" or "3.5 Liter Turbo".
# A regex is used instead of chained str.split because splitting on 'L' first
# also cuts the word "Liter", which left the "<n> Liter" form as NaN.
# The .str accessor propagates missing engine values as NaN instead of raising.
hp=pd.to_numeric(
    df['engine'].str.extract(r'(\d+(?:\.\d+)?)\s*HP',expand=False),
    errors='coerce')
liters=pd.to_numeric(
    df['engine'].str.extract(r'(\d+(?:\.\d+)?)\s*(?:L\b|Liter)',expand=False),
    errors='coerce')

df['hp']=hp
df['engineVolume_L']=liters


In [None]:
 df=df.drop(['model','ext_col','int_col','engine'],axis=1,errors='ignore')

In [None]:
df.info()

<b>[comments]</b>
Now, we can see, we have 11 feature column  + 1 target column. 
Next I will check the Distribution and the relationship between feature and target

In [None]:
df.nunique()

In [None]:
# check missing values
check_miss_value(df)

In [None]:
# for numerical variables
df.describe()  

In [None]:
# for numerical variables
df.hist(figsize=(16,10),bins=50)

In [None]:
# # for cat variables
df.select_dtypes(exclude=[np.number]).describe() 

In [None]:
df['fuel_type'].value_counts().plot.barh()

In [None]:
num_features=df.select_dtypes(include=[np.number]).columns.tolist()
cat_features=df.select_dtypes(exclude=[np.number]).columns.tolist()

#  Exploratory Data distribution

In [None]:
df_train2=pd.concat([ df[:df_train.shape[0]],
                          pd.DataFrame({'price':y_train})],
                        axis=1)
print(df_train2.head())


In [None]:
X_test=df[df_train.shape[0]:df_train.shape[0]+df_test.shape[0]].copy()

In [None]:

df=df_train2.copy()
g=df.groupby('brand').agg({'price':['mean','count']}).sort_values(('price','mean'),ascending=False)
print(g.head())
print(g.shape)

fig,ax=plt.subplots(figsize=(15,6))
ax2=ax.twinx()

g.plot.bar(ax=ax,y=('price','mean'),label='Average Price',color='orange')
g.plot.line(ax=ax2,y=('price','count'),color='blue',ls='--',lw=1,marker='.',label='Number of Instances')
#ax2.axhline(2,ls='--',color='black')
ax.legend(loc='upper left')
ax2.legend(loc='upper right')
ax2.set_yscale('log')
ax.set_title('Average Price And Count Of Instances By Brand',fontweight='bold')
ax.set_ylabel('Average Price')
ax2.set_ylabel('Number of Instances')
plt.show()

In [None]:
def boxDist(df,target,object_names):
    '''
    Show how the target is distributed within each level of every
    categorical feature, one boxplot panel per feature.

    Only readable when each feature has a limited number of levels.

    args:
        df: a pd dataset including the feature columns and the target column
        target: (str) name of the target column
        object_names: a list of categorical column names
    return:
        None (prints the head of the melted frame and shows the figure)
    '''
    def _draw_panel(x,y, **kws):
        '''
        FacetGrid callback: boxplot of y per level of x, with the
        tick labels rotated so long level names stay legible.
        '''
        sns.boxplot(x=x,y=y)
        plt.xticks(rotation=90)

    # long format: one row per (target value, feature name, feature level)
    long_frame=df.melt(id_vars=[target],value_vars=object_names)
    print(long_frame.head())

    grid=sns.FacetGrid(long_frame,col='variable',col_wrap=2,sharex=False,sharey=False)
    grid.map(_draw_panel,'value',target)
    plt.show()
    
def countPlot(df,cat_names):
    '''
    For categorical features, draw one count plot per feature, stacked
    vertically, with the count written on top of every bar.

    args:
        df: pd dataFrame
        cat_names: (list) of categorical feature names
    return:
        None (shows the figure)
    '''
    n_plots=len(cat_names)
    if n_plots==0:
        # nothing to draw; plt.subplots(0) would raise
        return

    # squeeze=False keeps `axes` a 2-D array even for a single feature,
    # so ravel() and integer indexing below always work.
    fig, axes = plt.subplots(n_plots,
                             figsize=(12, 3*n_plots),
                             squeeze=False)
    axes = axes.ravel()

    for i, column in enumerate(cat_names):
        sns.countplot(x=column,
                      data=df,
                      palette='bright',
                      ax=axes[i],
                      saturation=0.95)
        for container in axes[i].containers:
            axes[i].bar_label(container, color='black', size=10)
        axes[i].set_title(f'Count Plot of {column.capitalize()}')
        axes[i].set_xlabel(column.capitalize())
        axes[i].set_ylabel('Count')

    # Adjust layout and show plots
    plt.tight_layout()
    plt.show()
    
def perctDist(df,cat_names):
    '''
    For categorical features, plot the share of each level.

    One figure is created per feature: a line plot of the percentage of
    every level (left) next to a pie chart of the same percentages (right).

    inputs:
        df: pd dataset
        cat_names: a list of column names
    '''
    for name in cat_names:
        fig,(line_ax,pie_ax)=plt.subplots(nrows=1,ncols=2,
                                          gridspec_kw={'width_ratios': [2, 1]},
                                          figsize=(20,10))
        print(name)

        # percentage of the non-missing rows that fall into each level
        level_counts=df[name].value_counts()
        share=100*level_counts/level_counts.sum()

        line_ax.plot(share,marker='*')
        line_ax.set_xlabel(name)
        line_ax.set_ylabel('percentage dist.')
        line_ax.set_xticks(range(len(share.index)))
        line_ax.set_xticklabels(labels=share.index,rotation=90)

        pie_ax.pie(share)

    plt.show()
    

def fliersOutBox(series):
    '''
    Locate the boxplot outliers ("fliers") of a numerical series.

    A value is a flier when it lies more than 1.5 * IQR below the first
    quartile or above the third quartile. The quartiles and both whisker
    limits are printed.

    args:
        series: pd Series of numerical values
    returns:
        list with the index labels of the fliers
    '''
    quartiles=np.quantile(series,q=[0.25, 0.5, 0.75],axis=0)
    q1,q2,q3=quartiles

    # whisker limits of a standard box plot
    spread=q3-q1
    lower=q1-1.5*spread
    upper=q3+1.5*spread

    print(f"quantile vlaues for 0.25,0.5,0.75 is {q1,q2,q3}")
    print(f"whisker_low is {lower}")
    print(f"whisker_up is {upper}")

    outside=(series<lower)|(series>upper)
    return series[outside].index.tolist()


In [None]:
def scatterPlot(df,num_features):
    '''
    For numerical features, draw one scatter plot of each feature against
    the target value.

    args:
        df: pd dataframe
        num_features: a list of numerical feature names with the target
                      name as the LAST element
    return:
        None
    '''
    n_plots=len(num_features)-1
    if n_plots<1:
        # only the target (or nothing) was given: there is nothing to plot
        return

    target=num_features[-1]
    # squeeze=False keeps `ax` an array even when a single feature is
    # plotted; otherwise plt.subplots returns a bare Axes and ax[i] fails.
    fig,ax=plt.subplots(n_plots,figsize=(6,5*n_plots),squeeze=False)
    ax=ax.ravel()
    for i,feature in enumerate(num_features[:-1]):
        ax[i].scatter(df[feature], df[target])
        ax[i].set_ylabel(target)
        ax[i].set_xlabel(feature)
def pairScatter(df,num_features):
    '''
    For numerical features, draw the seaborn pair plot: a scatter plot for
    every pair of the given columns (the target included when it is listed).

    args:
        df: pd dataframe
        num_features: a list of num_feature names
    '''
    selected=df[num_features]
    sns.pairplot(selected,height=2.5)

    

def boxPlot(df,num_features):
    '''
    For numerical features, draw one boxplot of the target per value of
    each feature. Works well only when a feature has a limited number of
    distinct values.

    args:
        df: pd dataframe
        num_features: a list of numerical feature names with the target
                      name as the LAST element
    return:
        None
    '''
    n_plots=len(num_features)-1
    if n_plots<1:
        return

    target=num_features[-1]
    # squeeze=False: always get an array of axes, even for one feature
    fig,ax=plt.subplots(n_plots,
                        figsize=(6,5*n_plots),
                        squeeze=False)
    for axis,feature in zip(ax.ravel(),num_features[:-1]):
        # each boxplot must be sent to its own axes; without `ax=` seaborn
        # draws every plot on the current (last created) axes
        sns.boxplot(x=df[feature],y=df[target],ax=axis)
        axis.set_xlabel(feature)
        axis.set_ylabel(target)
        
def histPlot(df,num_features):
    '''
    For numerical features, draw one histogram with a KDE curve per column.

    args:
        df: pd dataframe
        num_features: a list of column names to plot (every listed column
                      is plotted, the target included when it is listed)
    return:
        None
    '''
    n_plots=len(num_features)
    if n_plots==0:
        return

    # one 5-inch-high row per plotted column; squeeze=False keeps `ax`
    # indexable when a single column is requested
    fig,ax=plt.subplots(n_plots,figsize=(6,5*n_plots),squeeze=False)
    ax=ax.ravel()
    for i,feature in enumerate(num_features):
        sns.histplot(df[feature],kde=True,ax=ax[i])
        ax[i].set_xlabel(feature)



In [None]:
# check distribution for numberical features  
num_features=['model_year',
 'milage',
 'transmission',
 'accident',
 'hp',
 'engineVolume_L', 'price']

print(df[num_features].describe())

scatterPlot(df,num_features)

pairScatter(df,num_features)
histPlot(df,num_features)

<b>[comments]</b>
- marginal values (outliers):

     model_year < 1980
     milage > 350000
     hp > 1000
- `hp` and `engineVolume_L` are positively correlated.


 <font size='+3' color=red> Drop  outliers (marginal values) </font>

In [None]:
y_price_fliers_inx=fliersOutBox(df['price'])
print(len(y_price_fliers_inx))

In [None]:
# drop outler the inx
inx=[693,3046,\
    40126,39640,8674,46001,42738,15749,15822,\
    34757,19882,25873]
# df=df[df.price<2e6]
df_drop=df.drop(inx,axis=0)
df_drop.reset_index(drop=True,inplace=True)
print(f"The train (df_drop) size now is {df_drop.shape}")
# df_drop.tail()
scatterPlot(df_drop,num_features)

pairScatter(df_drop,num_features)
histPlot(df_drop,num_features)
boxDist(df_drop,'price',cat_features)

In [None]:
# Get rid of rare category levels.
# Keep only the brands that have MORE than 2 rows with a non-null model_year
# (i.e. brands with 2 or fewer such rows are removed).
print(df_drop.shape)
df_drop=df_drop.groupby('brand').filter(lambda x:x['model_year'].count()>2) # drop rare brands
print(df_drop.shape)

# df.select_dtypes('O').describe()

# Same rule for fuel_type: keep only the levels with more than 2 rows.
# NOTE(review): groupby excludes NaN keys by default, so rows with a missing
# fuel_type are presumably dropped here as well - confirm this is intended.
print(df_drop.shape)
df_drop=df_drop.groupby('fuel_type').filter(lambda x:x['model_year'].count()>2) # drop rare fuel types
print(df_drop.shape)

In [None]:
df_drop.select_dtypes('O').describe()

In [None]:
boxDist(df_drop,'price',cat_features)

In [None]:
num_features

In [None]:
# Drop marginal values of the numerical features: keep only the rows whose
# values lie strictly inside mean +- 2 std (about 95 percent of the data for a
# roughly normal feature).
# The bounds are computed on df_drop itself and the filtered frame is assigned
# back; previously the filter result was discarded, so nothing was removed.
print(df_drop.shape)
keep_rows=pd.Series(True,index=df_drop.index)
for col in num_features:
    # binary flags (e.g. transmission, accident) have no meaningful 2-sigma
    # range; filtering them could wipe out the whole minority class
    if df_drop[col].nunique()<=2:
        continue
    print(col)
    num_mean=df_drop[col].mean()
    num_std=df_drop[col].std()
    low_bound=num_mean- 2*num_std
    up_bound=num_mean+2*num_std
    inside=(df_drop[col]>low_bound) & (df_drop[col]<up_bound)
    # missing values are kept here; they are imputed in the next section
    keep_rows&=inside | df_drop[col].isna()

df_drop=df_drop[keep_rows].copy()
print(df_drop.shape)
df_drop.describe()

In [None]:
num_features

In [None]:
# num_features=['model_year',
#  'milage',
#  'transmission',
#  'accident',
#  'hp',
#  'engineVolume_L', 'price']
# scatterPlot(df,num_features)

# pairScatter(df,num_features)
# histPlot(df,num_features)

# <font color=purple size=6> impute/drop the missing value </font>

In [None]:
from sklearn.impute import SimpleImputer

In [None]:
check_miss_value(X_test)

In [None]:
check_miss_value(df_drop)

In [None]:
df_drop[['engineVolume_L','hp']].dtypes

In [None]:
df_drop[['engineVolume_L','hp']].nunique()

In [None]:
imp_med=SimpleImputer(strategy='median')
imp_med.fit(df_drop[['engineVolume_L','hp']])
imp_med.get_params()

In [None]:
# imp_med=SimpleImputer(strategy='median')
# imp_med.fit(df_drop[['engineVolume_L','hp']])

df_drop[['engineVolume_L','hp']]=imp_med.transform(
    df_drop[['engineVolume_L','hp']])
X_test[['engineVolume_L','hp']]=imp_med.transform(
    X_test[['engineVolume_L','hp']])
check_miss_value(df_drop)
check_miss_value(X_test)

# imp_const=SimpleImputer(strategy='constant',fill_value='No')

# df_drop[['imp_clean_title']]=imp_const.fit_transform(df_drop[['clean_title']])

# imp_features=['imp_fuel_type','imp_accident','imp_clean_title']

In [None]:

# Learn the most frequent fuel type from the TRAINING data only and apply the
# same fill value to both sets, so no test-set statistic leaks into
# preprocessing and both sets are imputed consistently.
imp_freq=SimpleImputer(strategy='most_frequent')
imp_freq.fit(df_drop[['fuel_type']])
df_drop[['fuel_type']]=imp_freq.transform(df_drop[['fuel_type']])
X_test[['fuel_type']]=imp_freq.transform(X_test[['fuel_type']])
check_miss_value(df_drop)
check_miss_value(X_test)


## 1.5 reduce cardinality for categorical values

In [None]:
df_drop.select_dtypes(exclude=[np.number]).describe()

In [None]:
perctDist(df_drop,cat_features)

<b>[comments]</b>
 - `model` and `engine` will not be considered in the following because of the huge diversity of their values.

In [None]:
def groupCardinality(df,obj_names,tops):
    '''
    For a high-cardinality categorical feature, keep only the `tops` most
    frequent levels and group every other value (missing values included)
    under the label 'Others'.

    The result is written IN PLACE into a new column of `df` named
    'group_' + obj_names. The cumulative percentage covered by the kept
    levels and their names are printed.

    args:
        df dataframe (modified in place)
        obj_names: (str) the name of the feature
        tops: (int) the number of levels treated individually
    return:
        None
    '''
    level_counts=df[obj_names].value_counts()
    txt=100*level_counts/level_counts.sum()
    txt.sort_values(ascending=False,inplace=True)

    # computed once here rather than once per row inside the mapping below
    top_levels=txt[:tops].index.tolist()
    print('The accumutive percentage of the top {} level of {} is {:.2f}%'.format(tops,obj_names, txt[:tops].sum()))
    print('The top {} level are {}'.format( tops,top_levels))
    print()

    kept=set(top_levels)
    df['group_'+obj_names]=df[obj_names].map(lambda x: x if x in kept else 'Others' )

In [None]:
# groupCardinality(df_drop,'model',40)
# groupCardinality(df_drop,'engine',40)
# groupCardinality(df_drop,'ext_col',6)
# groupCardinality(df_drop,'brand',15)
# groupCardinality(df_drop,'int_col',7)
# groupCardinality(df_drop,'imp_fuel_type',1)
# groupCardinality(df_drop,'transmission',9)

# grouped_features= ['group_'+i for i in 
#                    ['model','engine',
#                     'ext_col','brand','int_col','imp_fuel_type','transmission']]

In [None]:
df_drop.info()

In [None]:
 
boxDist(df_drop, 'price',
         ['brand','fuel_type'])

## 1.6 Correlation for numerical features

In [None]:
def spearman(df,features):
    '''
    Plot the Spearman rank correlation of each feature with the target.

    The target is taken to be the LAST column of `df`.

    args:
        df: pd dataFrame, numerical feature columns + target Y column as the last
        features: (list) of feature names

    return: None; draws a horizontal bar plot with one bar per feature,
            where the x-axis is the correlation value (sorted ascending)
    '''
    target=df.iloc[:,-1]
    # Series.corr defaults to Pearson; the rank correlation must be requested
    spr=pd.DataFrame({
        'feature':list(features),
        'spearman':[df[f].corr(target,method='spearman') for f in features],
    })
    spr=spr.sort_values('spearman')

    plt.figure(  figsize=(6,0.25*len(features)))
    sns.barplot(data=spr,x='spearman',y='feature',orient='h')

def corrDist(df,yname,k=None):
    '''
    Plot a lower-triangle heatmap of the correlations between the target
    and the features most positively correlated with it. The correlation
    matrix is also printed.

    args:
        df: dataFrame including num_feature+Y_target value
        yname: the target column name
        k: (int) number of columns (target included) with the largest
           correlation with y to plot; defaults to all columns of `df`

    return:
        None
    '''
    # A default of `df.shape[1]` in the signature would be evaluated once, at
    # definition time, against whatever global `df` existed then - resolve it
    # from the frame actually passed in instead.
    if k is None:
        k=df.shape[1]

    fig,ax=plt.subplots()
    corr=df.corr()
    cols=corr.nlargest(k,yname)[yname].index
    # reuse the pairwise-complete correlations already computed above;
    # np.corrcoef on the raw values would turn NaN on any missing value
    cm=corr.loc[cols,cols].to_numpy()

    # hide the upper triangle (and the diagonal): the matrix is symmetric
    mask = np.zeros_like(cm, dtype = np.bool_)
    mask[np.triu_indices_from(mask)] = True
    sns.heatmap(cm,cbar=True,
                annot=True,
                mask=mask,
                fmt='.2f',
                annot_kws={'size':10},
                yticklabels=cols.values,
                xticklabels=cols.values,
                ax=ax)
    print(cm)

In [None]:
num_features=df_drop.select_dtypes( include=[np.number]).columns.tolist()
num_features

In [None]:
num_features=['model_year',
 'milage',
 'transmission',
 'accident',
 'hp',
 'engineVolume_L', 'price']
num_features2=['model_year',
 'milage',
 'transmission',
 'accident',
 'hp',
 'engineVolume_L']

spearman(df_drop[num_features],num_features2)

In [None]:
corrDist(df_drop[num_features]
         ,'price')

<b>[comment]</b>
    
new car with less milage has a great price

# <font color=purple size=5> Encode categorical variables </font>

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
df_drop.select_dtypes(exclude=[np.number]).columns.tolist()

In [None]:
df_drop.shape

In [None]:
df_drop.reset_index(drop=True,inplace=True)
X_test.reset_index(drop=True,inplace=True)

In [None]:
df_drop.index

In [None]:
X_test.shape

In [None]:
X_test.index

In [None]:
oh_enc=OneHotEncoder()
cat_names=['brand', 'fuel_type']
oh_fit=oh_enc.fit(df_drop[cat_names])

oh_features_in=oh_fit.feature_names_in_
oh_features=oh_fit.get_feature_names_out().tolist()

oh_return=oh_fit.transform(df_drop[cat_names]).toarray()
print(oh_return.shape)
print(f"the orignial {oh_fit.n_features_in_} \
has been to {len(oh_features)} features")

In [None]:
oh_features_df=pd.DataFrame(oh_return,columns=oh_features)
df_drop=pd.concat([df_drop,oh_features_df], 
          axis=1)
df_drop.drop(columns=['brand', 'fuel_type'],inplace=True)

In [None]:
df_drop.head()

In [None]:
check_miss_value(df_drop)

In [None]:
# for test: reuse the encoder that was fitted on the TRAINING data so the test
# matrix gets exactly the same one-hot columns (same names, same order) as
# X_train. Refitting on the test set produces a different column set.
cat_names=['brand', 'fuel_type']
# categories that never occurred in training (e.g. rare brands removed from
# df_drop) are encoded as all-zero rows instead of raising an error
oh_fit.set_params(handle_unknown='ignore')
oh_enc_test=oh_fit
oh_fit_test=oh_fit
oh_features_in_test=oh_fit_test.feature_names_in_
oh_features_test=oh_fit_test.get_feature_names_out().tolist()

oh_return_test=oh_fit_test.transform(X_test[cat_names]).toarray()
# oh_return.shape
print(f"the orignial {oh_fit_test.n_features_in_} \
has been to {len(oh_features_test)} features")

In [None]:
oh_features_df_test=pd.DataFrame(oh_return_test,columns=oh_features_test)
X_test=pd.concat([X_test,oh_features_df_test], 
          axis=1)

X_test.drop(columns=['brand', 'fuel_type'],inplace=True)

In [None]:
check_miss_value(X_test)

# <font color=purple size=5> Split datasets </font>

no spliting 

In [None]:
df_drop.head()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
y_train=df_drop['price']
X_train=df_drop.drop(['price'],axis=1)
print(f"The X_training size is {X_train.shape} and y_train is {y_train.shape}")


In [None]:
# X_train,X_test,y_train,y_test=train_test_split(X, y,
#                                                      test_size=0.2,
#                                                      random_state=33,
#                                                     shuffle=True)
# # X_train,X_dev, y_train,y_dev=train_test_split(X_traindev,y_traindev,
# #                                                      test_size=0.2,
# #                                                      random_state=33,
# #                                                     shuffle=True)

# print(f"The X_training size is {X_train.shape} and y_train is {y_train.shape}")
# # print(f"The X_dev size is {X_dev.shape} and y_train is {y_dev.shape}")
# print(f"The X_test size is {X_test.shape} and y_train is {y_test.shape}")

# <font color=green size=5> Transformation/Scaling for numerical features [training dataset] <font>

In [None]:
# convert an existing Python function into a transformer 
from sklearn.preprocessing import FunctionTransformer
from scipy.stats import skew

In [None]:

def boxPlot(df,num_features):
    '''
    For numerical features, draw one boxplot of the target per value of
    each feature. Works well only when a feature has a limited number of
    distinct values.

    NOTE: a function with this name is also defined earlier in the notebook;
    this later definition is the one in effect from here on.

    args:
        df: pd dataframe
        num_features: a list of numerical feature names with the target
                      name as the LAST element
    return:
        None
    '''
    n_plots=len(num_features)-1
    if n_plots<1:
        return

    target=num_features[-1]
    # squeeze=False: always get an array of axes, even for one feature
    fig,ax=plt.subplots(n_plots,
                        figsize=(6,5*n_plots),
                        squeeze=False)
    for axis,feature in zip(ax.ravel(),num_features[:-1]):
        # each boxplot must be sent to its own axes; without `ax=` seaborn
        # draws every plot on the current (last created) axes
        sns.boxplot(x=df[feature],y=df[target],ax=axis)
        axis.set_xlabel(feature)
        axis.set_ylabel(target)
        
def histDist(X_set,num_features):
    '''
    For numerical features, plot one histogram with a KDE curve per column.
    The name of each column is printed as it is plotted.

    args:
        X_set: pd dataframe holding the columns to plot
        num_features: a list of num_feature names
    return:
        None
    '''
    n_plots=len(num_features)
    if n_plots==0:
        return

    # squeeze=False keeps `ax` indexable when a single column is requested
    fig,ax=plt.subplots(n_plots,figsize=(6,5*n_plots),squeeze=False)
    ax=ax.ravel()
    for i,feature in enumerate(num_features):
        print(feature)
        # plot the frame that was passed in, not the notebook-global `df`
        sns.histplot(X_set[feature],kde=True,ax=ax[i])
        ax[i].set_xlabel(feature)


In [None]:

def histDistY(y,yname):
    '''
    Inspect the distribution of a single numerical variable (typically the
    target) with two side-by-side plots: a probability histogram with a
    KDE curve (left) and a probability plot from scipy.stats.probplot
    (right).

    args:
        y: a pd.series
        yname: (str) label used for the x-axis of the histogram
    return:
        None
    '''
    fig,ax=plt.subplots(1,2,figsize=(14,6))

    # kde is a boolean flag; the string 'True' only worked because any
    # non-empty string is truthy
    sns.histplot(y ,
                 kde=True,
                 stat='probability'
                 ,ax=ax[0])

    ax[0].set_xlabel(yname)
    stats.probplot(y,plot=ax[1])

In [None]:
# from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import QuantileTransformer,MinMaxScaler,StandardScaler
from scipy import stats
from scipy.stats import boxcox, yeojohnson

In [None]:
print(f"The skewness :{y_train.skew()}")
histDistY(y_train, 'price')

In [None]:
# y_train_log=y_train.map(np.log)
log_trans = FunctionTransformer(func=np.log, 
                                validate=True,
                               inverse_func=np.exp)

# log_trans.fit(pd.DataFrame(y_train))
y_train_log=log_trans.transform(pd.DataFrame(y_train))
y_train_log=np.squeeze(y_train_log)
 
print(f"The skewness after log transformation :{skew(y_train_log)}")
histDistY(pd.Series(y_train_log), 'price')

In [None]:
num_features2

In [None]:

print(f"The skewness :{X_train['model_year'].skew()}")
print(f"The skewness :{X_train['milage'].skew()}")
print(f"The skewness :{X_train['hp'].skew()}")
print(f"The skewness :{X_train['engineVolume_L'].skew()}")

histDist(X_train,['model_year','milage','hp','engineVolume_L'])

In [None]:
# milage
sqrt_trans = FunctionTransformer(np.sqrt, validate=True)
X_train['sqrt_milage']=sqrt_trans.transform(pd.DataFrame(X_train['milage']))
X_test['sqrt_milage']=sqrt_trans.transform(pd.DataFrame(X_test['milage']))
print(f"The skewness :{X_train['milage'].skew()}")
print(f"The skewness :{X_train['sqrt_milage'].skew()}")

fig, axes=plt.subplots(2,2,figsize=(20,20))
sns.histplot(X_train['milage'],kde=True,stat='probability',ax=axes[0,0])
stats.probplot(X_train['milage'],plot=axes[0,1])
sns.histplot(X_train['sqrt_milage'],kde=True,stat='probability',ax=axes[1,0])
stats.probplot(X_train['sqrt_milage'],plot=axes[1,1])

In [None]:
# hp
# sqrt_trans = FunctionTransformer(np.sqrt, validate=True)
X_train['sqrt_hp']=sqrt_trans.transform(pd.DataFrame(X_train['hp']))
X_test['sqrt_hp']=sqrt_trans.transform(pd.DataFrame(X_test['hp']))
print(f"The skewness :{X_train['hp'].skew()}")
print(f"The skewness :{X_train['sqrt_hp'].skew()}")

fig, axes=plt.subplots(2,2,figsize=(20,20))
sns.histplot(X_train['hp'],kde=True,stat='probability',ax=axes[0,0])
stats.probplot(X_train['hp'],plot=axes[0,1])
sns.histplot(X_train['sqrt_hp'],kde=True,stat='probability',ax=axes[1,0])
stats.probplot(X_train['sqrt_hp'],plot=axes[1,1])

In [None]:
# engineVolume_L
# sqrt_trans = FunctionTransformer(np.sqrt, validate=True)
X_train['sqrt_engineVolume_L']=sqrt_trans.transform(pd.DataFrame(X_train['engineVolume_L']))
X_test['sqrt_engineVolume_L']=sqrt_trans.transform(pd.DataFrame(X_test['engineVolume_L']))

print(f"The skewness :{X_train['engineVolume_L'].skew()}")
print(f"The skewness :{X_train['sqrt_engineVolume_L'].skew()}")

fig, axes=plt.subplots(2,2,figsize=(20,20))
sns.histplot(X_train['engineVolume_L'],kde=True,stat='probability',ax=axes[0,0])
stats.probplot(X_train['engineVolume_L'],plot=axes[0,1])
sns.histplot(X_train['sqrt_engineVolume_L'],kde=True,stat='probability',ax=axes[1,0])
stats.probplot(X_train['sqrt_engineVolume_L'],plot=axes[1,1])

In [None]:
X_train[['model_year']].describe()

In [None]:
# model_year
mm_scaler=MinMaxScaler()
mm_features=['mm_model_year']
minmax_scaler=mm_scaler.fit(X_train[['model_year']])
X_train[mm_features]=minmax_scaler.transform(X_train[['model_year']])
X_test[mm_features]=minmax_scaler.transform(X_test[['model_year']])

print(X_train[mm_features].describe())
print(f"The skewness :{X_train[['model_year']].skew()}")

In [None]:
# model_year
# Apply the SAME power transform to train and test. Before, the training
# column was overwritten with x**3 while the test column stayed at x**2, so
# the model was fitted and scored on differently scaled features.
# NOTE: the column keeps its historical name 'squared_mm_model_year' because
# later cells refer to it, although it holds the cube of mm_model_year.
cube_trans = FunctionTransformer(lambda x:x**3, validate=True)
X_train['squared_mm_model_year']=np.ravel(cube_trans.transform( X_train[['mm_model_year']]))
X_test['squared_mm_model_year']=np.ravel(cube_trans.transform( X_test[['mm_model_year']]))


print(f"The skewness :{X_train['mm_model_year'].skew()}")
print(f"The skewness :{X_train['squared_mm_model_year'].skew()}")
fig, axes=plt.subplots(2,2,figsize=(20,20))
sns.histplot(X_train['mm_model_year'],kde=True,stat='probability',ax=axes[0,0])
stats.probplot(X_train['mm_model_year'],plot=axes[0,1])

sns.histplot(X_train['squared_mm_model_year'],kde=True,stat='probability',ax=axes[1,0])
stats.probplot(X_train['squared_mm_model_year'],plot=axes[1,1])

In [None]:
X_train.drop(columns=['model_year','mm_model_year',\
                      'milage','engineVolume_L','hp'],inplace=True)

X_test.drop(columns=['model_year','mm_model_year',\
                      'milage','engineVolume_L','hp'],inplace=True)

In [None]:
std_scaler= StandardScaler()
column_names= [ col for col in X_train.columns 
               if col not in oh_features+['transmission','accident']]

std_scaler.fit(X_train[column_names])
std_features=std_scaler.transform(X_train[column_names])
std_features_test=std_scaler.transform(X_test[column_names])

X_train[column_names]=pd.DataFrame(std_features,columns=column_names)
X_test[column_names]=pd.DataFrame(std_features_test,columns=column_names)

# 2 Feature engineering
- remove the unimportant features
- use an XGBoost model to measure the importance of each feature

In [None]:
X_train.shape

In [None]:
# import tensorflow_decision_forests as tfdf
import xgboost as xgb

In [None]:
 def obj_to_cat(df,cat_cols):
    '''
    Convert the given columns of `df` to the pandas 'category' dtype,
    IN PLACE. Missing values in a converted column are replaced by an
    explicit 'NA' category. The list of columns is printed.

    args:
        df: pd dataframe (modified in place)
        cat_cols: a (list) of column names to convert

    return: None
    '''
    print(cat_cols)
    for col in cat_cols:
        # 1st convert it to the categorical data type
        df[col]=df[col].astype('category')
        # 2nd if there are null values, fill them with a dedicated 'NA' level
        if df[col].isnull().any():
            # add_categories raises ValueError when the level already exists
            # (e.g. the column contains the literal string 'NA')
            if 'NA' not in df[col].cat.categories:
                df[col]=df[col].cat.add_categories(['NA'])
            df[col]=df[col].fillna('NA')


In [None]:
# xgb_reg=xgb.XGBRegressor(enable_categorical=True)
# xgb_features=[col for col in X_train.columns 
#          if col not in oh_features+grouped_features+\
#               ['model_year','mm_model_year','milage_format']]

# # cat_names=X_train[xgb_features].select_dtypes(exclude=[np.number]).columns.tolist()
# # ojb_to_cat(X_train,cat_names)

# xgb_reg.fit(X_train[xgb_features],y_train_log)

In [None]:
len(oh_features)

In [None]:
xgb_reg=xgb.XGBRegressor(enable_categorical=True) # with default h-params
# cat_names=X_train[xgb_features].select_dtypes(exclude=[np.number]).columns.tolist()
# ojb_to_cat(X_train,cat_names)

xgb_reg.fit(X_train,y_train_log)

In [None]:
impt_features=pd.DataFrame(xgb_reg.feature_importances_,
                           columns=['importance'],
                           index=X_train.columns)
impt_features.sort_values(by='importance',ascending=False,inplace=True)
print(impt_features.head(40))
impt_features.plot.barh()

In [None]:
# using RFECV to determine the optimal number of features to keep
from sklearn.feature_selection import RFECV
from sklearn.metrics import make_scorer, mean_squared_error

def rmse(y_ture,y_pred):
    '''
    Root-mean-squared error between the targets and the predictions.

    args:
        y_ture: ground-truth target values (parameter name kept as-is so
                existing callers are unaffected)
        y_pred: predicted target values

    return: the RMSE as a non-negative float
    '''
    # The previous implementation passed squared=True, which returns the MSE.
    # Taking the square root explicitly yields a true RMSE and avoids the
    # `squared` keyword, which newer scikit-learn releases deprecate.
    return np.sqrt(mean_squared_error(y_ture,y_pred))

# greater_is_better=False makes the scorer return the NEGATED RMSE, because
# scikit-learn model selection always maximises the score. Values obtained
# through rmse_scorer are therefore <= 0 (closer to 0 is better).
rmse_scorer=make_scorer(rmse,greater_is_better=False)

In [None]:

# Recursive feature elimination with cross-validation (RFECV).
# rmse_scorer is built with greater_is_better=False, so the scores RFECV
# records are sign-flipped errors: closer to 0 is better.

min_features_to_select=5

# NOTE: this fit is expensive (a model is refitted for every removed feature in
# every fold), but the following cells read `rfecv`, so it has to run for the
# notebook to survive "Restart & Run All".
rfecv=RFECV(xgb_reg,#estimator
            step=1,# num of weakest feature to remove at each iteration
            cv=5,# 5-fold cross-validation for every candidate feature count
            verbose=False,
            scoring=rmse_scorer,
            min_features_to_select = min_features_to_select)

rfecv.fit(X_train ,y_train_log)
print("Optimal number of features : %d" %rfecv.n_features_)

In [None]:
rfecv.cv_results_.keys()

In [None]:
len(rfecv.cv_results_['mean_test_score'])

In [None]:
# Cross-validated score versus the number of features RFECV kept.
rfecv_results = rfecv.cv_results_
n_subsets_of_features = len(rfecv_results["mean_test_score"])
# one point per candidate feature count, starting at min_features_to_select
feature_counts = range(min_features_to_select,
                       n_subsets_of_features + min_features_to_select)

fig, ax = plt.subplots()
ax.errorbar(
    feature_counts,
    rfecv_results["mean_test_score"],
    yerr=rfecv_results["std_test_score"],
)
ax.set_xlabel("Number of features selected")
ax.set_ylabel("Mean test score")
ax.set_title("Recursive Feature Elimination \nwith correlated features")
# plt.vlines(rfecv.n_features_,-0.6,-0.4,color='r')
plt.show()

# boolean mask from RFECV -> names of the surviving columns
features_kept = X_train.columns.values[rfecv.support_]

# X_train_imp= X_train[features_kept]
# X_test_imp=X_test[features_kept]

In [None]:
# Columns present in BOTH train and test, kept in X_train's column order.
# A set intersection would return them in an arbitrary order that changes
# between kernel sessions (string hashing is randomised), which makes the
# feature matrix - and anything fitted on it - non-reproducible.
# dict.fromkeys drops duplicate labels while preserving first-seen order.
names=list(dict.fromkeys(col for col in X_train.columns if col in X_test.columns))

In [None]:
 len(impt_features[(impt_features.importance>0)].index)

In [None]:
X_train_impt=X_train[names].copy() #[impt_features[(impt_features.importance>0)].index]

In [None]:
y_train_log.mean()


In [None]:
y_train_log.max()

In [None]:
y_train_log.min()

In [None]:
y_train_log.std()

In [None]:
X_train_impt.describe()

## 3 Model

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [None]:
# # not used in this competition
# '''
# generate a pipeline that
# treats different features with different transformers,
# while leaving some features untransformed

# # in training dataset  
# # y_train: y_train_log()  transform
# # milage_format: sqrt_trans() transform
# # model_year minmax_scaler() --> cube_trans()

# '''

# mm_scaler= MinMaxScaler()
# passthrough= 'passthrough'
# raw_features=[ col for col in X_train.columns 
#               if col not in ['milage_format','model_year']]


# #  define the transformers for different features

# num_trasnformer1=Pipeline(steps=[
#     ('mm scaler',mm_scaler),
#     ('cube trans',cube_trans)])
# num_tranformer2=Pipeline(steps=[
#     ('sqrt_trans',sqrt_trans)
# ])

# cat_transformer=Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
#     ('onehot', OneHotEncoder(handle_unknown='ignore'))
# ])

# # creat a columnTransformer
# preprocessor=ColumnTransformer(
# transformers=[
#     ('milage',num_tranformer2,['milage_format']),
#     ('model_year1',num_trasnformer1, ['model_year']),
#     ('others',passthrough, raw_features)
# ])

# # then you can 
# # preprocessor.fit_transform(df)

## 3.1 preprocess for test dataset

## 3.1 Model selection using X_train and y_train cv

In [None]:
from sklearn.linear_model import LinearRegression, RidgeCV

## 3.1.1 linear regression

In [None]:
linear=LinearRegression()
linear.fit(X_train_impt,y_train_log)

In [None]:
# In-sample check of the linear model: predictions are made on the same
# X_train_impt it was fitted on, so both numbers below describe the training
# fit, not held-out ("dev") error.
y_pred_linear=linear.predict(X_train_impt)
print(f"The dev RMSE is { rmse( y_train_log,y_pred_linear):.4f}")
# linear.score is the estimator's own score method, labelled R^2 in the message
print(f"The socre of R^2 for linear is %.4f" \
      %(linear.score(X_train_impt,y_train_log)))

In [None]:
# True vs. predicted targets for the linear model, stored both as given
# (y_train_log / y_pred_linear) and after np.exp; the log-scale pair is plotted.
# NOTE(review): np.exp is assumed to invert the transform that produced
# y_train_log; a later cell uses log_trans.inverse_transform instead - confirm
# the two agree.
_y=pd.DataFrame({'y_true_log':y_train_log,
               'y_pred_log':y_pred_linear,
                'y_true':np.exp(y_train_log),
                'y_pred':np.exp(y_pred_linear)})
plt.scatter(_y['y_true_log'],
           _y['y_pred_log'])

In [None]:
plt.scatter(_y['y_true'],
           _y['y_pred'])

## 3.1.2 RidgeCV regression

In [None]:

# Ridge regression whose penalty strength is picked by RidgeCV from 13
# log-spaced candidates (1e-6 ... 1e6), scored with rmse_scorer.
# The explicit cv=5 is disabled, so RidgeCV's default CV strategy is used.
# NOTE(review): store_cv_values appears to have been renamed store_cv_results
# in recent scikit-learn releases - confirm against the installed version.
reg_ridgeCV=RidgeCV(alphas=np.logspace(-6, 6, 13),
#                     cv=5,
                    scoring=rmse_scorer,
                   store_cv_values=True
                   )
reg_ridgeCV.fit(X_train_impt,y_train_log)

# In-sample check: predictions are made on the data the model was fitted on,
# so the figures below describe the training fit rather than held-out error.
y_pred_ridge=reg_ridgeCV.predict(X_train_impt)
print(f"Best alpha is {reg_ridgeCV.alpha_:.0e}" )
print(f"The dev RMSE is { rmse( y_train_log, y_pred_ridge):.4f}")
print(f"The socre of R^2 for reg_ridgeCV is %.4f" \
      %(reg_ridgeCV.score(X_train_impt,y_train_log)))

In [None]:
# True vs. predicted targets for the RidgeCV model, stored both as given
# (y_train_log / y_pred_ridge) and after np.exp; the log-scale pair is plotted.
# NOTE(review): np.exp is assumed to invert the transform that produced
# y_train_log; a later cell uses log_trans.inverse_transform instead - confirm
# the two agree.
_y=pd.DataFrame({'y_true_log':y_train_log,
               'y_pred_log':y_pred_ridge,
                'y_true':np.exp(y_train_log),
                'y_pred':np.exp(y_pred_ridge)})
plt.scatter(_y['y_true_log'],
           _y['y_pred_log'])


In [None]:
plt.scatter(_y['y_true'],
           _y['y_pred'])


## 3.1.3 XGBregressor

In [None]:
from sklearn.model_selection import RepeatedKFold, cross_val_score

In [None]:
# model
# eval_metric takes a metric name (or a plain (y_true, y_pred) callable); the
# sklearn scorer object passed here before has a different call signature, so
# the built-in 'rmse' metric is used instead.
reg_xgb=xgb.XGBRegressor(eval_metric='rmse')
# Evaluate the model with repeated k-fold cross-validation
cv=RepeatedKFold(n_splits=10,
                 n_repeats=3,
                 random_state=1)
#evaluate model
cv_scores=cross_val_score(reg_xgb,
                          X=X_train_impt,
                          y=y_train_log,
                         scoring=rmse_scorer,
                         cv=cv)
# rmse_scorer returns negated errors (greater_is_better=False); flip the sign
# of the mean so the reported RMSE is positive. cv_scores keeps the raw scores.
print(f" Mean RMSE: %.3f+-(%.3f)" %( -cv_scores.mean(),cv_scores.std()))

# xgb_params={}
# DMatrix=xgb.DMatrix(data=X_dev_impt,
#          label=y_dev_log)


In [None]:
 cv_scores

In [None]:
reg_xgb.fit(X_train_impt,y_train_log)


In [None]:
X_train.columns

In [None]:
len(y_train_log)

In [None]:
# In-sample predictions of the default XGBoost model, collected next to the
# true targets both as given and after np.exp; the log-scale pair is plotted.
# NOTE(review): np.exp is assumed to invert the transform that produced
# y_train_log; a later cell uses log_trans.inverse_transform instead - confirm
# the two agree.
y_pred_xgb=reg_xgb.predict(X_train_impt)

_y=pd.DataFrame({'y_true_log':y_train_log,
               'y_pred_log':y_pred_xgb,
                'y_true':np.exp(y_train_log),
                'y_pred':np.exp(y_pred_xgb)})
plt.scatter(_y['y_true_log'],
           _y['y_pred_log'])


In [None]:
plt.scatter(_y['y_true'],
           _y['y_pred'])

In [None]:
reg_xgb.get_booster()

In [None]:
print(reg_xgb.get_params())

## 3.2 XGBoost tuning

In [None]:
from sklearn.model_selection import learning_curve

In [None]:
# training datasize


def plot_learning_curve(model,X,y):
    '''
    Plot training and validation RMSE against the training-set size.

    Runs sklearn's learning_curve with 10-fold CV at five training-set
    fractions, scored by the module-level rmse_scorer, then draws the mean
    error per size with a +/- 2 standard-deviation band for each curve.

    args:
        model: estimator handed to learning_curve
        X: X_train
        y: y_train
    return:
        None. Shows the figure and prints the absolute training-set sizes.
    '''
    # train_sizes: (n_ticks,)
    # train_scores / test_scores: (n_ticks, n_cv_folds)
    train_sizes,train_scores,test_scores=learning_curve(
        estimator=model,
        X=X,
        y=y,
        train_sizes=[0.1, 0.33, 0.55, 0.78, 1.], #n_ticks=5
        cv=10,
        scoring=rmse_scorer)

    # rmse_scorer is created with greater_is_better=False, so the scores come
    # back negated. Flip the sign so the y-axis shows positive errors that
    # match the axis label.
    curves=(
        (-train_scores,'Training RMSE','blue'),
        (-test_scores,'Validation RMSE','green'),
    )

    fig,ax=plt.subplots(figsize=(10, 6))
    for errors,label,color in curves:
        # aggregate over the CV folds for each training-set size
        errors_mean=np.mean(errors,axis=1)
        errors_std=np.std(errors,axis=1)
        ax.plot(train_sizes,errors_mean,label=label,color=color)
        ax.fill_between(train_sizes,
                        errors_mean - 2*errors_std,
                        errors_mean + 2*errors_std,
                        alpha=0.2,color=color)

    ax.set_xlabel('Training Set Size')
    ax.set_ylabel('Mean RMSE')
    ax.set_title('Learning Curve')
    ax.legend(loc='best')
    ax.grid(True)
    plt.show()
    print(train_sizes)
    

In [None]:
X_train_impt.shape

In [None]:
import time

In [None]:
# training datasize

reg_xgb=xgb.XGBRegressor(eval_metric=rmse_scorer)
%time plot_learning_curve(reg_xgb,X_train_impt,y_train_log)

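<b> [comments]</b>

The training RMSE is lower than the validation (dev) RMSE, i.e. RMSE_train < RMSE_dev. This gap indicates high variance: the model is overfitting.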

In [None]:
# search on hyper parameters
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV

In [None]:
# Randomised hyper-parameter search for XGBoost.
# default n_estimators=100,max_depth=6, learning_Rate=0.3
# eval_metric takes a metric name (or a plain (y_true, y_pred) callable), not
# an sklearn scorer object, so the built-in 'rmse' metric is used.
# rmse_scorer itself is already defined next to rmse(); it used to be rebuilt
# here only after its first use, which was redundant.
reg_xgb=xgb.XGBRegressor(eval_metric='rmse')

# candidate values; 10 random combinations are tried with 5-fold CV each
param_grid={'n_estimators':[100,500,1000,2000],
            'max_depth':[3,5,8,10],
            'learning_rate':[0.01,0.05,0.1,0.15]}
rand_search_cv=RandomizedSearchCV(estimator=reg_xgb,
#                             param_grid=param_grid,
                          param_distributions=param_grid,
                                  cv=5,
                          n_iter=10,
                          scoring=rmse_scorer, #'neg_mean_squared_error',
                           verbose=4,
                          random_state=1234,
                          )

In [None]:
%%time
rand_search_cv.fit(X_train_impt,y_train_log)

print()
best_xgb=rand_search_cv.best_estimator_
print(f"Mean RMSE for the best model is {rand_search_cv.best_score_:.2f}")
print(f"best params_  for the best model is {rand_search_cv.best_params_}")
   

In [None]:

best_xgb

In [None]:
plot_learning_curve(best_xgb,X_train_impt,y_train_log)

In [None]:
# In-sample predictions of the tuned XGBoost model, collected next to the true
# targets both as given and after np.exp; the log-scale pair is plotted.
# NOTE(review): np.exp is assumed to invert the transform that produced
# y_train_log; a later cell uses log_trans.inverse_transform instead - confirm
# the two agree.
y_pred_best_xgb=best_xgb.predict(X_train_impt)

_y=pd.DataFrame({'y_true_log':y_train_log,
               'y_pred_log':y_pred_best_xgb,
                'y_true':np.exp(y_train_log),
                'y_pred':np.exp(y_pred_best_xgb)})
plt.scatter(_y['y_true_log'],
           _y['y_pred_log'])


In [None]:
plt.scatter(_y['y_true'],
           _y['y_pred'])
# plt.xlim(1,2e6)
# plt.ylim(1,60000)

In [None]:
# Training error of the tuned model after undoing the target transform.
# The predictions are wrapped in a one-column 'price' frame before being handed
# to log_trans.inverse_transform (presumably because the transformer was fitted
# on a 'price' column - TODO confirm).
y_train_log_pred=pd.DataFrame({'price':best_xgb.predict(X_train_impt)})

print(y_train_log_pred[0:5])
# np.squeeze removes length-1 axes from the inverse-transformed result
y_train_pred=np.squeeze(log_trans.inverse_transform(y_train_log_pred ))
print(y_train_pred[0:5])
print(y_train[0:5])
# compares against y_train (the untransformed target), not y_train_log
rmse_train=rmse(y_train,y_train_pred)

# rmse_train.mean()
# rmse_train.std()

In [None]:
rmse_train

In [None]:
# names=list(set(X_test.columns).intersection( set(X_train.columns)))

In [None]:
X_test=X_test[names]

In [None]:
y_train[0:5]

In [None]:
y_train_log[0:5]

# Error Analysis

In [None]:
# Actual vs. predicted targets on the training set, after np.exp.
# Two panels are created, but only the left one (train) is drawn; the
# test-set panel below is disabled, so ax2 stays empty.
fig, (ax1,ax2)=plt.subplots(1,2,figsize=(15,5))
# NOTE(review): `y_pred` is not assigned anywhere in this part of the notebook;
# the tuned model's predictions are stored in y_pred_best_xgb - confirm which
# predictions are meant here.
ax1.scatter(np.exp(y_train_log),np.exp(y_pred))
ax1.set_xlabel('actual')
ax1.set_ylabel('prediction')
ax1.set_title('Train')
# ax2.scatter(y_test,test_prediction,c=(y_test-test_prediction).abs(),cmap='autumn')
# ax2.set_xlabel('actual')
# ax2.set_ylabel('prediction')
# ax2.set_title('Test')