<a href="https://colab.research.google.com/github/MarioMarkov/cars-predict/blob/master/cars_predict.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [102]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedKFold
from sklearn.feature_selection import mutual_info_regression
from sklearn.preprocessing import OneHotEncoder
from pandas.plotting import scatter_matrix
import seaborn as sns
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
%matplotlib inline 
 


#Load data
data = pd.read_csv('/content/cars-data2.csv', index_col=None)
test = pd.read_csv('/content/test-data.csv',  index_col=None)
#data.price = np.log10(data.price) 
#data.drop(['fuel'],axis='columns')
#test.drop(['fuel'],axis='columns')

# Remove Id column
data = data.drop(['id'],axis='columns')
test = test.drop(['price'],axis='columns')


# Remove brands that are seen less than 200 times
data = data.groupby('brand').filter(lambda x :len(x)>200)

# Format BMW model
#data[data.brand == 'BMW'].model.apply(lambda x: x[1] )
def format_bmw_model(model_name):
  if model_name.__contains__('X') or model_name.__contains__('i'):
    return model_name
  return model_name[0]

# Trim model to just 1 letter except if it is X or i ex.(318 to 3)
data.loc[data['brand'] == 'BMW', ['model']] = data[data.brand == 'BMW'].model.apply(lambda x: format_bmw_model(x))

# Remove models that are met less than 9 times
brands = data.brand.value_counts().index
for brand in brands:
    data.loc[data['brand'] == brand, ['model']] = data.loc[data['brand'] == brand].groupby('model').filter(lambda x :len(x)>9)



#print(len(data))
# Remove rows with missing kms 
#data = data[pd.to_numeric(data['kms'], errors='coerce').notnull()]


# Print columns that have missing values 
#print(data.apply(lambda x: sum(x.isnull()),axis=0) )

# Impute columns records with missing values
data.kms.fillna(data.kms.median(), inplace = True)
data = data.fillna(data.mode().iloc[0])
print("2 door missing " + str(len(data.loc[data["2door"].isnull() ])))
print("transmission missing " + str(len(data.loc[data["transmission"].isnull() ])))
print("color missing "  + str(len(data.loc[data["color"].isnull() ])))
print("type missing " + str(len(data.loc[data["type"].isnull() ])))



# Remove outliers in IQR 
Q3 = np.quantile(data.price, 0.85)
Q1 = np.quantile(data.price, 0.10)
IQR = Q3 - Q1
lower_range = Q1 - 1.5 * IQR
upper_range = Q3 + 1.5 * IQR
outlier_free_list = [x for x in data.price if (
    (x > lower_range) & (x < upper_range))]
data = data.loc[data.price.isin(outlier_free_list)]

Q3 = np.quantile(data.kms, 0.90)
Q1 = np.quantile(data.kms, 0.28)
IQR = Q3 - Q1
lower_range = Q1 - 1.5 * IQR
upper_range = Q3 + 1.5 * IQR
outlier_free_list = [x for x in data.kms if (
    (x > lower_range) & (x < upper_range))]
data = data.loc[data.kms.isin(outlier_free_list)]

# Encoding string columns to numeric

ordinal_enc_cols = ['brand','model']
one_hot_columns = ['fuel']

ordinal_encoder = OrdinalEncoder()
data[ordinal_enc_cols] = ordinal_encoder.fit_transform(data[ordinal_enc_cols])
test[ordinal_enc_cols] = ordinal_encoder.transform(test[ordinal_enc_cols])

#scatter_matrix(data[['price','kms','year']], figsize=(12, 8))
#sns.catplot(data=data, x="fuel", y="price",kind="box")

# Apply one-hot encoder to fuel column
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
oh_columns_data = pd.DataFrame(OH_encoder.fit_transform(data[one_hot_columns]))
oh_columns_test = pd.DataFrame(OH_encoder.transform(test[one_hot_columns])) 

# One-hot encoding removed index; put it back
oh_columns_data.index = data.index
oh_columns_test.index = test.index

# Remove categorical columns (will replace with one-hot encoding)
num_X_data = data.drop(one_hot_columns, axis=1)
num_X_test = test.drop(one_hot_columns, axis=1)

# Add one-hot encoded columns to numerical features
data = pd.concat([num_X_data, oh_columns_data], axis=1)
test = pd.concat([num_X_test, oh_columns_test], axis=1)

# Standardize prize variable
scaler = StandardScaler()
data[['kms', 'year']] = scaler.fit_transform(data[['kms', 'year']])



# Train set without price col
X = data.drop(['price'],axis='columns')

# Train set price col
y = data.price



# Getting mutual inforamtion scores
for colname in X.select_dtypes("object"):
    X[colname], _ = X[colname].factorize()
for colname in X.select_dtypes("float"):
    X[colname], _ = X[colname].factorize()
y = y.round(0).astype(int)
discrete_features = X.dtypes == int

mi_scores = mutual_info_regression(X, y, discrete_features=discrete_features)




# Space of possible hyperparameters
# params = {
#     "learning_rate": [0.01,0.02,0.03,0.05,0.1], # default 0.1 
#     "n_estimators": range(200, 800,100) # default 100
# }
# model = GridSearchCV(xgb_model, param_grid=params, cv=2, verbose=1,n_jobs=1, return_train_score=True)

# Defining model to try hyperparameters from space on
model = XGBRegressor(random_state=1,objective='reg:squarederror')



# Fit model
model.fit(X,y)

#Calculate error 
mae = -1 * cross_val_score(model, X, y,
                                  cv=3,
                                  scoring='neg_mean_absolute_error')


#Supress scientific notation
pd.options.display.float_format = '{:.10f}'.format

print(mae.mean())

#submission_predictions =  model.predict(test)
#submission_predictions = submission_predictions
#print(submission_predictions)
#data.hist(bins=50, figsize=(20,15))


#corr_matrix = data.corr()
#corr_matrix["price"].sort_values(ascending=False)





2865.572296544125
   brand        price
0     27  2009.866821
1      1  7811.062012


In [86]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedKFold
from sklearn.neighbors import LocalOutlierFactor



data = pd.read_csv('/content/cars-data.csv')
data = data[1:]


print(data.price.describe())


Q3 = np.quantile(data.price, 0.75)
Q1 = np.quantile(data.price, 0.25)
IQR = Q3 - Q1
print("IQR value for column %s is: %s" % ('price', IQR))
global outlier_free_list
global filtered_data
lower_range = Q1 - 1.5 * IQR
upper_range = Q3 + 1.5 * IQR
outlier_free_list = [x for x in data.price if (
    (x > lower_range) & (x < upper_range))]
filtered_data = data.loc[data.price.isin(outlier_free_list)]
 
print(filtered_data.price.describe())


count       995.000000
mean      13369.934673
std       12928.062509
min         650.000000
25%        5600.000000
50%        9499.000000
75%       16500.000000
max      144444.000000
Name: price, dtype: float64
IQR value for column price is: 10900.0
count      928.000000
mean     10841.085129
std       7128.286736
min        650.000000
25%       5500.000000
50%       8850.000000
75%      14500.000000
max      32500.000000
Name: price, dtype: float64
