In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
# Cross-validation splitter with 10 folds, read by modelling_tree below.
# No shuffle or random_state argument is passed, so KFold's defaults apply.
fold = KFold(n_splits=10)

In [2]:
# Read the four competition files from the working directory, in this order:
# training rows, test rows, submission template, and variable descriptions.
train, test, sub, var = (
    pd.read_csv(csv_name)
    for csv_name in (
        "Train.csv",
        "Test.csv",
        "SampleSubmission.csv",
        "VariableDefinitions.csv",
    )
)

In [3]:
# Preview the first rows of the training data.
train.head()

Unnamed: 0,VehicleID,Location,Maker,Model,Year,Colour,Amount (Million Naira),Type,Distance
0,VHL12546,Abuja,Honda,Accord Coupe EX V-6,2011,Silver,2.2,Nigerian Used,
1,VHL18827,Ibadan,Hyundai,Sonata,2012,Silver,3.5,Nigerian Used,125000.0
2,VHL19499,Lagos,Lexus,RX 350,2010,Red,9.2,Foreign Used,110852.0
3,VHL17991,Abuja,Mercedes-Benz,GLE-Class,2017,Blue,22.8,Foreign Used,30000.0
4,VHL12170,Ibadan,Toyota,Highlander,2002,Red,2.6,Nigerian Used,125206.0


In [4]:
# Keep only training rows whose price is strictly positive (rows with a
# missing price fail the comparison and are dropped too), then renumber rows.
has_positive_price = train['Amount (Million Naira)'].gt(0)
train = train.loc[has_positive_price].reset_index(drop=True)

In [5]:
# Record the number of training rows, then stack train on top of test under a
# fresh 0..n-1 row index.
ntrain = len(train)
df = pd.concat([train, test], ignore_index=True)

In [6]:
# Inspect the dtype of every column in the combined frame.
df.dtypes

VehicleID                  object
Location                   object
Maker                      object
Model                      object
Year                       object
Colour                     object
Amount (Million Naira)    float64
Type                       object
Distance                   object
dtype: object

In [7]:
# Print every row of `var` as "<name>:  <description>". The two labels below
# are the column names `var` is indexed with: the first holds the variable
# names, the second their descriptions.
name_col = 'VehicleID'
desc_col = 'This is the unique identifier of the car.'
for var_name, var_desc in zip(var[name_col], var[desc_col]):
    print(f"{var_name}:  {var_desc}")

Location:  This is the location in Nigeria where the seller is based.
Maker:  This is the manufacturer of the car. It is the brand name.
Model:  This is the the name of the car product within a range of similar car products.
Year:  This is the year the car was manufactured.
Colour:  This is the colour of the car.
Amount (Million Naira):  This is the selling price of the car. It is the amount the company will sell the car.
Type:  This is the nature of previous use of the car, whether it was previously used within Nigeria or outside Nigeria.
Distance:  This is the mileage of the car. It is how much distance it covered in its previous use


In [8]:
# Remove the 'Colour' column from the combined frame.
df = df.drop(columns=['Colour'])

In [9]:
def _parse_distance(raw):
    """Convert one raw ``Distance`` entry to an integer mileage, or NaN.

    Parameters
    ----------
    raw : str, int, float or NaN
        A single cell of the ``Distance`` column. Text may contain commas as
        thousands separators.

    Returns
    -------
    int or float
        The mileage as an ``int``; ``np.nan`` when the entry is missing or the
        parsed mileage is 0.

    Raises
    ------
    ValueError
        If the entry is text that is not a number once commas are removed.
    """
    if pd.isna(raw):
        return np.nan
    # Strip *every* comma: keeping only the first two comma-separated groups
    # would turn "1,250,000" into 1250. Parsing through float() also accepts
    # text such as "125000.0"; int() then truncates any fractional part.
    mileage = int(float(str(raw).replace(",", "")))
    # A mileage of 0 is treated as "not recorded", the same as a missing cell.
    return np.nan if mileage == 0 else mileage


df['Distance'] = df['Distance'].map(_parse_distance)

In [10]:
# Every column except the target is a feature.
target_name = 'Amount (Million Naira)'
features = [col for col in df.columns if col != target_name]
# Positions (within `features`) of the columns whose dtype is object ...
cat_feat = np.where(df[features].dtypes == object)[0]
# ... and the names of those columns, in the same order.
to_encode = [features[pos] for pos in cat_feat]

In [11]:
# Replace every column listed in `to_encode` with the output of
# LabelEncoder.fit_transform. One LabelEncoder instance is created and its
# fit_transform is applied to each column in turn, so it is refit per column
# and no fitted encoder is kept afterwards.
# NOTE(review): `df` was built from train and test together, so the codes are
# learned from both sets of rows — confirm that is intended.
df[to_encode] = df[to_encode].apply(LabelEncoder().fit_transform)

In [12]:
# Split the combined frame back apart at the training-row count: the leading
# rows are the training part, the remainder is the test part (renumbered
# from 0).
n_train_rows = train.shape[0]
new_train = df[:n_train_rows]
new_test = df[n_train_rows:].reset_index(drop=True)

In [13]:
# The identifier column is removed from both model inputs.
new_train = new_train.drop(columns=["VehicleID"])
new_test = new_test.drop(columns=["VehicleID"])

In [17]:
# Separate the target column (y) from the remaining feature columns (X).
target_col = "Amount (Million Naira)"
y = new_train[target_col]
X = new_train.drop(columns=[target_col])

In [18]:
def modelling_tree(algorithm, yy, X, test_df, cv=None):
    """Cross-validate ``algorithm`` and return its fold-averaged test predictions.

    For every split, ``algorithm`` is fitted on the training part, scored on
    the validation part with RMSE, and used to predict ``test_df``. The mean
    RMSE over all splits is printed.

    Parameters
    ----------
    algorithm : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``. The same instance
        is refitted on every split.
    yy : array-like
        Target values; wrapped in a ``pd.Series`` and indexed by position.
    X : pandas.DataFrame
        Feature rows, indexed by position with ``.iloc``.
    test_df : pandas.DataFrame
        Rows to predict with each fitted model.
    cv : object, optional
        Splitter exposing ``split(X, y)`` that yields
        ``(train_index, validation_index)`` pairs. When omitted, the
        module-level ``fold`` splitter is used, as before.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the per-split predictions for ``test_df``.
    """
    splitter = fold if cv is None else cv
    # Build the target Series once instead of on every use inside the loop.
    target = pd.Series(yy)

    cv_score = []
    test_pred = []

    for train_index, val_index in splitter.split(X, target):
        X_trainx, X_val = X.iloc[train_index], X.iloc[val_index]
        y_trainx, y_val = target.iloc[train_index], target.iloc[val_index]

        algorithm.fit(X_trainx, y_trainx)

        # RMSE on the held-out part of this split.
        val_preds = algorithm.predict(X_val)
        cv_score.append(np.sqrt(mean_squared_error(y_val, val_preds)))

        # Keep this split's test predictions for averaging at the end.
        test_pred.append(algorithm.predict(test_df))

    print(f'Validation RMSE Score{np.mean(cv_score)}')

    return np.mean(test_pred, axis=0)

In [20]:
# Cross-validate a CatBoost regressor (logging silenced) and keep its
# fold-averaged predictions for the test rows. The test frame is reindexed to
# X's columns so both frames share the same column order.
cat_model = CatBoostRegressor(silent=True)
test_features = new_test[X.columns]
cat_pred = modelling_tree(
    algorithm=cat_model,
    yy=y,
    X=X,
    test_df=test_features,
)

Validation RMSE Score10.744924111393459


In [24]:
# Build the submission frame: the vehicle identifiers from `test` next to the
# predicted price for each of them. The identifier column is spelled exactly
# like the column it is copied from, 'VehicleID'.
my_sub = pd.DataFrame()
my_sub['VehicleID'] = test['VehicleID']
my_sub['Amount (Million Naira)'] = cat_pred
# Show the frame transposed (one column per vehicle) as the cell output.
my_sub.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2051,2052,2053,2054,2055,2056,2057,2058,2059,2060
VehichleID,VHL18518,VHL17149,VHL10927,VHL12909,VHL12348,VHL10798,VHL11022,VHL12206,VHL11697,VHL12313,...,VHL11288,VHL12337,VHL14268,VHL10015,VHL16136,VHL17903,VHL14018,VHL17473,VHL11480,VHL13881
Amount (Million Naira),6.184953,6.020343,4.715104,4.637041,8.658097,28.75909,2.689506,6.71472,8.739652,6.020343,...,17.477898,7.751321,2.178228,19.937449,4.674146,24.475686,6.629225,6.802435,11.002493,5.017771


In [26]:
# Write the submission to disk; index=False leaves the row index out of the CSV.
my_sub.to_csv("Final_solution.csv", index = False)