# Building a fastai Model

In [1]:
# Capture the notebook's current working directory (it is displayed in the next cell)
import os
wd = os.getcwd()

In [2]:
# Display the working directory captured above
wd

'/Users/amalnair/Documents/Amal-WorkSpace/UCC_Project/resources'

In [3]:
# Setting The Home Directory for the project: the parent of the working
# directory, kept with a trailing path separator. Derived from the path
# structure rather than by slicing off a fixed number of characters, so it
# does not depend on the working folder's name being exactly 9 characters long.
home_dir = os.path.join(os.path.dirname(wd), '')

In [4]:
# Display the project home directory
home_dir

'/Users/amalnair/Documents/Amal-WorkSpace/UCC_Project/'

In [5]:
# Show the contents of the project home directory, then of the working directory
for heading, folder in (('HOME DIRECTORY', home_dir), ('WORKING DIRECTORY', wd)):
  print(heading)
  print(os.listdir(folder))

HOME DIRECTORY
['.DS_Store', 'requirements.txt', 'bin', 'include', 'resources', 'model', 'app.py', 'lib', 'templates']
WORKING DIRECTORY
['Data_Train.xlsx', 'modeling.ipynb', 'Data_Test.xlsx', '.ipynb_checkpoints']


## Loading The Datasets

In [6]:
import pandas as pd
import numpy as np
# Read the training spreadsheet that sits in the working directory
training_set = pd.read_excel(f'{wd}/Data_Train.xlsx')

In [7]:
# Preview the first five rows of the raw training data
training_set.head(5)

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price,Price
0,Maruti Wagon R LXI CNG,Mumbai,2010,72000,CNG,Manual,First,26.6 km/kg,998 CC,58.16 bhp,5.0,,1.75
1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67 kmpl,1582 CC,126.2 bhp,5.0,,12.5
2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.2 kmpl,1199 CC,88.7 bhp,5.0,8.61 Lakh,4.5
3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77 kmpl,1248 CC,88.76 bhp,7.0,,6.0
4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2 kmpl,1968 CC,140.8 bhp,5.0,,17.74


## Structuring & Formatting The Datasets

In [8]:
def _leading_number(value, cast=float):
  """Return ``cast`` applied to the first space-separated token of ``value``.

  Non-string input (e.g. a missing cell) or a token that cannot be converted
  (e.g. ``'null bhp'``) yields ``np.nan``.
  """
  if not isinstance(value, str):
    return np.nan
  try:
    return cast(value.split(" ")[0].strip())
  except ValueError:
    return np.nan


def _new_price_in_lakh(value):
  """Convert one raw ``New_Price`` entry to a float.

  * missing value          -> 0.0
  * ``'<number> Cr'``      -> number * 100
  * ``'<number> Lakh'``    -> number
  * already numeric        -> float(value)
  * anything else          -> np.nan (unrecognised unit or unparsable number)
  """
  if isinstance(value, str):
    try:
      amount = float(value.split()[0].strip())
    except (ValueError, IndexError):
      return np.nan
    if 'Cr' in value:
      return amount * 100
    if 'Lakh' in value:
      return amount
    return np.nan

  if pd.isna(value):
    return 0.0

  try:
    return float(value)
  except (TypeError, ValueError):
    return np.nan


def restructure(data):
  """Build a cleaned, re-ordered copy of the raw car-listing frame.

  Parameters
  ----------
  data : pandas.DataFrame
    Must contain the columns Name, Location, Year, Kilometers_Driven,
    Fuel_Type, Transmission, Owner_Type, Mileage, Engine, Power, Seats and
    New_Price. A Price column is optional.

  Returns
  -------
  pandas.DataFrame
    New frame with Name split into Brand (first word) and Model (the rest),
    the leading number extracted from Mileage (float), Engine (int) and
    Power (float), New_Price converted by ``_new_price_in_lakh``, and Price
    copied over when present. Values that cannot be parsed become NaN.

  The input frame is not modified.
  """
  brand = []
  model = []

  for i, name in enumerate(data['Name']):
    if isinstance(name, str):
      parts = name.split(" ")
      brand.append(parts[0])
      model.append(" ".join(parts[1:]).strip())
    else:
      # Keep one entry per row so Brand/Model stay aligned with the other
      # columns even when a name is missing.
      print("ERR ! - ", name, "@" , i)
      brand.append(np.nan)
      model.append(np.nan)

  mileage = [_leading_number(value, float) for value in data['Mileage']]
  engine = [_leading_number(value, int) for value in data['Engine']]
  power = [_leading_number(value, float) for value in data['Power']]

  # Missing prices are handled per value instead of via an in-place fillna,
  # so the caller's frame is left untouched.
  newp = [_new_price_in_lakh(value) for value in data['New_Price']]

  #Re-ordering the columns
  restructured = pd.DataFrame({'Brand': brand,
                              'Model':model,
                              'Location': data['Location'], 
                              'Year':data['Year'] , 
                              'Kilometers_Driven':data['Kilometers_Driven'],
                              'Fuel_Type':data['Fuel_Type'],
                              'Transmission':data['Transmission'],
                              'Owner_Type':data['Owner_Type'],
                              'Mileage':mileage,
                              'Engine':engine,
                              'Power':power,
                              'Seats':data['Seats'],
                              'New_Price':newp
                             })

  # The target is only present in the training data
  if 'Price' in data.columns:
    restructured['Price'] = data['Price']

  return restructured

In [9]:
# Clean and re-order the raw training data
train_d = restructure(training_set)


### Selecting a Few Features

In [10]:
# Columns kept for modeling: eight predictors plus the target, Price
cols = ['Brand', 'Location', 'Year', 'Kilometers_Driven', 'Fuel_Type',
       'Transmission', 'Owner_Type', 'Mileage', 'Price']

In [11]:
# Keep only the selected columns (train_d is rebound to the narrower frame)
train_d = train_d[cols]

In [12]:
# Check dtypes and non-null counts of the selected columns
train_d.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6019 entries, 0 to 6018
Data columns (total 9 columns):
Brand                6019 non-null object
Location             6019 non-null object
Year                 6019 non-null int64
Kilometers_Driven    6019 non-null int64
Fuel_Type            6019 non-null object
Transmission         6019 non-null object
Owner_Type           6019 non-null object
Mileage              6017 non-null float64
Price                6019 non-null float64
dtypes: float64(2), int64(2), object(5)
memory usage: 423.3+ KB


## Modeling With Fast.ai


In [13]:
from fastai.tabular import *
#Root path handed to fastai when building the data; saving and exporting the model write relative to it
path = wd

In [14]:
# Target column the network is trained to predict
dep_var = 'Price'

# Categorical features: every object-dtype column of train_d
cat_names = train_d.select_dtypes('object').columns.tolist()

# Continuous features (the dependent variable, Price, is deliberately left out)
cont_names = ['Year', 'Kilometers_Driven', 'Mileage']

# Preprocessing steps for the fastai learner
procs = [FillMissing, Categorify, Normalize]

In [15]:
#Rows 800-999 of train_d, wrapped as a TabularList. Despite the name, this is attached to the DataBunch
#as its *test* set via add_test; the validation split is defined separately with split_by_idx.
#NOTE(review): these rows also remain in train_d, so this is not unseen data - confirm that is intended.
val = TabularList.from_df(train_d.iloc[800:1000].copy(), path=path, cat_names=cat_names, cont_names=cont_names)

In [16]:
#Building the DataBunch: the last 20% of the rows (by position) form the validation split
n_rows = len(train_d)
valid_idx = list(range(n_rows - int(n_rows * 0.2), n_rows))

data = (TabularList.from_df(train_d, path=path, cat_names=cat_names, cont_names=cont_names, procs=procs)
                   .split_by_idx(valid_idx)
                   .label_from_df(cols=dep_var)
                   .add_test(val)
                   .databunch())

In [17]:
# Display five rows of one processed batch
data.show_batch(5)

Brand,Location,Fuel_Type,Transmission,Owner_Type,Mileage_na,Year,Kilometers_Driven,Mileage,target
Volkswagen,Delhi,Diesel,Manual,Second,False,-0.4138,0.1472,0.8758,2.65
Ford,Kochi,Petrol,Manual,First,False,-1.3381,0.0725,-0.9393,1.65
Hyundai,Bangalore,Diesel,Manual,First,False,0.2025,-0.3001,0.9789,8.35
Renault,Pune,Petrol,Manual,First,False,-0.1056,-0.3895,-1.0622,6.0
Maruti,Mumbai,Petrol,Manual,First,False,-0.7219,-0.0111,-0.1273,2.95


### Initializing Neural Network

In [18]:
# Tabular learner with hidden layers of sizes 300, 100, 100 and 50; RMSE is the reported metric
hidden_layers = [300, 100, 100, 50]
learn = tabular_learner(data, layers=hidden_layers, metrics=rmse)

### Training The Model

In [19]:
# Train for 25 epochs with a learning rate of 1e-2
# NOTE(review): the saved output below lists only epochs 0-9 - re-run the cell to refresh it
learn.fit(25, 1e-2)

epoch,train_loss,valid_loss,root_mean_squared_error,time
0,55.471222,36.102558,5.426857,00:01
1,33.489296,35.301064,5.212431,00:00
2,31.670364,37.889015,5.400799,00:00
3,29.258881,35.724693,5.329782,00:00
4,27.429525,33.397991,5.02961,00:00
5,26.536196,33.372677,5.031393,00:00
6,26.760942,35.821808,5.235147,00:00
7,26.035761,29.993044,4.786624,00:00
8,26.381557,30.354549,4.830415,00:00
9,24.374403,33.764561,5.071315,00:00


In [20]:
# Compare targets with predictions on rows from the training set
learn.show_results(ds_type=DatasetType.Train)

Brand,Location,Fuel_Type,Transmission,Owner_Type,Mileage_na,Year,Kilometers_Driven,Mileage,target,prediction
Hyundai,Hyderabad,Diesel,Manual,First,False,-0.4138,0.1209,0.8384,3.95,[5.786664]
Maruti,Pune,Diesel,Manual,Fourth & Above,False,-1.3381,0.2068,-0.0614,2.15,[4.262152]
Audi,Kochi,Diesel,Automatic,First,False,0.5106,-0.4526,-0.2128,21.43,[22.643032]
Hyundai,Bangalore,Petrol,Manual,First,False,0.2025,-0.1244,-0.237,7.75,[6.077044]
BMW,Mumbai,Diesel,Automatic,Second,False,-1.6463,0.2167,0.0878,10.5,[11.230114]


In [21]:
# Compare targets with predictions on rows from the validation set
learn.show_results(ds_type=DatasetType.Valid)

Brand,Location,Fuel_Type,Transmission,Owner_Type,Mileage_na,Year,Kilometers_Driven,Mileage,target,prediction
BMW,Delhi,Petrol,Automatic,Second,False,-2.2625,-0.1112,-1.4178,6.99,[9.390251]
Hyundai,Coimbatore,Diesel,Manual,First,False,1.7431,-0.1722,1.014,15.57,[10.457531]
Tata,Coimbatore,Diesel,Manual,First,False,0.5106,0.2558,-0.6101,5.29,[8.295876]
Datsun,Kolkata,Petrol,Manual,First,False,0.8188,-0.4392,1.014,2.25,[2.160599]
BMW,Chennai,Diesel,Automatic,First,False,-1.3381,0.5049,-1.4002,20.0,[15.827377]


## Saving & Exporting The Model

In [22]:
# Save the model as 'model' and return the path of the written file (a .pth under <path>/models/)
learn.save('model',return_path=True)

PosixPath('/Users/amalnair/Documents/Amal-WorkSpace/UCC_Project/resources/models/model.pth')

In [23]:
# Export the learner to 'model.pkl' (presumably under `path`, for loading at inference time - confirm)
learn.export('model.pkl')