In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import joblib

import matplotlib.pyplot


In [2]:
# Read the sedan feature table from the data_clean directory.
file = 'data_clean/sedan_limited_features.csv'
df = pd.read_csv(filepath_or_buffer=file, low_memory=False)

In [3]:
# Fisker Automotive had a number of duplicate entries, so its rows are dropped
# and the index is rebuilt so it stays contiguous.
keep_rows = df['make'] != 'FISKER AUTOMOTIVE'
df = df.loc[keep_rows].reset_index(drop=True)

In [4]:
# Preview the first five rows of the filtered frame.
df.head(n=5)

Unnamed: 0,make,makeID,model,modelID,modelYear,age_in_years,mileage,askPrice,msrp,color
0,FORD,460.0,Taurus,1782.0,2015.0,5.0,67588,16462,16462,Ingot Silver Metallic
1,AUDI,582.0,S8,3678.0,2015.0,5.0,32990,59962,59962,Daytona Gray Pearl Effect
2,CHRYSLER,477.0,300,1878.0,2017.0,3.0,9231,23962,23962,Billet Silver Metallic Clearcoat
3,LINCOLN,464.0,MKZ,1790.0,2016.0,4.0,19710,22962,22962,White Platinum Metallic Tri-Coat
4,FORD,460.0,Fiesta,3267.0,2011.0,9.0,92053,5962,5962,Blue


In [5]:
# Non-null count for every column.
df.count(axis=0)

make            490456
makeID          490456
model           490456
modelID         490456
modelYear       490456
age_in_years    490456
mileage         490456
askPrice        490456
msrp            490456
color           490456
dtype: int64

In [6]:
# Pearson correlation between the numeric columns only. The frame also holds
# string columns ('make', 'model', 'color'); pandas >= 2.0 raises on those
# instead of silently skipping them, so select the numeric columns explicitly.
corr_set = df.select_dtypes(include='number').corr(method='pearson')

corr_set

Unnamed: 0,makeID,modelID,modelYear,age_in_years,mileage,askPrice,msrp
makeID,1.0,0.209932,-0.023344,0.023344,-0.000494,-0.000844,-0.000901
modelID,0.209932,1.0,-0.022483,0.022483,-0.03148,-0.002601,-0.002779
modelYear,-0.023344,-0.022483,1.0,-1.0,-0.666976,0.001471,0.001921
age_in_years,0.023344,0.022483,-1.0,1.0,0.666976,-0.001471,-0.001921
mileage,-0.000494,-0.03148,-0.666976,0.666976,1.0,-0.001945,-0.002289
askPrice,-0.000844,-0.002601,0.001471,-0.001471,-0.001945,1.0,0.938209
msrp,-0.000901,-0.002779,0.001921,-0.001921,-0.002289,0.938209,1.0


# Correlations

#### Observe correlations for all makes

import seaborn as sn
import matplotlib.pyplot as plt

# Annotated correlation heatmap for the whole data set, saved as a PNG.
fig = plt.figure(figsize = (19.20,10.80))

# Only numeric columns can be correlated; pandas >= 2.0 raises on the string
# columns ('make', 'model', 'color') rather than dropping them silently.
corrMatrix = df.select_dtypes(include='number').corr()
sn.heatmap(corrMatrix, annot=True)
plt.savefig('visualizations/sedan_total_corr.png')

When observed as a whole, the correlations between certain features are below significance.

When each make is split into its own group, the correlations are more easily observed.

make_list = df['make'].unique()

# One annotated correlation heatmap per make, each written to its own PNG.
for make in make_list:
    # makeID is constant within a single make, so it carries no information here.
    df_new = df.loc[df['make'] == make].drop(columns=['makeID'])
    fig = plt.figure(figsize=(19.20, 10.80))

    # Restrict to numeric columns: pandas >= 2.0 raises on the remaining
    # string columns ('make', 'model', 'color') instead of skipping them.
    corrMatrix = df_new.select_dtypes(include='number').corr()
    sn.heatmap(corrMatrix, annot=True)
    plt.title(f'{make}')
    plt.savefig(f'visualizations/Sedan_Correlations/sedan_{make}_corr.png')
    # Close this specific figure so dozens of large figures do not pile up in memory.
    plt.close(fig)

With the overall data giving unusable correlations, we may need a separate regression model for each make/body class.

Test the difference between a prediction using the total data and one using a single make's data.

---------------------

# Random Forest Regression

In [None]:
# Turn each make into its own DataFrame so correlations can be observed per
# brand; when the data is observed as a whole, the correlations between any
# useful columns are effectively non-existent.
def _rows_for_make(make_label):
    """Return the rows of ``df`` whose 'make' column equals ``make_label``."""
    return df.loc[df['make'] == make_label]


FORD = _rows_for_make('FORD')
AUDI = _rows_for_make('AUDI')
CHRYSLER = _rows_for_make('CHRYSLER')
LINCOLN = _rows_for_make('LINCOLN')
INFINITI = _rows_for_make('INFINITI')
BUICK = _rows_for_make('BUICK')
NISSAN = _rows_for_make('NISSAN')
CHEVROLET = _rows_for_make('CHEVROLET')
BMW = _rows_for_make('BMW')
TOYOTA = _rows_for_make('TOYOTA')
VOLKSWAGEN = _rows_for_make('VOLKSWAGEN')
HONDA = _rows_for_make('HONDA')
MAZDA = _rows_for_make('MAZDA')
VOLVO = _rows_for_make('VOLVO')
JAGUAR = _rows_for_make('JAGUAR')
DODGE = _rows_for_make('DODGE')
CADILLAC = _rows_for_make('CADILLAC')
MERCURY = _rows_for_make('MERCURY')
HYUNDAI = _rows_for_make('HYUNDAI')
KIA = _rows_for_make('KIA')
# Variable names use underscores; the labels keep the hyphen / space found in the data.
MERCEDES_BENZ = _rows_for_make('MERCEDES-BENZ')
SUBARU = _rows_for_make('SUBARU')
ALFA_ROMEO = _rows_for_make('ALFA ROMEO')
PONTIAC = _rows_for_make('PONTIAC')
SAAB = _rows_for_make('SAAB')
SATURN = _rows_for_make('SATURN')
ACURA = _rows_for_make('ACURA')
LEXUS = _rows_for_make('LEXUS')
MITSUBISHI = _rows_for_make('MITSUBISHI')
MASERATI = _rows_for_make('MASERATI')
PORSCHE = _rows_for_make('PORSCHE')
GENESIS = _rows_for_make('GENESIS')
TESLA = _rows_for_make('TESLA')
SUZUKI = _rows_for_make('SUZUKI')
OLDSMOBILE = _rows_for_make('OLDSMOBILE')
BENTLEY = _rows_for_make('BENTLEY')
PLYMOUTH = _rows_for_make('PLYMOUTH')
ROLLS_ROYCE = _rows_for_make("ROLLS ROYCE")
ASTON_MARTIN = _rows_for_make('ASTON MARTIN')

In [None]:
# Pair every label with its per-make frame once, then derive both parallel
# lists from that single ordered mapping so they cannot drift out of step.
_frames_by_name = {
    'FORD': FORD,
    'AUDI': AUDI,
    'CHRYSLER': CHRYSLER,
    'LINCOLN': LINCOLN,
    'INFINITI': INFINITI,
    'BUICK': BUICK,
    'NISSAN': NISSAN,
    'CHEVROLET': CHEVROLET,
    'BMW': BMW,
    'TOYOTA': TOYOTA,
    'VOLKSWAGEN': VOLKSWAGEN,
    'HONDA': HONDA,
    'MAZDA': MAZDA,
    'VOLVO': VOLVO,
    'JAGUAR': JAGUAR,
    'DODGE': DODGE,
    'CADILLAC': CADILLAC,
    'MERCURY': MERCURY,
    'HYUNDAI': HYUNDAI,
    'KIA': KIA,
    'MERCEDES_BENZ': MERCEDES_BENZ,
    'SUBARU': SUBARU,
    'ALFA_ROMEO': ALFA_ROMEO,
    'PONTIAC': PONTIAC,
    'SAAB': SAAB,
    'SATURN': SATURN,
    'ACURA': ACURA,
    'LEXUS': LEXUS,
    'MITSUBISHI': MITSUBISHI,
    'MASERATI': MASERATI,
    'PORSCHE': PORSCHE,
    'GENESIS': GENESIS,
    'TESLA': TESLA,
    'SUZUKI': SUZUKI,
    'OLDSMOBILE': OLDSMOBILE,
    'BENTLEY': BENTLEY,
    'PLYMOUTH': PLYMOUTH,
    'ROLLS_ROYCE': ROLLS_ROYCE,
    'ASTON_MARTIN': ASTON_MARTIN,
}
df_list = list(_frames_by_name.values())
name_list = list(_frames_by_name.keys())

In [None]:
def RF_rgr_model(depth):
    """Train one random-forest regressor per make and record its scores.

    Walks the module-level ``name_list`` / ``df_list`` pairs. For each make the
    'msrp', 'make' and 'model' columns are dropped, 'askPrice' becomes the
    target, the remaining columns are one-hot encoded, and a
    ``RandomForestRegressor(max_depth=depth)`` is fitted on the train split.
    The goal is the highest score that does not overfit, so both the train and
    the test score are recorded for later comparison.

    Parameters
    ----------
    depth
        Value passed as ``max_depth`` to ``RandomForestRegressor``.

    Returns
    -------
    model_results : dict
        For every make that trained successfully: ``'<name>_model'``,
        ``'<name>_train_y_pred'``, ``'<name>_test_y_pred'``,
        ``'<name>_train_score'`` and ``'<name>_test_score'``. The ``'model'``
        key is kept for existing callers and holds the most recently fitted
        estimator.
    train_score_list, test_score_list : list of float
        One ``model.score`` value per make, in ``df_list`` order. A make whose
        training failed contributes ``nan`` so both lists stay aligned with
        ``name_list``.
    """
    model_results = {}
    train_score_list = []
    test_score_list = []

    for i, (name, df_element) in enumerate(zip(name_list, df_list)):
        try:
            # drop the columns that are not used as features
            df_test = df_element.drop(columns=['msrp', 'make', 'model'])

            # divide the data into features and target
            X = df_test.drop(columns=['askPrice'])
            y = df_test['askPrice']

            # one-hot encode the remaining non-numeric columns
            X_dummies = pd.get_dummies(X, drop_first=True)

            # split for test and train
            X_train, X_test, y_train, y_test = train_test_split(X_dummies, y, random_state=42)

            # build and fit the model
            model = RandomForestRegressor(max_depth=depth, random_state=12)
            model.fit(X_train, y_train)

            # evaluate scores and predictions
            train_predicted = model.predict(X_train)
            test_predicted = model.predict(X_test)

            train_score = model.score(X_train, y_train)
            test_score = model.score(X_test, y_test)
        except Exception as exc:
            # Best effort: one make failing (e.g. too few rows to split) must
            # not stop the others. Keep a placeholder score so the lists stay
            # the same length as name_list, and report the cause instead of
            # hiding it. KeyboardInterrupt is deliberately not caught.
            train_score_list.append(float('nan'))
            test_score_list.append(float('nan'))
            print(f'{name} failed - moving on.. ({exc!r})')
            continue

        # record results; 'model' is overwritten on every pass (kept for
        # existing callers), the per-make key preserves each estimator
        model_results['model'] = model
        model_results[f'{name}_model'] = model

        model_results[f'{name}_train_y_pred'] = train_predicted
        model_results[f'{name}_test_y_pred'] = test_predicted

        model_results[f'{name}_train_score'] = train_score
        model_results[f'{name}_test_score'] = test_score

        train_score_list.append(train_score)
        test_score_list.append(test_score)

        # print status updates to know it is working
        print(f'#{i} of {len(df_list)-1} - {name} complete...')
        print('----------------------------')
        print()
    print('done')
    return model_results, train_score_list, test_score_list

In [None]:
# Per-make forests limited to depth 2.
model_results_D2, train_score_list_D2, test_score_list_D2 = RF_rgr_model(depth=2)

In [None]:
# Per-make forests limited to depth 3.
model_results_D3, train_score_list_D3, test_score_list_D3 = RF_rgr_model(depth=3)

In [None]:
# Per-make forests limited to depth 4.
model_results_D4, train_score_list_D4, test_score_list_D4 = RF_rgr_model(depth=4)

In [None]:
# Per-make forests limited to depth 5.
model_results_D5, train_score_list_D5, test_score_list_D5 = RF_rgr_model(depth=5)

In [None]:
# Per-make forests limited to depth 6.
model_results_D6, train_score_list_D6, test_score_list_D6 = RF_rgr_model(depth=6)

In [None]:
# Per-make forests limited to depth 7.
model_results_D7, train_score_list_D7, test_score_list_D7 = RF_rgr_model(depth=7)

In [None]:
# Inspect everything recorded for the depth-2 run.
model_results_D2

In [None]:
# Side-by-side train/test scores: one row per make, one column pair per tree depth.
results_df = pd.DataFrame({
    "make": name_list,
    'train_score_D2': train_score_list_D2,
    'test_score_D2': test_score_list_D2,

    'train_score_D3': train_score_list_D3,
    'test_score_D3': test_score_list_D3,

    'train_score_D4': train_score_list_D4,
    'test_score_D4': test_score_list_D4,

    'train_score_D5': train_score_list_D5,
    'test_score_D5': test_score_list_D5,

    # The depth-6 columns previously repeated the depth-7 lists by mistake.
    'train_score_D6': train_score_list_D6,
    'test_score_D6': test_score_list_D6,

    'train_score_D7': train_score_list_D7,
    'test_score_D7': test_score_list_D7,

})

results_df

In [None]:
# Persist the estimator stored under the 'model' key of each depth's results.
# The previous version looped over every make but wrote the very same
# estimator to the very same file on each pass, so a single dump per depth
# produces the identical files.
depth_results = {
    2: model_results_D2,
    3: model_results_D3,
    4: model_results_D4,
    5: model_results_D5,
    6: model_results_D6,
    7: model_results_D7,
}

for depth, results in depth_results.items():
    model = results['model']
    joblib.dump(model, f'models/sedan_RF_regression/depth_{depth}_model.joblib')

-----------

In [16]:
# Load the sedan_clean table. Note that this rebinds `df`, replacing the
# limited-feature frame loaded at the top of the notebook.
df = pd.read_csv(filepath_or_buffer='data_clean/sedan_clean.csv')
df

Unnamed: 0,brandName,modelName,vf_ModelID,vf_ModelYear,mileage,askPrice,msrp,color,vf_EngineCylinders,vf_FuelTypePrimary,vin
0,FORD,Taurus,1782.0,2015.0,67588,16462,16462,Ingot Silver Metallic,6.0,Gasoline,85384fd9108f6e9c75d7a538e4ce8a892170f7dbf42264...
1,CHRYSLER,300,1878.0,2017.0,9231,23962,23962,Billet Silver Metallic Clearcoat,6.0,Gasoline,5c45a1254ea832cffc329eebf700acf918682c1b40f9a7...
2,LINCOLN,MKZ,1790.0,2016.0,19710,22962,22962,White Platinum Metallic Tri-Coat,6.0,Gasoline,95592374eecca29c4614b5987b760f9b528e6d3e33c2db...
3,FORD,Fiesta,3267.0,2011.0,92053,5962,5962,Blue,4.0,Gasoline,5558ab8c6833c13904fa4a1fbd45c8d00d02f6db9e53fe...
4,INFINITI,G37,2337.0,2013.0,74917,13962,13962,Vibrant Red,6.0,Gasoline,10e990a77a9615aca1f0e2210513a289fe54a9175a522d...
...,...,...,...,...,...,...,...,...,...,...,...
138313,MERCEDES-BENZ,C-Class,2085.0,2016.0,18322,23999,23999,Polar White,4.0,Gasoline,a5edaa50f3a74293f38ba7611f5be5db8bede213acdf96...
138314,LINCOLN,Town Car,1791.0,2004.0,135839,5900,5900,Medium Steel Blue Clearcoat Metallic,8.0,Gasoline,04a3a8af9f12eaa2b8466dd3588994cfbf333d30bf9eee...
138315,CHEVROLET,Cruze,1832.0,2011.0,98007,5950,5950,Imperial Blue Metallic,4.0,Gasoline,b31d181e2ba2f8a7b56a353a81a55951ad833ab2ea5180...
138316,HONDA,Accord,1861.0,2015.0,44717,18500,18500,Crystal Black Pearl,4.0,Gasoline,6afcb285f492dcb542c720b50f8fa32e57d8fc43ea95b2...


In [18]:
# Strip the 'vf_' prefix from the four prefixed columns; every other column
# already has its final name and is left untouched.
_column_renames = {
    'vf_ModelID': 'modelID',
    'vf_ModelYear': 'modelYear',
    'vf_EngineCylinders': 'engineCylinders',
    'vf_FuelTypePrimary': 'fuelTypePrimary',
}
df = df.rename(columns=_column_renames)

In [25]:
def RF_regr_alldata(depth):
    """Fit a single random-forest regressor on the whole module-level ``df``.

    The 'msrp' and 'vin' columns are dropped, 'askPrice' is used as the
    target, and every remaining column is one-hot encoded (``drop_first=True``)
    before a train/test split.

    Parameters
    ----------
    depth
        Value passed as ``max_depth`` to ``RandomForestRegressor``.

    Returns
    -------
    model_results : dict
        Keys 'model', 'train_y_pred', 'test_y_pred', 'train_score' and
        'test_score'.
    train_score_list, test_score_list : list
        Single-element lists holding the train and test ``model.score`` values.
    """
    # features + target, without the columns that are never used
    working = df.drop(columns=['msrp', 'vin'])

    target = working['askPrice']
    encoded_features = pd.get_dummies(working.drop(columns=['askPrice']), drop_first=True)

    # split for test and train
    X_train, X_test, y_train, y_test = train_test_split(
        encoded_features, target, random_state=42
    )

    # build and fit the forest
    forest = RandomForestRegressor(max_depth=depth, random_state=12)
    forest.fit(X_train, y_train)

    # predictions and scores on both splits
    train_predicted = forest.predict(X_train)
    test_predicted = forest.predict(X_test)
    train_score = forest.score(X_train, y_train, sample_weight=None)
    test_score = forest.score(X_test, y_test, sample_weight=None)

    model_results = {
        'model': forest,
        'train_y_pred': train_predicted,
        'test_y_pred': test_predicted,
        'train_score': train_score,
        'test_score': test_score,
    }
    return model_results, [train_score], [test_score]

In [28]:
# Single forest over all makes, limited to depth 5.
model_results, train_score_list, test_score_list = RF_regr_alldata(depth=5)

In [29]:
# Show the fitted estimator, both prediction arrays and the train/test scores.
model_results

{'model': RandomForestRegressor(max_depth=5, random_state=12),
 'train_y_pred': array([16205.13847434, 16205.13847434, 25841.8764979 , ...,
        16205.13847434, 16205.13847434, 12695.01360104]),
 'test_y_pred': array([17993.29477626,  8984.14488702,  9615.43675097, ...,
        12685.99833404, 16205.13847434, 16205.13847434]),
 'train_score': 0.7230275088792911,
 'test_score': 0.7241415831280362}

In [33]:
# Persist the all-makes depth-5 forest.
model = model_results['model']
joblib.dump(value=model, filename='models/sedan_depth_5_72.joblib')

['models/sedan_depth_5_72.joblib']

# TO_DO

Run the random forest regression models
display the results to a dataframe for comparison
Row = Make score
Col = #_depth

use https://mljar.com/blog/save-load-random-forest/ to help with saving the models

Turn the regression finder into a repeating loop that moves through the tree depths and saves the dictionaries accordingly.

alter layout to return lists instead of dictionaries?


Export lists of the make and model IDs for reference and usage in HTML.

------------------------------------------

Move through the models and build a dictionary where the model name is the key and the value is a dictionary containing the make, makeID, and modelID:

model_lookup = {
    'Taurus' : {
        'make' : 'FORD',
        'makeID' : 460.0,
        'model' : 'Taurus',
        'modelID' : 1782.0,
    }
}

save as JSON object for parsing later?

Conversely, save a DataFrame containing just one row per unique model as a CSV.

# Loading and Testing Model