In [37]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [38]:
# Load the CSV and reduce it to a cleaned, sampled set of Dodge rows in one pipeline
df = (
    pd.read_csv('final project.csv')
    # keep only the Dodge rows
    .loc[lambda frame: frame['make'] == 'Dodge']
    # discard columns that are not carried forward ('make' is constant after the filter)
    .drop(columns=['trim', 'vin', 'seller', 'saledate', 'make'])
    # remove every row that has a null in any remaining column
    .dropna()
    # keep a reproducible 50% sample because of computational limits
    .sample(frac=.5, random_state=42)
    # renumber the rows 0..n-1 after filtering and sampling
    .reset_index(drop=True)
)

df.head()

Unnamed: 0,year,model,body,transmission,state,condition,odometer,color,interior,mmr,sellingprice
0,2012,Avenger,Sedan,automatic,ga,43.0,66978.0,white,black,8050.0,8900.0
1,2007,Dakota,quad cab,automatic,nc,35.0,79673.0,red,gray,10650.0,11000.0
2,2004,Durango,SUV,automatic,pa,22.0,184741.0,blue,gray,2925.0,1400.0
3,2012,Grand Caravan,Minivan,automatic,md,46.0,59159.0,white,black,10600.0,10200.0
4,2012,Grand Caravan,Minivan,automatic,ga,36.0,143026.0,silver,black,6200.0,5700.0


In [39]:
# One-hot encode model, body, transmission, and state.
# 'color' and 'interior' are left as plain columns here.
cleaned_df = pd.get_dummies(
    df,
    columns=['model', 'body', 'transmission', 'state'],
    prefix=['model', 'body', 'transmission', 'state'],
)
cleaned_df.head()

Unnamed: 0,year,condition,odometer,color,interior,mmr,sellingprice,model_Avenger,model_Caliber,model_Caravan,...,state_or,state_pa,state_pr,state_sc,state_tn,state_tx,state_ut,state_va,state_wa,state_wi
0,2012,43.0,66978.0,white,black,8050.0,8900.0,True,False,False,...,False,False,False,False,False,False,False,False,False,False
1,2007,35.0,79673.0,red,gray,10650.0,11000.0,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,2004,22.0,184741.0,blue,gray,2925.0,1400.0,False,False,False,...,False,True,False,False,False,False,False,False,False,False
3,2012,46.0,59159.0,white,black,10600.0,10200.0,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,2012,36.0,143026.0,silver,black,6200.0,5700.0,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [40]:
# Exterior-color variant: remove the interior column, then one-hot encode
# 'color' into columns prefixed with "exterior"
exterior_color_df = pd.get_dummies(
    cleaned_df.drop(columns='interior'),
    columns=['color'],
    prefix=['exterior'],
)
exterior_color_df.head()

Unnamed: 0,year,condition,odometer,mmr,sellingprice,model_Avenger,model_Caliber,model_Caravan,model_Challenger,model_Charger,...,exterior_off-white,exterior_orange,exterior_pink,exterior_purple,exterior_red,exterior_silver,exterior_turquoise,exterior_white,exterior_yellow,exterior_—
0,2012,43.0,66978.0,8050.0,8900.0,True,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
1,2007,35.0,79673.0,10650.0,11000.0,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
2,2004,22.0,184741.0,2925.0,1400.0,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,2012,46.0,59159.0,10600.0,10200.0,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
4,2012,36.0,143026.0,6200.0,5700.0,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False


In [41]:
# Interior-color variant: remove the exterior 'color' column, then one-hot
# encode 'interior' into columns prefixed with "interior"
interior_color_df = pd.get_dummies(
    cleaned_df.drop(columns='color'),
    columns=['interior'],
    prefix=['interior'],
)
interior_color_df.head()

Unnamed: 0,year,condition,odometer,mmr,sellingprice,model_Avenger,model_Caliber,model_Caravan,model_Challenger,model_Charger,...,interior_gold,interior_gray,interior_green,interior_orange,interior_purple,interior_red,interior_silver,interior_tan,interior_white,interior_—
0,2012,43.0,66978.0,8050.0,8900.0,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,2007,35.0,79673.0,10650.0,11000.0,False,False,False,False,False,...,False,True,False,False,False,False,False,False,False,False
2,2004,22.0,184741.0,2925.0,1400.0,False,False,False,False,False,...,False,True,False,False,False,False,False,False,False,False
3,2012,46.0,59159.0,10600.0,10200.0,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,2012,36.0,143026.0,6200.0,5700.0,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [42]:
# Baseline variant: neither the exterior nor the interior color column is kept
no_color_df = cleaned_df.drop(['color', 'interior'], axis='columns')
no_color_df.head()

Unnamed: 0,year,condition,odometer,mmr,sellingprice,model_Avenger,model_Caliber,model_Caravan,model_Challenger,model_Charger,...,state_or,state_pa,state_pr,state_sc,state_tn,state_tx,state_ut,state_va,state_wa,state_wi
0,2012,43.0,66978.0,8050.0,8900.0,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,2007,35.0,79673.0,10650.0,11000.0,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,2004,22.0,184741.0,2925.0,1400.0,False,False,False,False,False,...,False,True,False,False,False,False,False,False,False,False
3,2012,46.0,59159.0,10600.0,10200.0,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,2012,36.0,143026.0,6200.0,5700.0,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [43]:
# Exterior-color model: selling price is the target, every other column is a feature
y_exterior = exterior_color_df['sellingprice']
X_exterior = exterior_color_df.drop(columns='sellingprice')

# Hold out 20% of the rows; the fixed seed makes the split repeatable
(
    X_exterior_train,
    X_exterior_test,
    y_exterior_train,
    y_exterior_test,
) = train_test_split(X_exterior, y_exterior, test_size=0.2, random_state=42)

# Build a 100-tree random forest and fit it on the training portion only
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(
    X_exterior_train, y_exterior_train
)

# Score every row of X_exterior, so rows used for fitting are predicted as well
exterior_pred = rf_model.predict(X_exterior)

In [44]:
# Interior-color model: selling price is the target, every other column is a feature
y_interior = interior_color_df['sellingprice']
X_interior = interior_color_df.drop(columns='sellingprice')

# Hold out 20% of the rows; the fixed seed makes the split repeatable
(
    X_interior_train,
    X_interior_test,
    y_interior_train,
    y_interior_test,
) = train_test_split(X_interior, y_interior, test_size=0.2, random_state=42)

# Build a 100-tree random forest and fit it on the training portion only
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(
    X_interior_train, y_interior_train
)

# Score every row of X_interior, so rows used for fitting are predicted as well
interior_pred = rf_model.predict(X_interior)

In [45]:
# No-color model: selling price is the target, every other column is a feature
y_no = no_color_df['sellingprice']
X_no = no_color_df.drop(columns='sellingprice')

# Hold out 20% of the rows; the fixed seed makes the split repeatable
(
    X_no_train,
    X_no_test,
    y_no_train,
    y_no_test,
) = train_test_split(X_no, y_no, test_size=0.2, random_state=42)

# Build a 100-tree random forest and fit it on the training portion only
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(
    X_no_train, y_no_train
)

# Score every row of X_no, so rows used for fitting are predicted as well
no_pred = rf_model.predict(X_no)

In [46]:
# Build the results frame: the cleaned Dodge data plus one prediction column
# per model (exterior color, interior color, no color).
#
# A copy is taken so that `df` itself is left untouched. With a plain
# `final_df = df` both names refer to the same object, so the prediction
# columns would be written into `df`; re-running the one-hot-encoding cell
# afterwards would then carry those predictions into the feature matrices,
# because only 'sellingprice' is dropped before fitting.
final_df = df.copy()

# Each prediction array has one value per row of `df`, in the same row order
final_df['ext color prediction'] = exterior_pred
final_df['int color prediction'] = interior_pred
final_df['exclusion of color prediction'] = no_pred

final_df

Unnamed: 0,year,model,body,transmission,state,condition,odometer,color,interior,mmr,sellingprice,ext color prediction,int color prediction,exclusion of color prediction
0,2012,Avenger,Sedan,automatic,ga,43.0,66978.0,white,black,8050.0,8900.0,8564.0,8557.02,8588.0
1,2007,Dakota,quad cab,automatic,nc,35.0,79673.0,red,gray,10650.0,11000.0,11121.0,11159.01,11111.0
2,2004,Durango,SUV,automatic,pa,22.0,184741.0,blue,gray,2925.0,1400.0,1662.5,1661.00,1677.5
3,2012,Grand Caravan,Minivan,automatic,md,46.0,59159.0,white,black,10600.0,10200.0,10530.0,10546.00,10511.0
4,2012,Grand Caravan,Minivan,automatic,ga,36.0,143026.0,silver,black,6200.0,5700.0,6057.0,5968.00,6021.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13585,2012,Charger,Sedan,automatic,ma,37.0,26961.0,black,black,16400.0,15000.0,15595.0,15637.00,15613.0
13586,2014,Charger,Sedan,automatic,tx,5.0,46989.0,white,black,15300.0,17750.0,14763.0,14740.00,14779.0
13587,2008,Charger,Sedan,automatic,tx,35.0,113846.0,black,gray,6450.0,6600.0,6578.0,6614.00,6568.0
13588,2006,Charger,Sedan,automatic,ca,35.0,174401.0,gray,gray,4575.0,4200.0,4645.0,4655.00,4857.0


In [47]:
# Write the prediction frame to disk as CSV, leaving out the row index
final_df.to_csv(path_or_buf='final_predictions_dodge.csv', index=False)