In [2]:
import numpy as np
%matplotlib inline
import pandas as pd
import matplotlib.pyplot
from statsmodels.api import OLS
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [3]:
#READ DATA IN THRU PANDAS
cars = pd.read_csv('CARS.csv')

In [4]:
#DETERINE HOW MANY ROWS & DIMENSIONS ARE IN THE DATA
cars.shape

(428, 15)

In [5]:
#TAKE A PEEK AT THE DATA
cars.head()

Unnamed: 0,Make,Model,Type,Origin,DriveTrain,MSRP,Invoice,EngineSize,Cylinders,Horsepower,MPG_City,MPG_Highway,Weight,Wheelbase,Length
0,Acura,MDX,SUV,Asia,All,"$36,945","$33,337",3.5,6.0,265,17,23,4451,106,189
1,Acura,RSX Type S 2dr,Sedan,Asia,Front,"$23,820","$21,761",2.0,4.0,200,24,31,2778,101,172
2,Acura,TSX 4dr,Sedan,Asia,Front,"$26,990","$24,647",2.4,4.0,200,22,29,3230,105,183
3,Acura,TL 4dr,Sedan,Asia,Front,"$33,195","$30,299",3.2,6.0,270,20,28,3575,108,186
4,Acura,3.5 RL 4dr,Sedan,Asia,Front,"$43,755","$39,014",3.5,6.0,225,18,24,3880,115,197


In [6]:
#SEE THE LIST ALL ALL DIMENSIONS W/ DATA TYPE INSIGHTS IN ONE SNAPSHOT
cars.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 428 entries, 0 to 427
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Make         428 non-null    object 
 1   Model        428 non-null    object 
 2   Type         428 non-null    object 
 3   Origin       428 non-null    object 
 4   DriveTrain   428 non-null    object 
 5   MSRP         428 non-null    object 
 6   Invoice      428 non-null    object 
 7   EngineSize   428 non-null    float64
 8   Cylinders    426 non-null    float64
 9   Horsepower   428 non-null    int64  
 10  MPG_City     428 non-null    int64  
 11  MPG_Highway  428 non-null    int64  
 12  Weight       428 non-null    int64  
 13  Wheelbase    428 non-null    int64  
 14  Length       428 non-null    int64  
dtypes: float64(2), int64(6), object(7)
memory usage: 50.3+ KB


In [7]:
# SEE THE DESCRIPTIVE STATISTICS WITHIN FILE DIMENSIONS
cars.describe()

Unnamed: 0,EngineSize,Cylinders,Horsepower,MPG_City,MPG_Highway,Weight,Wheelbase,Length
count,428.0,426.0,428.0,428.0,428.0,428.0,428.0,428.0
mean,3.196729,5.807512,215.885514,20.060748,26.843458,3577.953271,108.154206,186.36215
std,1.108595,1.558443,71.836032,5.238218,5.741201,758.983215,8.311813,14.357991
min,1.3,3.0,73.0,10.0,12.0,1850.0,89.0,143.0
25%,2.375,4.0,165.0,17.0,24.0,3104.0,103.0,178.0
50%,3.0,6.0,210.0,19.0,26.0,3474.5,107.0,187.0
75%,3.9,6.0,255.0,21.25,29.0,3977.75,112.0,194.0
max,8.3,12.0,500.0,60.0,66.0,7190.0,144.0,238.0


In [None]:
########################################################################### 
                        
                            #DATA CLEANING#

########################################################################### 

In [8]:
#REMOVE UNNECESSARY FIELDS FROM FILE 
cars.drop(['Length', 'Wheelbase', 'Weight', 'Model'], axis = 1)

Unnamed: 0,Make,Type,Origin,DriveTrain,MSRP,Invoice,EngineSize,Cylinders,Horsepower,MPG_City,MPG_Highway
0,Acura,SUV,Asia,All,"$36,945","$33,337",3.5,6.0,265,17,23
1,Acura,Sedan,Asia,Front,"$23,820","$21,761",2.0,4.0,200,24,31
2,Acura,Sedan,Asia,Front,"$26,990","$24,647",2.4,4.0,200,22,29
3,Acura,Sedan,Asia,Front,"$33,195","$30,299",3.2,6.0,270,20,28
4,Acura,Sedan,Asia,Front,"$43,755","$39,014",3.5,6.0,225,18,24
...,...,...,...,...,...,...,...,...,...,...,...
423,Volvo,Sedan,Europe,Front,"$40,565","$38,203",2.4,5.0,197,21,28
424,Volvo,Sedan,Europe,Front,"$42,565","$40,083",2.3,5.0,242,20,26
425,Volvo,Sedan,Europe,Front,"$45,210","$42,573",2.9,6.0,268,19,26
426,Volvo,Wagon,Europe,Front,"$26,135","$24,641",1.9,4.0,170,22,29


In [9]:
#REMOVE SPECIAL CHARACTERS/TRANFORM DATA-TYPES && SET DATA TYPE AS INT FOR QUANTITATIVE STUDY
cars['MSRP'] = cars['MSRP'].str.replace('$','')
cars['MSRP'] = cars['MSRP'].str.replace(',','')

cars['Invoice'] = cars['Invoice'].str.replace('$','')
cars['Invoice'] = cars['Invoice'].str.replace(',','')


pd.to_numeric(cars['MSRP'])
pd.to_numeric(cars['Invoice'])

0      33337
1      21761
2      24647
3      30299
4      39014
       ...  
423    38203
424    40083
425    42573
426    24641
427    33112
Name: Invoice, Length: 428, dtype: int64

In [25]:
#CREATE DUMMY VARIABLES OF ORIGIN, TYPE, DRIVETRAIN ** why should i keep make **

In [12]:
pd.get_dummies(cars, columns=['Origin'])

Unnamed: 0,Make,Model,Type,DriveTrain,MSRP,Invoice,EngineSize,Cylinders,Horsepower,MPG_City,MPG_Highway,Weight,Wheelbase,Length,Origin_Asia,Origin_Europe,Origin_USA
0,Acura,MDX,SUV,All,36945,33337,3.5,6.0,265,17,23,4451,106,189,1,0,0
1,Acura,RSX Type S 2dr,Sedan,Front,23820,21761,2.0,4.0,200,24,31,2778,101,172,1,0,0
2,Acura,TSX 4dr,Sedan,Front,26990,24647,2.4,4.0,200,22,29,3230,105,183,1,0,0
3,Acura,TL 4dr,Sedan,Front,33195,30299,3.2,6.0,270,20,28,3575,108,186,1,0,0
4,Acura,3.5 RL 4dr,Sedan,Front,43755,39014,3.5,6.0,225,18,24,3880,115,197,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
423,Volvo,C70 LPT convertible 2dr,Sedan,Front,40565,38203,2.4,5.0,197,21,28,3450,105,186,0,1,0
424,Volvo,C70 HPT convertible 2dr,Sedan,Front,42565,40083,2.3,5.0,242,20,26,3450,105,186,0,1,0
425,Volvo,S80 T6 4dr,Sedan,Front,45210,42573,2.9,6.0,268,19,26,3653,110,190,0,1,0
426,Volvo,V40,Wagon,Front,26135,24641,1.9,4.0,170,22,29,2822,101,180,0,1,0


In [13]:
pd.get_dummies(cars, columns=['Type'])

Unnamed: 0,Make,Model,Origin,DriveTrain,MSRP,Invoice,EngineSize,Cylinders,Horsepower,MPG_City,MPG_Highway,Weight,Wheelbase,Length,Type_Hybrid,Type_SUV,Type_Sedan,Type_Sports,Type_Truck,Type_Wagon
0,Acura,MDX,Asia,All,36945,33337,3.5,6.0,265,17,23,4451,106,189,0,1,0,0,0,0
1,Acura,RSX Type S 2dr,Asia,Front,23820,21761,2.0,4.0,200,24,31,2778,101,172,0,0,1,0,0,0
2,Acura,TSX 4dr,Asia,Front,26990,24647,2.4,4.0,200,22,29,3230,105,183,0,0,1,0,0,0
3,Acura,TL 4dr,Asia,Front,33195,30299,3.2,6.0,270,20,28,3575,108,186,0,0,1,0,0,0
4,Acura,3.5 RL 4dr,Asia,Front,43755,39014,3.5,6.0,225,18,24,3880,115,197,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
423,Volvo,C70 LPT convertible 2dr,Europe,Front,40565,38203,2.4,5.0,197,21,28,3450,105,186,0,0,1,0,0,0
424,Volvo,C70 HPT convertible 2dr,Europe,Front,42565,40083,2.3,5.0,242,20,26,3450,105,186,0,0,1,0,0,0
425,Volvo,S80 T6 4dr,Europe,Front,45210,42573,2.9,6.0,268,19,26,3653,110,190,0,0,1,0,0,0
426,Volvo,V40,Europe,Front,26135,24641,1.9,4.0,170,22,29,2822,101,180,0,0,0,0,0,1


In [14]:
pd.get_dummies(cars, columns=['DriveTrain'])

Unnamed: 0,Make,Model,Type,Origin,MSRP,Invoice,EngineSize,Cylinders,Horsepower,MPG_City,MPG_Highway,Weight,Wheelbase,Length,DriveTrain_All,DriveTrain_Front,DriveTrain_Rear
0,Acura,MDX,SUV,Asia,36945,33337,3.5,6.0,265,17,23,4451,106,189,1,0,0
1,Acura,RSX Type S 2dr,Sedan,Asia,23820,21761,2.0,4.0,200,24,31,2778,101,172,0,1,0
2,Acura,TSX 4dr,Sedan,Asia,26990,24647,2.4,4.0,200,22,29,3230,105,183,0,1,0
3,Acura,TL 4dr,Sedan,Asia,33195,30299,3.2,6.0,270,20,28,3575,108,186,0,1,0
4,Acura,3.5 RL 4dr,Sedan,Asia,43755,39014,3.5,6.0,225,18,24,3880,115,197,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
423,Volvo,C70 LPT convertible 2dr,Sedan,Europe,40565,38203,2.4,5.0,197,21,28,3450,105,186,0,1,0
424,Volvo,C70 HPT convertible 2dr,Sedan,Europe,42565,40083,2.3,5.0,242,20,26,3450,105,186,0,1,0
425,Volvo,S80 T6 4dr,Sedan,Europe,45210,42573,2.9,6.0,268,19,26,3653,110,190,0,1,0
426,Volvo,V40,Wagon,Europe,26135,24641,1.9,4.0,170,22,29,2822,101,180,0,1,0
