In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [2]:
# Load the prepared car-price data
# Read the CSV file into a pandas DataFrame. No index column is specified,
# so "year" stays an ordinary column rather than becoming the index.
df = pd.read_csv(filepath_or_buffer="Resources/model_car_prices.csv")

# Preview the first five rows of the DataFrame
df.head(n=5)

Unnamed: 0,odometer,mmr,year,make,body,color,interior,sellingprice,automatic
0,-0.978595,0.726397,2015,24,6,17,1,21500.0,True
1,-1.117537,0.758191,2015,24,6,17,0,21500.0,True
2,-1.272126,1.93458,2014,3,7,8,1,30000.0,True
3,-1.02379,1.468264,2015,52,7,17,1,27750.0,True
4,-1.247006,5.548533,2014,3,7,8,1,67000.0,True


In [8]:
# Build the feature matrix: a copy of df without the target column
# ("sellingprice") and without the "automatic" column.
X = df.copy().drop(["sellingprice", "automatic"], axis="columns")
X.head()

Unnamed: 0,odometer,mmr,year,make,body,color,interior
0,-0.978595,0.726397,2015,24,6,17,1
1,-1.117537,0.758191,2015,24,6,17,0
2,-1.272126,1.93458,2014,3,7,8,1
3,-1.02379,1.468264,2015,52,7,17,1
4,-1.247006,5.548533,2014,3,7,8,1


In [9]:
# Extract the target ("sellingprice") as a NumPy array reshaped to a single column
y = df["sellingprice"].to_numpy().reshape(-1, 1)
y[:5]

array([[21500.],
       [21500.],
       [30000.],
       [27750.],
       [67000.]])

In [10]:
# Partition the features and target into training and testing subsets.
# No test_size/train_size is passed, so sklearn's default proportions apply;
# random_state=42 pins the shuffle so the split can be reproduced.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [11]:
import statsmodels.api as sm

# Fit an ordinary least squares regression of y_train on X_train with statsmodels.
# NOTE(review): X_train is passed as-is, with no constant column added here, so
# the fit has an intercept only if X already contains one — confirm this is intended.
lr = sm.OLS(endog=y_train, exog=X_train).fit()

In [13]:
# Collect the fitted model's per-column p-values, ordered from smallest to largest
p_values = lr.pvalues.sort_values(ascending=True)
p_values

mmr         0.000000e+00
year        0.000000e+00
color       7.422987e-79
odometer    1.167371e-32
body        8.473342e-19
make        3.889884e-04
interior    8.995330e-03
dtype: float64