In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn import linear_model
import itertools

%matplotlib inline

# How Much is Your Car Worth?

Data about the retail price of 2005 General Motors cars can be found in `car_data.csv`.

The columns are:

1. Price: suggested retail price of the used 2005 GM car in excellent condition.
2. Mileage: number of miles the car has been driven
3. Make: manufacturer of the car such as Saturn, Pontiac, and Chevrolet
4. Model: specific models for each car manufacturer such as Ion, Vibe, Cavalier
5. Trim (of car): specific type of car model such as SE Sedan 4D, Quad Coupe 2D          
6. Type: body type such as sedan, coupe, etc.      
7. Cylinder: number of cylinders in the engine        
8. Liter: a more specific measure of engine size     
9. Doors: number of doors           
10. Cruise: indicator variable representing whether the car has cruise control (1 = cruise)
11. Sound: indicator variable representing whether the car has upgraded speakers (1 = upgraded)
12. Leather: indicator variable representing whether the car has leather seats (1 = leather)

## Tasks, Part 1

1. Find the linear regression equation for mileage vs price.
2. Chart the original data and the equation on the chart.
3. Find the equation's $R^2$ score (use the `.score` method) to determine whether the
equation is a good fit for this data. (0.8 and greater is considered a strong correlation.)

## Tasks, Part 2

1. Use mileage, cylinders, liters, doors, cruise, sound, and leather to find the linear regression equation.
2. Find the equation's $R^2$ score (use the `.score` method) to determine whether the
equation is a good fit for this data. (0.8 and greater is considered a strong correlation.)
3. Find the combination of the factors that is the best predictor for price.

## Tasks, Part 3

1. Research dummy variables in scikit-learn to see how to use the make, model, and body type.
2. Find the best combination of factors to predict price.

In [None]:
# Load the raw 2005 GM used-car data; every later cell reads from `df`.
df = pd.read_csv(filepath_or_buffer="car_data.csv")

### Part 1

##### Find the linear regression equation for mileage vs price.

In [None]:
# Fit a one-feature ordinary-least-squares model: Price ~ Mileage.
# Double brackets keep both arrays 2-D, one column each, which is why coef_ and
# intercept_ come back as arrays rather than scalars.
in1 = np.array(df[['Mileage']])
out1 = np.array(df[['Price']])
regr1 = linear_model.LinearRegression()
regr1.fit(in1, out1)

# Pull the single slope and intercept out of their array wrappers so the
# equation prints as plain numbers ("Price = a * Mileage + b") instead of
# nested array reprs with no equals sign.
slope1 = regr1.coef_[0][0]
intercept1 = regr1.intercept_[0]
print('Price =', slope1, '* Mileage +', intercept1)

##### Chart the original data and the equation on the chart.

In [None]:
# Scatter the raw observations and overlay the fitted regression line.
# The line gets an explicit contrasting colour and both artists get legend
# labels: with default styling the line shares the points' colour and is
# hard to pick out against the scatter.
plt.scatter(in1, out1, alpha=0.5, label='Observed')
plt.plot(in1, regr1.predict(in1), color='red', label='Linear fit')
plt.xlabel('Mileage')
plt.ylabel('Price')
plt.title('Mileage vs. Price for used cars')
plt.legend()
plt.show()

##### Find the equation's $R^2$ score (use the `.score` method) to determine whether the equation is a good fit for this data.

In [None]:
# R^2 of the mileage-only model on the data it was fitted to; kept as the bare
# last expression so the notebook displays the value.
regr1.score(X=in1, y=out1)

Mileage by itself is not a good predictor of Price: the $R^2$ score is well below the 0.8 threshold.

### Part 2

##### Use mileage, cylinders, liters, doors, cruise, sound, and leather to find the linear regression equation.

In [None]:
# Fit Price against all seven numeric / indicator features at once.
cols = ['Mileage', 'Cylinder', 'Liter', 'Doors', 'Cruise', 'Sound', 'Leather']
in2 = np.array(df[cols])
out2 = np.array(df[['Price']])
regr2 = linear_model.LinearRegression()
regr2.fit(in2, out2)

# Print the fitted equation as "Price = c1 * f1 + c2 * f2 + ... + intercept".
# The target is 2-D, so coef_ has a single row; pair that row with the feature
# names directly instead of looping over a hard-coded count, so the printout
# stays correct if `cols` is edited.
print('Price =', end=' ')
for weight, feature in zip(regr2.coef_[0], cols):
    print(weight, '*', feature, '+', end=' ')
print(*regr2.intercept_)

##### Find the equation's $R^2$ score (use the `.score` method) to determine whether the equation is a good fit for this data.

In [None]:
# R^2 of the seven-feature model on the data it was fitted to; kept as the bare
# last expression so the notebook displays the value.
regr2.score(X=in2, y=out2)

Even with all of these factors together, the model is not a good predictor of Price: the $R^2$ score is still below the 0.8 threshold.

##### Find the combination of the factors that is the best predictor for price.

In [None]:
# Exhaustively fit one linear model per non-empty subset of the features in
# `cols` and rank the subsets by R^2, best first.
#
# The size range runs to len(cols) inclusive so the full feature set is part of
# the ranking; stopping at range(1, 7) left the seven-feature model out.
#
# NOTE: scores are computed on the same rows the models were fitted to, so this
# ranks in-sample fit only and will tend to favour larger subsets.
price_arr = np.array(df[['Price']])

combos = list(itertools.chain.from_iterable(
    itertools.combinations(cols, sz) for sz in range(1, len(cols) + 1)))
combo_combo = [list(combo) for combo in combos]

regrs = [linear_model.LinearRegression().fit(np.array(df[features]), price_arr)
         for features in combo_combo]

# Score every model exactly once, then sort on that score alone (never on the
# feature lists or estimator objects, which are not meaningfully comparable).
subset_scores = [regr.score(np.array(df[features]), price_arr)
                 for features, regr in zip(combo_combo, regrs)]
ranked = sorted(zip(subset_scores, combo_combo, regrs),
                key=lambda entry: entry[0],
                reverse=True)

# Same shape as before: a list of (feature_list, fitted_model) pairs.
score_sort = [(features, regr) for _, features, regr in ranked]

In [None]:
# Print every ranked feature subset with its R^2 (rounded to four places and
# right-padded with zeros to a fixed width), in the order stored in score_sort.
for features, regr in score_sort:
    r2 = regr.score(np.array(df[features]), np.array(df[['Price']]))
    print(str(round(r2, 4)).ljust(6, '0'), features)

From this we see that using all of the factors together actually gives the closest fit, at .4457, but the best single factor is the number of cylinders at .3239 (followed closely, as could be expected, by the number of liters).

### Part 3

##### Research dummy variables in scikit-learn to see how to use the make, model, and body type.

In [None]:
# One-hot encode the categorical columns so they can be used as regression
# inputs.
#
# - The columns are selected by name rather than by position, so re-running the
#   cell cannot accidentally encode the numeric columns.
# - pd.get_dummies returns a new frame, leaving `df` untouched; `df2 = df`
#   only aliased the original and every change landed on `df` as well.
# - Each indicator column is named "<source column>_<value>" (for example
#   "Make_<value>"), so identical labels in two different source columns cannot
#   overwrite one another.
# - dtype=int keeps the indicators as 0/1 integers.
categorical_cols = ['Make', 'Model', 'Trim', 'Type']
df2 = pd.get_dummies(df, columns=categorical_cols, dtype=int)

In [None]:
# Preview the first rows of the frame holding the dummy (indicator) columns.
df2.head()

In [None]:
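# NOT HAPPENING: the attempt below is left disabled. It would fit one model for
# every subset of the dummy columns -- presumably abandoned because the number
# of subsets grows exponentially with the number of columns.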
#combos = [list(itertools.combinations(df2.columns[2:], sz)) for sz in range(1, len(df2.columns[2:]))]
# combos = list(itertools.chain.from_iterable(combos))
# combo_combo = [list(x) for x in combos]
# regrs = [linear_model.LinearRegression() \
#          .fit(np.array(df[cols]), np.array(df[['Price']]))
#          for cols in combo_combo]
# score_sort = sorted(list(zip(combo_combo, regrs)), key=lambda x: 
#                     x[1].score(np.array(df[x[0]]), 
#                                np.array(df[['Price']])), 
#                     reverse=True)