### Step 1: Import libraries and CSV File


In [10]:
# Import the required libraries
import pandas as pd
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import metrics
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.metrics import mean_squared_error,r2_score, mean_absolute_error

from statsmodels.stats.outliers_influence import variance_inflation_factor 
import statsmodels.api as sm

import warnings
warnings.filterwarnings('ignore')

In [11]:
# Load the raw car-sales records from the Resources folder (path is relative
# to the directory the notebook is run from)
car_prices_df = pd.read_csv(Path("Resources") / "car_prices.csv")

# Preview the first rows to confirm the load worked
car_prices_df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'Resources/car_prices.csv'

In [None]:
# Check for Null Values
car_prices_df.isnull().sum()

In [None]:
car_prices_df.info()

In [None]:
# Split the raw frame into model inputs (X) and the value to predict (y)

# Features: every column except the selling price
X = car_prices_df.drop("sellingprice", axis=1)

# Target: the selling price
y = car_prices_df["sellingprice"]

In [None]:
# Create a distribution plot for target variable 'sellingprice'
#sns.distplot(car_prices_df['sellingprice']);

# print 'Skewness' and 'Kurtosis'
#print("Skewness: %f" % car_prices_df['sellingprice'].skew())
#print("Kurtosis: %f" % car_prices_df['sellingprice'].kurt())

In [None]:
# Correlation heatmap of the numeric columns in the raw data.
# The frame still contains text columns at this point, and DataFrame.corr()
# raises on non-numeric data in pandas >= 2.0, so restrict to numeric columns
# explicitly before computing the correlations.
plt.figure(figsize=(7, 5))
sns.heatmap(
    car_prices_df.select_dtypes(include="number").corr(),
    annot=True,
    fmt='.2f',
    cmap='gist_heat',
    linewidths=1,
    vmin=-1,
    vmax=1,
)
plt.title('Correlation Heatmap')

plt.show()

In [None]:
# Dummy-encode the selected columns of the raw frame (drop_first=True drops the
# first level of each encoded column).
# NOTE(review): no active cell in this notebook reads X_encoded — the VIF code
# below that used it is commented out. Confirm whether this is still needed.
X_encoded = pd.get_dummies(car_prices_df[['year', 'make', 'model', 'trim', 'body', 'transmission', 'state', 'color',
                                        'interior', 'seller']], drop_first=True)

#Create the VIF dataframe
#vif_data = pd.DataFrame() 
#vif_data["feature"] = X_encoded.columns 

# Calculate VIF for each feature 
#vif_data["VIF"] = [variance_inflation_factor(X_encoded.values, i) 
                  # for i in range(len(X_encoded.columns))] 

#print(vif_data) -(encoder form sklearn)

In [None]:
#car_prices_df["make"] = car_prices_df["make"].fillna("0")
#car_prices_df.head(
# Columns to place on the x-axis, one grid column per entry
x_vars= ["year", "odometer", "trim", "body", "transmission","state", "condition", "color"]
# Single grid row: selling price on the y-axis
y_vars= ["sellingprice"]
# Build the grid, colouring points by the "mmr" column
# NOTE(review): "mmr" looks like a continuous value; using it as hue may be slow
# and produce a very large legend — confirm this is intended.
g= sns.PairGrid(car_prices_df, hue="mmr", x_vars=x_vars, y_vars=y_vars)
# NOTE(review): x_vars and y_vars share no column, so there appear to be no
# diagonal cells for map_diag to draw into — confirm.
g.map_diag(sns.histplot, color=".3")
# Scatter plot in every non-diagonal cell
g.map_offdiag(sns.scatterplot)
g.add_legend();

In [None]:
# Box plot of the selling-price distribution for each model year
sns.catplot(
    x="year",
    y="sellingprice",
    kind="box",
    data=car_prices_df,
)

In [None]:
#make_pp= car_prices_df.make.value_counts()[:10]
#make_pp.index

In [None]:
#tencar_df.make.value_counts() , tencar_df.shape

In [None]:
# Create a pairplot using Seaborn
sns.pairplot(car_prices_df, x_vars=["sellingprice"], y_vars=["mmr", "condition", "odometer"])

# Display the pairplot
plt.show()

In [None]:
# Create a pairplot using the variables "interior" and "color" using Seaborn
sns.pairplot(car_prices_df, x_vars=["sellingprice"], y_vars=["color", "interior"], hue="color", markers="o")

#Display the pairplot
plt.show()

In [None]:
# Bar chart of how many listings use each transmission type
sns.countplot(x="transmission", data=car_prices_df)
plt.title("Transmission Types")
# The original wrote `plt.show;`, which only references the function and never
# calls it; call it so the figure is rendered explicitly.
plt.show()

In [None]:
# Plot selling price against make, with one colour per make
sns.pairplot(
    data=car_prices_df,
    y_vars=["make"],
    x_vars=["sellingprice"],
    hue="make",
    markers="o",
)
# Render the figure
plt.show()

# TODO: rework this plot so it is easier to read

In [None]:
#converting 'year' to categorical data type and creating 'year_codes'
#car_prices_df['year'] = car_prices_df['year'].astype('category')

# create a new column and assign numerical values to each category
#car_prices_df['year_codes'] = car_prices_df['year'].cat.codes + 1

# Treat the model year as a categorical variable
car_prices_df['year'] = car_prices_df['year'].astype('category')

# Number the year categories 1..N, following the category order
year_mapping = {
    year_value: position
    for position, year_value in enumerate(car_prices_df['year'].cat.categories, start=1)
}

# Store the numeric code for each row's year in a new 'year_codes' column
car_prices_df['year_codes'] = car_prices_df['year'].map(year_mapping)


In [None]:
# Column dtypes and non-null counts after adding the year columns
car_prices_df.info()

In [None]:
# Preview the first 25 rows
car_prices_df.head(25)

### Preprocessing the Data

In [None]:
# Reload the raw file, discard the 'vin' and 'seller' columns, then drop every
# row that still has a missing value
car_prices_df = (
    pd.read_csv(Path("Resources/car_prices.csv"))
    .drop(columns=['vin', 'seller'])
    .dropna()
)
# Preview the cleaned frame
car_prices_df.head()

In [None]:
make_pp= car_prices_df.make.value_counts()[:10]
make_pp.index


In [None]:
tencar_df= car_prices_df[car_prices_df['make'].isin(make_pp.index)]

In [None]:
tencar_df.make.value_counts() , tencar_df.shape

In [None]:
tencar_df.isna().sum()

In [None]:
# Keep only the modelling columns; the target 'sellingprice' is deliberately last
tencar_df = tencar_df[[
    'year',
    'make',
    'model',
    'trim',
    'transmission',
    'state',
    'condition',
    'odometer',
    'color',
    'interior',
    'mmr',
    'sellingprice',
]]
   

In [None]:
# Number of fully duplicated rows in the filtered frame
tencar_df.duplicated().sum()

In [None]:
# Preview the filtered frame
tencar_df.head()

In [None]:
# Correlation heatmap of the numeric columns for the top-ten-make subset.
# Text columns have not been label-encoded yet at this point, and
# DataFrame.corr() raises on non-numeric data in pandas >= 2.0, so restrict to
# numeric columns explicitly before computing the correlations.
plt.figure(figsize=(7, 5))
sns.heatmap(
    tencar_df.select_dtypes(include="number").corr(),
    annot=True,
    fmt='.2f',
    cmap='gist_heat',
    linewidths=1,
    vmin=-1,
    vmax=1,
)
plt.title('Correlation Heatmap')

plt.show()

In [None]:
cat_encoder = LabelEncoder()

In [None]:
for i in tencar_df.columns:
    if tencar_df[i].dtype =='object':
        tencar_df[i] = cat_encoder.fit_transform(tencar_df[i])

In [None]:
tencar_df.head()

## Machine Learning 

#### Step 1: Separate target and feature variables, split the DataFrame, create the scaler

In [None]:
# Separate the y variable, the target
y = tencar_df["sellingprice"]

# Separate the X variable, the features
X = tencar_df.drop(columns="sellingprice")

In [None]:
# Split the DataFrame using train_test_split
# Assign a random_state of 42 to the function
xtrain,xtest,ytrain,ytest = train_test_split(X,y,test_size=0.3,shuffle=True,random_state=42)

In [None]:
xtrain.shape, ytrain.shape

In [None]:
# Min-max scaler with default settings
scaler = MinMaxScaler()

In [None]:
# Positional split of tencar_df: every column except the last becomes X,
# the last column becomes y
X= tencar_df.iloc[:,:-1]
y= tencar_df.iloc[:,-1]

In [None]:
# Display the target series
y

In [None]:
for i in X.columns:
    X[i] = scaler.fit_transform(np.array(X[i]).reshape(-1,1))

In [None]:
X

#### Lasso Regression

In [None]:
# Baseline Lasso regression with a regularisation strength of 0.1
lasso_model = Lasso(alpha=0.1)

In [None]:
# Fit on the training split
lasso_model.fit(xtrain,ytrain)

In [None]:
# Predict selling prices for the held-out test split
pred=lasso_model.predict(xtest)

In [None]:
# Square root of the mean squared error (i.e. RMSE, despite the variable name),
# rounded to 2 decimals
msre = round(np.sqrt(mean_squared_error(ytest,pred)),2)
msre

In [None]:
# Mean squared error on the test split
mean_squared_error(ytest,pred)

In [None]:
# Fitted coefficients, one per feature column
lasso_model.coef_

In [None]:
# Fitted intercept
lasso_model.intercept_

In [None]:
# R^2 on the test split
r2_score(ytest,pred)

In [None]:
# Candidate regularisation strengths to search over
param = {'alpha' :[1e-15,1e-10,1e-8,1e-3,1e-2,1,5,10,20,30,40,50,100, 1000, 10000]}

In [None]:
# 3-fold cross-validated grid search over alpha, run in a single process
lasso_twocv= GridSearchCV(lasso_model, param, cv=3, n_jobs=1)

In [None]:
# Run the search on the training split
lasso_twocv.fit(xtrain, ytrain)

In [None]:
# Predict the test split with the fitted search object
pred_two= lasso_twocv.predict(xtest)

In [None]:
# Mean absolute error of the grid-search predictions
mean_absolute_error(ytest, pred_two)

In [None]:
# Mean squared error of the grid-search predictions
mean_squared_error(ytest, pred_two)

In [None]:
# R^2 of the grid-search predictions
r2_score(ytest, pred_two)

In [None]:
# The estimator selected by the search
lasso_twocv.best_estimator_

In [None]:
# Second Lasso model with a fixed alpha of 10000
# NOTE(review): presumably copied from the grid-search result shown above —
# confirm, or read it from lasso_twocv.best_params_ instead of hard-coding it.
lass_two = Lasso(alpha=10000)

In [None]:
# Fit on the training split
lass_two.fit(xtrain, ytrain)

In [None]:
# Fitted intercept
lass_two.intercept_

In [None]:
# Fitted coefficients, one per feature column
lass_two.coef_

In [None]:
coef_dict={}
for coef, feat in zip(lasso_model.coef_,X.columns):
    coef_dict[feat] = coef
coef_dict

In [None]:
predict_lasso = lass_two.predict(xtest)

In [None]:
coef_names = tencar_df.drop('sellingprice',axis=1).columns

print(coef_names)

In [None]:
# Fit a Lasso (alpha=100) on the full X / y and keep its coefficients
lasso = Lasso(alpha=100).fit(X, y)
lasso_coef = lasso.coef_

# Line plot of the coefficient for each feature, labelled by feature name
plt.plot(range(len(coef_names)), lasso_coef)
plt.xticks(ticks=range(len(coef_names)), labels=coef_names, rotation=90)
plt.ylabel("Coefficients")
plt.show()

#### Linear Regression

In [None]:
linreg = LinearRegression()
linreg.fit(X_train, y_train)
linreg_pred = linreg.predict(X_test)

In [None]:
linreg_mae = mean_absolute_error(y_test, linreg_pred)
linreg_r2 = r2_score(y_test, linreg_pred)
print("Mean Absolute Error of linear regression model is:", linreg_mae)
print("R2 score of linear regression model is:", linreg_r2)

In [None]:
linreg_score = cross_val_score(linreg, X_test, y_test, cv=4)


In [None]:
# Summary statistics for every column of the encoded frame
tencar_df.describe()

In [None]:
# TODO: revisit preprocessing and normalize the data to reduce skewness

In [None]:
# Residuals of the linear regression on the test split: actual minus predicted.
# (The original subtracted two undefined names, y_train - y_test, which is not
# a residual and raised NameError.)
residuals = ytest - linreg_pred

# Residual plot: predictions on the x-axis, to match the axis label
plt.figure(figsize=(8, 6))
sns.scatterplot(x=linreg_pred, y=residuals)
# Zero line: points above it were under-predicted, points below over-predicted
plt.axhline(y=0, color='red', linestyle='--', linewidth=2)
plt.title("Residual Plot")
plt.xlabel("Predicted Values")
plt.ylabel("Residuals")
plt.show()

### Principal Component Analysis 

In [None]:
## If we had more time, things we could look into:
# which states had the best selling prices
# which sellers had the lowest/highest selling prices