### Read the csv file as a dataframe object and check the data

In [1]:
import pandas as pd
# Load the raw car listings; the path is relative to the kernel's working directory.
df = pd.read_csv('car data.csv')

In [2]:
# Preview the first rows to check that the columns were parsed as expected.
df.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,Maruti 800 AC,2007,60000,70000,Petrol,Individual,Manual,First Owner
1,Maruti Wagon R LXI Minor,2007,135000,50000,Petrol,Individual,Manual,First Owner
2,Hyundai Verna 1.6 SX,2012,600000,100000,Diesel,Individual,Manual,First Owner
3,Datsun RediGO T Option,2017,250000,46000,Petrol,Individual,Manual,First Owner
4,Honda Amaze VX i-DTEC,2014,450000,141000,Diesel,Individual,Manual,Second Owner


In [3]:
# (number of rows, number of columns) of the raw data.
df.shape

(4340, 8)

In [4]:
# Count the missing values in each column.
df.isnull().sum()

name             0
year             0
selling_price    0
km_driven        0
fuel             0
seller_type      0
transmission     0
owner            0
dtype: int64

### Derive the age from the year and drop unnecessary columns

In [5]:
import datetime
# Age is measured against the calendar year in which the notebook is executed.
current_year = datetime.datetime.now().year

# Build the modelling frame from a copy of `df` (the raw frame stays untouched):
# add the derived `age`, then discard `year` (replaced by `age`) and `name`.
final_dataframe = (
    df.copy()
    .assign(age=lambda frame: current_year - frame['year'])
    .drop(columns=['year', 'name'])
)
final_dataframe.head()

Unnamed: 0,selling_price,km_driven,fuel,seller_type,transmission,owner,age
0,60000,70000,Petrol,Individual,Manual,First Owner,16
1,135000,50000,Petrol,Individual,Manual,First Owner,16
2,600000,100000,Diesel,Individual,Manual,First Owner,11
3,250000,46000,Petrol,Individual,Manual,First Owner,6
4,450000,141000,Diesel,Individual,Manual,Second Owner,9


### Creating dummy variables for text values

In [6]:
# Show the distinct values of each text column before it is one-hot encoded.
for column_name in ('fuel', 'seller_type', 'transmission', 'owner'):
    print(df[column_name].unique())

['Petrol' 'Diesel' 'CNG' 'LPG' 'Electric']
['Individual' 'Dealer' 'Trustmark Dealer']
['Manual' 'Automatic']
['First Owner' 'Second Owner' 'Fourth & Above Owner' 'Third Owner'
 'Test Drive Car']


In [7]:
# One-hot encode every text column into binary indicator columns.
# drop_first=True omits the first category of each column to avoid the
# dummy-variable trap (the omitted category is implied when all others are 0).
# dtype=int keeps the indicators as 0/1 integers regardless of the pandas
# version (newer releases default to bool), so the later plots and NumPy
# conversions always receive purely numeric data.
final_dataframe = pd.get_dummies(final_dataframe, drop_first=True, dtype=int)
final_dataframe.head()

Unnamed: 0,selling_price,km_driven,age,fuel_Diesel,fuel_Electric,fuel_LPG,fuel_Petrol,seller_type_Individual,seller_type_Trustmark Dealer,transmission_Manual,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,60000,70000,16,0,0,0,1,1,0,1,0,0,0,0
1,135000,50000,16,0,0,0,1,1,0,1,0,0,0,0
2,600000,100000,11,1,0,0,0,1,0,1,0,0,0,0
3,250000,46000,6,0,0,0,1,1,0,1,0,0,0,0
4,450000,141000,9,1,0,0,0,1,0,1,0,1,0,0


### Checking the histogram, Scatter Plot & Correlations

In [None]:
# Histogram (50 bins) of each numeric column to inspect its distribution.
final_dataframe.hist(bins = 50, figsize = (20, 15))

array([[<AxesSubplot: title={'center': 'selling_price'}>,
        <AxesSubplot: title={'center': 'km_driven'}>,
        <AxesSubplot: title={'center': 'age'}>,
        <AxesSubplot: title={'center': 'fuel_Diesel'}>],
       [<AxesSubplot: title={'center': 'fuel_Electric'}>,
        <AxesSubplot: title={'center': 'fuel_LPG'}>,
        <AxesSubplot: title={'center': 'fuel_Petrol'}>,
        <AxesSubplot: title={'center': 'seller_type_Individual'}>],
       [<AxesSubplot: title={'center': 'seller_type_Trustmark Dealer'}>,
        <AxesSubplot: title={'center': 'transmission_Manual'}>,
        <AxesSubplot: title={'center': 'owner_Fourth & Above Owner'}>,
        <AxesSubplot: title={'center': 'owner_Second Owner'}>],
       [<AxesSubplot: title={'center': 'owner_Test Drive Car'}>,
        <AxesSubplot: title={'center': 'owner_Third Owner'}>,
        <AxesSubplot: >, <AxesSubplot: >]], dtype=object)

In [None]:
cols = final_dataframe.columns
# Draw one scatter plot per feature with selling_price on the y-axis;
# the target itself is not plotted against itself.
for feature in cols:
    if feature != "selling_price":
        final_dataframe.plot(kind="scatter", x=feature, y="selling_price", alpha=0.8)


In [None]:
# Pairwise correlation between all columns; the sign of a coefficient shows
# whether two columns move in the same direction or in opposite directions.
corr_matrix = final_dataframe.corr()
# Correlation of every column with selling_price, highest value first.
# (The previous note referred to "MEDV", which is not a column of this data set.)
corr_matrix['selling_price'].sort_values(ascending=False)

### Stratified Train Test Split

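The split is stratified on a subset of the dummy variables, so that the category combinations appear in similar proportions in the train and test sets.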

In [None]:
# Collect the names of the one-hot encoded columns by matching their prefixes.
dummy_cols = final_dataframe.filter(regex='fuel_|seller_type_|transmission_|owner_').columns.tolist()
dummy_cols

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit
# A single 80/20 stratified shuffle split with a fixed seed for reproducibility.
ssp = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# Dummy columns whose combined value defines the strata.
# NOTE(review): fuel_Electric, seller_type_Individual, transmission_Manual and
# 'owner_Fourth & Above Owner' are not listed -- presumably to avoid strata
# with too few rows for a stratified split; confirm before adding them.
dummy_columns = ['fuel_Diesel',
 'fuel_LPG',
 'fuel_Petrol',
 'seller_type_Trustmark Dealer',
 'owner_Second Owner',
 'owner_Test Drive Car',
 'owner_Third Owner',
]

# n_splits=1, so the generator yields exactly one (train, test) index pair.
train_index, test_index = next(ssp.split(final_dataframe, final_dataframe[dummy_columns]))

# split() returns positional indices, so the rows must be selected with .iloc;
# .loc would only give the same rows while the index is the default 0..n-1 range.
strat_train_set = final_dataframe.iloc[train_index]
strat_test_set = final_dataframe.iloc[test_index]

In [None]:
# Distribution of transmission_Manual in the training split.
strat_train_set['transmission_Manual'].value_counts()

In [None]:
# Distribution of transmission_Manual in the test split, for comparison with the training split.
strat_test_set['transmission_Manual'].value_counts()

### Creating a Pipeline

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
# Preprocessing pipeline: replace missing values with the column median.
my_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
])

# Fit the pipeline on the training split only, so that the statistics it
# learns never see the test rows. fit_transform returns a NumPy array,
# not a pandas DataFrame.
final_train_set = my_pipeline.fit_transform(strat_train_set)

# Push the test split through the SAME fitted pipeline so both splits receive
# identical preprocessing and end up as NumPy arrays of the same dtype
# (taking `.values` directly would bypass the pipeline).
final_test_set = my_pipeline.transform(strat_test_set)

### Feature-Label Split

In [None]:
# final_train_set / final_test_set are NumPy arrays containing both the
# features and the target variable, in the column order of strat_train_set.
import numpy as np


def _split_features_target(data, target_index):
    """Split a 2-D array into ``(features, target)``.

    Parameters
    ----------
    data : 2-D array with one row per sample.
    target_index : position of the target column in ``data``.

    Returns
    -------
    tuple
        ``features`` holds every column except ``target_index`` (original
        order preserved); ``target`` is the 1-D column at ``target_index``.
    """
    features = np.delete(data, target_index, axis=1)
    target = data[:, target_index]
    return features, target


# The value being predicted is selling_price. Look its position up by name
# instead of hard-coding it: the previously hard-coded index 2 pointed at a
# different column (selling_price is the first column of the frame), so the
# models were trained on the wrong target.
target_index = strat_train_set.columns.get_loc("selling_price")

X_train, y_train = _split_features_target(final_train_set, target_index)
X_test, y_test = _split_features_target(final_test_set, target_index)


### Evaluating Different Models

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

def evaluate(model, X, y):
    """Score an already-fitted model on the data ``(X, y)``.

    Parameters
    ----------
    model : object exposing ``predict(X)``.
    X : feature data handed to ``model.predict``.
    y : true target values matching the rows of ``X``.

    Returns
    -------
    tuple
        ``(mse, r2, std_dev)``: the mean squared error, the R-squared score
        and the standard deviation of the residuals ``y - prediction``.
    """
    predictions = model.predict(X)
    return (
        mean_squared_error(y, predictions),
        r2_score(y, predictions),
        np.std(y - predictions),
    )


In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

def evaluate_with_cv(model, X, y):
    """Estimate a model's error with 10-fold cross-validation.

    Parameters
    ----------
    model : estimator accepted by ``cross_val_score`` that exposes ``fit``.
    X : feature data.
    y : target values matching the rows of ``X``.

    Returns
    -------
    tuple
        ``(cv_mse, cv_std_dev)``: the mean of the per-fold mean squared
        errors and the standard deviation of the per-fold scores.

    Notes
    -----
    As a side effect ``model`` is (re)fitted on the complete ``(X, y)`` data
    before returning.
    """
    # The scorer returns the NEGATIVE mean squared error, hence the sign flip.
    scores = cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=10)
    cv_mse = -scores.mean()
    cv_std_dev = scores.std()

    # Fit the model on the full data handed in, so the caller gets back a
    # fitted estimator. The former extra predict() call was removed: its
    # result was never used.
    model.fit(X, y)

    return cv_mse, cv_std_dev

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

def evaluate_models(X_train, y_train, X_test, y_test):
    """Fit several regressors and print a comparison report.

    For a linear regression, a decision tree and a random forest this fits
    the model on the training data, scores it on the test data with
    ``evaluate`` and cross-validates it on the training data with
    ``evaluate_with_cv``, then prints the metrics.

    Parameters
    ----------
    X_train, y_train : training features and targets.
    X_test, y_test : held-out features and targets.

    Returns
    -------
    None
        The results are only printed.
    """
    # The tree-based models are randomised; a fixed seed (the same value the
    # train/test split uses) makes the printed report reproducible.
    models = [
        LinearRegression(),
        DecisionTreeRegressor(random_state=42),
        RandomForestRegressor(random_state=42),
    ]

    for model in models:
        # Fit the model to the training data
        model.fit(X_train, y_train)

        # Evaluate the fitted model on the held-out test data
        mse, r2, std_dev = evaluate(model, X_test, y_test)

        # Cross-validate on the training data only
        cv_mse, cv_std_dev = evaluate_with_cv(model, X_train, y_train)

        # Print the evaluation results
        print("{}:\n".format(type(model).__name__))
        print("  Mean squared error: {:.2f}\n".format(mse))
        print("  R-squared score: {:.2f}\n".format(r2))
        print("  Standard deviation of residuals: {:.2f}\n".format(std_dev))
        print("  Cross-validation mean squared error: {:.2f}\n".format(cv_mse))
        print("  Cross-validation standard deviation: {:.2f}\n\n".format(cv_std_dev))

In [None]:
# Fit, score and print the report for every candidate model.
evaluate_models(X_train, y_train, X_test, y_test)

### Export the Best Model

In [None]:
from joblib import dump, load

# Train a fresh random forest on the full training split and persist it.
# The fixed seed (same value as the train/test split) makes the exported
# model reproducible across notebook runs.
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)
# Written relative to the kernel's working directory.
dump(model, 'car_prediction_model.joblib')