In [7]:
import pandas as pd

# Load the dataset
file_path = 'Car Dataset.csv'
car_data = pd.read_csv(file_path)

# Display the first few rows of the dataset
car_data.head()


Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,Maruti 800 AC,2007,60000,70000,Petrol,Individual,Manual,First Owner
1,Maruti Wagon R LXI Minor,2007,135000,50000,Petrol,Individual,Manual,First Owner
2,Hyundai Verna 1.6 SX,2012,600000,100000,Diesel,Individual,Manual,First Owner
3,Datsun RediGO T Option,2017,250000,46000,Petrol,Individual,Manual,First Owner
4,Honda Amaze VX i-DTEC,2014,450000,141000,Diesel,Individual,Manual,Second Owner


In [8]:
# Display the first few rows of the dataset
print(car_data.head())

                       name  year  selling_price  km_driven    fuel  \
0             Maruti 800 AC  2007          60000      70000  Petrol   
1  Maruti Wagon R LXI Minor  2007         135000      50000  Petrol   
2      Hyundai Verna 1.6 SX  2012         600000     100000  Diesel   
3    Datsun RediGO T Option  2017         250000      46000  Petrol   
4     Honda Amaze VX i-DTEC  2014         450000     141000  Diesel   

  seller_type transmission         owner  
0  Individual       Manual   First Owner  
1  Individual       Manual   First Owner  
2  Individual       Manual   First Owner  
3  Individual       Manual   First Owner  
4  Individual       Manual  Second Owner  


In [9]:
# Check for missing values
print(car_data.isnull().sum())

name             0
year             0
selling_price    0
km_driven        0
fuel             0
seller_type      0
transmission     0
owner            0
dtype: int64


In [10]:
# Handle missing values for numeric columns
numeric_columns = car_data.select_dtypes(include=['number']).columns
car_data[numeric_columns] = car_data[numeric_columns].fillna(car_data[numeric_columns].median())

In [11]:
# Handle missing values for categorical columns
categorical_columns = car_data.select_dtypes(include=['object']).columns
car_data[categorical_columns] = car_data[categorical_columns].fillna(car_data[categorical_columns].mode().iloc[0])

In [12]:
# Verify that there are no missing values left
print(car_data.isnull().sum())

name             0
year             0
selling_price    0
km_driven        0
fuel             0
seller_type      0
transmission     0
owner            0
dtype: int64


In [17]:
# Convert categorical features to numerical values using one-hot encoding
car_data_encoded = pd.get_dummies(car_data, drop_first=True)
car_data_encoded.head()

Unnamed: 0,year,selling_price,km_driven,name_Ambassador Classic 2000 Dsz,name_Ambassador Grand 1800 ISZ MPFI PW CL,name_Audi A4 1.8 TFSI,name_Audi A4 2.0 TDI,name_Audi A4 2.0 TDI 177 Bhp Premium Plus,name_Audi A4 3.0 TDI Quattro,name_Audi A4 30 TFSI Technology,...,fuel_Electric,fuel_LPG,fuel_Petrol,seller_type_Individual,seller_type_Trustmark Dealer,transmission_Manual,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,2007,60000,70000,False,False,False,False,False,False,False,...,False,False,True,True,False,True,False,False,False,False
1,2007,135000,50000,False,False,False,False,False,False,False,...,False,False,True,True,False,True,False,False,False,False
2,2012,600000,100000,False,False,False,False,False,False,False,...,False,False,False,True,False,True,False,False,False,False
3,2017,250000,46000,False,False,False,False,False,False,False,...,False,False,True,True,False,True,False,False,False,False
4,2014,450000,141000,False,False,False,False,False,False,False,...,False,False,False,True,False,True,False,True,False,False


In [20]:
from sklearn.preprocessing import StandardScaler

# Initialize the scaler
scaler = StandardScaler()

# Scale the features
scaled_features = scaler.fit_transform(car_data_encoded.drop('selling_price', axis=1))
scaled_data = pd.DataFrame(scaled_features, columns=car_data_encoded.columns[:-1])
scaled_data['selling_price'] = car_data_encoded['selling_price']

In [21]:
# Assuming 'selling_price' is the target variable
features = scaled_data.drop('selling_price', axis=1)
target = scaled_data['selling_price']

In [22]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)


In [26]:
from sklearn.metrics import mean_squared_error, r2_score

In [27]:
# Function to train and evaluate models
def train_evaluate(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    mse = mean_squared_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)
    return mse, r2

In [28]:
# Initialize the models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

In [29]:
linear_model = LinearRegression()
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
gb_model = GradientBoostingRegressor(n_estimators=100, random_state=42)

In [30]:
# Train and evaluate models
mse_linear, r2_linear = train_evaluate(linear_model, X_train, y_train, X_test, y_test)
mse_rf, r2_rf = train_evaluate(rf_model, X_train, y_train, X_test, y_test)
mse_gb, r2_gb = train_evaluate(gb_model, X_train, y_train, X_test, y_test)

In [31]:
# Compare results
results = pd.DataFrame({
    'Model': ['Linear Regression', 'Random Forest', 'Gradient Boosting'],
    'MSE': [mse_linear, mse_rf, mse_gb],
    'R2': [r2_linear, r2_rf, r2_gb]
})

results

Unnamed: 0,Model,MSE,R2
0,Linear Regression,1.3179999999999999e+39,-4.3189e+27
1,Random Forest,128103700000.0,0.5802224
2,Gradient Boosting,140672100000.0,0.5390372


## Submitted By Jestover Mark David (1206)