In [11]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import numpy as np
import xgboost as xgb

# Load the dataset
df = pd.read_csv('D:/DC CARs/merged_file.csv')

# Selecting features and target
features = df[['modelYear','Seats', 'ownerNo', 'Kms Driven', 'Engine Displacement', 'Mileage']]
target = df['price']

# Standardizing the features
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features_scaled, target, test_size=0.2, random_state=42)

# Creating and training the XGBoost regression model
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, learning_rate=0.1, max_depth=6)
xgb_model.fit(X_train, y_train)

# Making predictions
predictions = xgb_model.predict(X_test)

# Evaluating the model
mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)

print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')


Mean Squared Error: 157850008684.54678
R^2 Score: 0.8885952234268188


In [12]:
#
import pandas as pd
      
# Calculate Q1 (25th percentile) and Q3 (75th percentile)
Q1 = df[['modelYear', 'Seats', 'ownerNo', 'Kms Driven', 'Engine Displacement', 'Mileage']].quantile(0.25)
Q3 = df[['modelYear', 'Seats', 'ownerNo', 'Kms Driven', 'Engine Displacement', 'Mileage']].quantile(0.75)
IQR = Q3 - Q1
      
# Determine outliers
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
      
# Remove outliers
df_cleaned = df[~((df[['modelYear', 'Seats', 'ownerNo', 'Kms Driven', 'Engine Displacement', 'Mileage']] < lower_bound) | df[['modelYear', 'Seats', 'ownerNo', 'Kms Driven', 'Engine Displacement', 'Mileage']]).any(axis=1)]
      
# Display the cleaned dataframe
df_cleaned_head = df_cleaned.head()
print(df_cleaned_head)
  


Empty DataFrame
Columns: [ft, bt, ownerNo, oem, model, modelYear, centralVariantId, variantName, price, trendingText, Insurance Validity, Seats, Kms Driven, RTO, Engine Displacement, Transmission, Comfort & Convenience, Interior, Exterior, Safety, Entertainment & Communication, Mileage, Max Power, Torque, Wheel Size]
Index: []

[0 rows x 25 columns]


In [13]:
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=2, include_bias=False)
poly_features = poly.fit_transform(features)
poly_feature_names = poly.get_feature_names_out(['modelYear', 'Seats', 'ownerNo', 'Kms Driven', 'Engine Displacement', 'Mileage'])
features_poly = pd.DataFrame(poly_features, columns=poly_feature_names)


In [14]:
df['Engine_Mileage_Ratio'] = df['Engine Displacement'] / df['Mileage']
features = df[['modelYear', 'Seats', 'ownerNo', 'Kms Driven', 'Engine Displacement', 'Mileage', 'Engine_Mileage_Ratio']]


In [15]:
from sklearn.linear_model import Lasso

lasso = Lasso(alpha=0.1)  # Adjust alpha for regularization strength
lasso.fit(X_train, y_train)


In [9]:
from sklearn.linear_model import Ridge

ridge = Ridge(alpha=1.0)  # Adjust alpha for regularization strength
ridge.fit(X_train, y_train)


In [4]:
#Testing the model
import pickle

# Load the model using pickle
with open('car_prediction.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

# Check if the model is properly fitted
if hasattr(loaded_model, 'feature_importances_'):  # A simple check for a fitted model
    print("Model loaded successfully and is ready for predictions.")
else:
    print("Model is not properly loaded or not fitted.")

# Making predictions
features = [[2020, 4, 1, 10000, 1500, 15]]  # Example features
prediction = loaded_model.predict(features)

print(f"Predicted Price: ${prediction[0]:,.2f}")


Model loaded successfully and is ready for predictions.
Predicted Price: $9,436,331.00


In [5]:
import pickle
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import pandas as pd

# Load the dataset
df = pd.read_csv('D:/DC CARs/merged_file.csv')

# Selecting features and target
features = df[['modelYear', 'Seats', 'ownerNo', 'Kms Driven', 'Engine Displacement', 'Mileage']]
target = df['price']

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Standardizing the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Creating and training the XGBoost regression model
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, learning_rate=0.1, max_depth=6)
xgb_model.fit(X_train_scaled, y_train)

# Save the model and scaler using pickle
with open('D:/DC CARs/car_prediction.pkl', 'wb') as file:
    pickle.dump(xgb_model, file)

with open('D:/DC CARs/scaler.pkl', 'wb') as file:
    pickle.dump(scaler, file)
