In [1]:
import pandas as pd
import ast
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import pickle as pk

# Function to process individual datasets
def load_and_process_data(filepath, place):
    # Load the dataset
    data = pd.read_excel(filepath)
    
    # Convert columns to lowercase and replace spaces with underscores
    data.columns = data.columns.str.lower().str.replace(' ', '_')

    # Convert columns to dictionaries if needed
    data['new_car_detail'] = data['new_car_detail'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
    data['new_car_overview'] = data['new_car_overview'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
    data['new_car_feature'] = data['new_car_feature'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
    data['new_car_specs'] = data['new_car_specs'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

    # Extract new car overview from 'new_car_overview' column
    overview_records = []
    for overview in data['new_car_overview']:
        if isinstance(overview, dict):
            top_data = {item['key']: item['value'] for item in overview.get('top', [])}
            overview_records.append(top_data)
    overview_df = pd.DataFrame(overview_records)

    # Extract new car features from 'new_car_feature' column
    features_records = []
    for car_features in data['new_car_feature']:
        if isinstance(car_features, dict) and 'top' in car_features:
            top_data = {item['value'] for item in car_features['top']}
            features_records.append(top_data)
        else:
            features_records.append(set())
    features_df = pd.DataFrame(features_records)
    features_df.columns = [f'feature_{i+1}' for i in range(features_df.shape[1])]

    # Extract new car specs from 'new_car_specs' column
    spec_records = []
    for spec in data['new_car_specs']:
        if isinstance(spec, dict):
            top_data = {item['key']: item['value'] for item in spec.get('top', [])}
            spec_records.append(top_data)
    spec_df = pd.DataFrame(spec_records)

    # Extract new car details from 'new_car_detail' column
    details = []
    for detail in data['new_car_detail']:
        if isinstance(detail, dict):
            details.append(detail)
    details_df = pd.DataFrame(details)

    # Combine all DataFrames into one
    combined_df = pd.concat([details_df, overview_df, features_df, spec_df], axis=1)
    
    combined_df.columns = combined_df.columns.str.lower().str.replace(' ', '_')

    # Remove duplicate columns
    combined_df = combined_df.loc[:, ~combined_df.columns.duplicated()]

    columns_to_drop = ['it', 'ft', 'bt', 'km', 'ownerno', 'oem', 'centralvariantid', 'owner',
                   'variantname', 'priceactual', 'pricesaving', 'pricefixedtext', 
                   'trendingtext', 'registration_year', 'insurance_validity','rto', 'engine_displacement', 'year_of_manufacture',
                   'feature_1', 'feature_2', 'feature_3', 'feature_4', 'feature_5',
                   'feature_6', 'feature_7', 'feature_8', 'feature_9', 'wheel_size','torque']

    combined_df.drop(columns=columns_to_drop, inplace=True, errors='ignore')

    # Add the 'place' column for this dataset
    combined_df['place'] = place

    return combined_df

# List of file paths for the datasets
file_paths = [
    r'dataset/chennai_cars.xlsx',
    r'dataset/bangalore_cars.xlsx',
    r'dataset/delhi_cars.xlsx',
    r'dataset/hyderabad_cars.xlsx',
    r'dataset/kolkata_cars.xlsx'
]

# Corresponding state names (places)
places = ['Chennai', 'Bangalore', 'Delhi', 'Hyderabad', 'Kolkata']

# Initialize an empty list to store each processed DataFrame
dataframes = []

# Loop through file paths and places to load and process each dataset
for filepath, place in zip(file_paths, places):
    df = load_and_process_data(filepath, place)
    dataframes.append(df)

# Concatenate all DataFrames into one
combined_df = pd.concat(dataframes, ignore_index=True)

# Save the combined DataFrame to a CSV file
combined_df.to_csv('combined_data_cleaned.csv', index=False)

# Display the combined DataFrame (optional)
print(combined_df.head())


  transmission           model  modelyear         price fuel_type    seats  \
0    Automatic       Kia Sonet       2022  ₹ 11.50 Lakh    Petrol  5 Seats   
1       Manual     Maruti Eeco       2015   ₹ 4.15 Lakh    Petrol  7 Seats   
2       Manual  Nissan Magnite       2021   ₹ 7.50 Lakh    Petrol  5 Seats   
3       Manual     Hyundai i10       2015   ₹ 3.98 Lakh    Petrol  5 Seats   
4       Manual      Honda Jazz       2015   ₹ 5.50 Lakh    Petrol  5 Seats   

   kms_driven    ownership   engine  max_power     mileage    place  
0  20,000 Kms  First Owner   998 CC  118.36bhp         NaN  Chennai  
1  20,687 Kms  First Owner  1196 CC      73bhp  15.37 kmpl  Chennai  
2  30,000 Kms  First Owner   999 CC   98.63bhp     20 kmpl  Chennai  
3  59,247 Kms  First Owner  1086 CC   68.05bhp  19.81 kmpl  Chennai  
4  50,000 Kms  First Owner  1199 CC    88.7bhp   18.7 kmpl  Chennai  


In [2]:
combined_df

Unnamed: 0,transmission,model,modelyear,price,fuel_type,seats,kms_driven,ownership,engine,max_power,mileage,place
0,Automatic,Kia Sonet,2022,₹ 11.50 Lakh,Petrol,5 Seats,"20,000 Kms",First Owner,998 CC,118.36bhp,,Chennai
1,Manual,Maruti Eeco,2015,₹ 4.15 Lakh,Petrol,7 Seats,"20,687 Kms",First Owner,1196 CC,73bhp,15.37 kmpl,Chennai
2,Manual,Nissan Magnite,2021,₹ 7.50 Lakh,Petrol,5 Seats,"30,000 Kms",First Owner,999 CC,98.63bhp,20 kmpl,Chennai
3,Manual,Hyundai i10,2015,₹ 3.98 Lakh,Petrol,5 Seats,"59,247 Kms",First Owner,1086 CC,68.05bhp,19.81 kmpl,Chennai
4,Manual,Honda Jazz,2015,₹ 5.50 Lakh,Petrol,5 Seats,"50,000 Kms",First Owner,1199 CC,88.7bhp,18.7 kmpl,Chennai
...,...,...,...,...,...,...,...,...,...,...,...,...
7244,Manual,Maruti Celerio,2022,₹ 5.10 Lakh,Petrol,5 Seats,"10,000 Kms",First Owner,998 CC,65.71bhp,25.24 kmpl,Kolkata
7245,Manual,Maruti Alto 800,2014,₹ 1.80 Lakh,Petrol,5 Seats,"1,20,000 Kms",First Owner,796 CC,47.3bhp,22.74 kmpl,Kolkata
7246,Automatic,Mercedes-Benz C-Class,2011,₹ 5.50 Lakh,Petrol,5 Seats,"50,000 Kms",Third Owner,1796 CC,186bhp,11.74 kmpl,Kolkata
7247,Manual,Maruti Ritz,2012,₹ 1.40 Lakh,Petrol,5 Seats,"40,000 Kms",First Owner,1197 CC,85.80bhp,18.5 kmpl,Kolkata


In [3]:
combined_df.columns

Index(['transmission', 'model', 'modelyear', 'price', 'fuel_type', 'seats',
       'kms_driven', 'ownership', 'engine', 'max_power', 'mileage', 'place'],
      dtype='object')

In [4]:
combined_df.head()

Unnamed: 0,transmission,model,modelyear,price,fuel_type,seats,kms_driven,ownership,engine,max_power,mileage,place
0,Automatic,Kia Sonet,2022,₹ 11.50 Lakh,Petrol,5 Seats,"20,000 Kms",First Owner,998 CC,118.36bhp,,Chennai
1,Manual,Maruti Eeco,2015,₹ 4.15 Lakh,Petrol,7 Seats,"20,687 Kms",First Owner,1196 CC,73bhp,15.37 kmpl,Chennai
2,Manual,Nissan Magnite,2021,₹ 7.50 Lakh,Petrol,5 Seats,"30,000 Kms",First Owner,999 CC,98.63bhp,20 kmpl,Chennai
3,Manual,Hyundai i10,2015,₹ 3.98 Lakh,Petrol,5 Seats,"59,247 Kms",First Owner,1086 CC,68.05bhp,19.81 kmpl,Chennai
4,Manual,Honda Jazz,2015,₹ 5.50 Lakh,Petrol,5 Seats,"50,000 Kms",First Owner,1199 CC,88.7bhp,18.7 kmpl,Chennai


In [5]:
combined_df.shape

(7249, 12)

In [6]:
combined_df.isnull().sum()

transmission      0
model             0
modelyear         0
price             0
fuel_type         0
seats             5
kms_driven        1
ownership        31
engine            4
max_power        51
mileage         259
place             0
dtype: int64

In [7]:
combined_df.dropna(inplace=True)

In [8]:
combined_df.shape

(6925, 12)

In [9]:
combined_df.duplicated().sum()

88

In [10]:
combined_df.drop_duplicates(inplace=True)

In [11]:
combined_df.shape

(6837, 12)

In [12]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6837 entries, 1 to 7248
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   transmission  6837 non-null   object
 1   model         6837 non-null   object
 2   modelyear     6837 non-null   int64 
 3   price         6837 non-null   object
 4   fuel_type     6837 non-null   object
 5   seats         6837 non-null   object
 6   kms_driven    6837 non-null   object
 7   ownership     6837 non-null   object
 8   engine        6837 non-null   object
 9   max_power     6837 non-null   object
 10  mileage       6837 non-null   object
 11  place         6837 non-null   object
dtypes: int64(1), object(11)
memory usage: 694.4+ KB


In [13]:
def get_brand_name(car_name):
    car_name = car_name.split(' ')[0]
    return car_name.strip()

In [14]:
def get_kms(kms):
    kms = kms.replace(',', '')
    return float(kms)

In [15]:
def clean_data(value):
    value = value.split(' ')[0]
    value = value.strip()
    if value == '':
        value = 0
    return float(value)

In [16]:
combined_df['model'] = combined_df['model'].apply(get_brand_name)

In [17]:
combined_df['engine'] = combined_df['engine'].apply(get_brand_name)

In [18]:
combined_df['kms_driven'] = combined_df['kms_driven'].apply(get_brand_name)

In [19]:
combined_df['kms_driven'] = combined_df['kms_driven'].apply(get_kms)

In [20]:
combined_df['model'].unique()

array(['Maruti', 'Nissan', 'Hyundai', 'Honda', 'Mercedes-Benz', 'BMW',
       'Ford', 'Tata', 'Jeep', 'Audi', 'Toyota', 'Mahindra', 'Renault',
       'Chevrolet', 'Volkswagen', 'Datsun', 'Kia', 'Fiat', 'Land', 'MG',
       'Skoda', 'Isuzu', 'Mini', 'Volvo', 'Jaguar', 'Citroen',
       'Mitsubishi', 'Lexus', 'OpelCorsa', 'Porsche', 'Ambassador'],
      dtype=object)

In [21]:
combined_df['mileage'] = combined_df['mileage'].apply(clean_data)

In [22]:
combined_df.head()

Unnamed: 0,transmission,model,modelyear,price,fuel_type,seats,kms_driven,ownership,engine,max_power,mileage,place
1,Manual,Maruti,2015,₹ 4.15 Lakh,Petrol,7 Seats,20687.0,First Owner,1196,73bhp,15.37,Chennai
2,Manual,Nissan,2021,₹ 7.50 Lakh,Petrol,5 Seats,30000.0,First Owner,999,98.63bhp,20.0,Chennai
3,Manual,Hyundai,2015,₹ 3.98 Lakh,Petrol,5 Seats,59247.0,First Owner,1086,68.05bhp,19.81,Chennai
4,Manual,Honda,2015,₹ 5.50 Lakh,Petrol,5 Seats,50000.0,First Owner,1199,88.7bhp,18.7,Chennai
5,Automatic,Mercedes-Benz,2016,₹ 33 Lakh,Diesel,5 Seats,20000.0,First Owner,2143,204bhp,13.0,Chennai


In [23]:
def clean_data(value):
    # Split the string by space and take the numeric part
    value = value.split(' ')[0]
    # Remove any commas and strip any extra spaces
    value = value.replace(',', '').strip()
    # Handle empty strings by converting them to 0
    if value == '':
        value = 0
    # Convert the cleaned value to float
    return float(value)



In [24]:
combined_df.head()

Unnamed: 0,transmission,model,modelyear,price,fuel_type,seats,kms_driven,ownership,engine,max_power,mileage,place
1,Manual,Maruti,2015,₹ 4.15 Lakh,Petrol,7 Seats,20687.0,First Owner,1196,73bhp,15.37,Chennai
2,Manual,Nissan,2021,₹ 7.50 Lakh,Petrol,5 Seats,30000.0,First Owner,999,98.63bhp,20.0,Chennai
3,Manual,Hyundai,2015,₹ 3.98 Lakh,Petrol,5 Seats,59247.0,First Owner,1086,68.05bhp,19.81,Chennai
4,Manual,Honda,2015,₹ 5.50 Lakh,Petrol,5 Seats,50000.0,First Owner,1199,88.7bhp,18.7,Chennai
5,Automatic,Mercedes-Benz,2016,₹ 33 Lakh,Diesel,5 Seats,20000.0,First Owner,2143,204bhp,13.0,Chennai


In [25]:
combined_df['seats'] = combined_df['seats'].apply(clean_data)

In [26]:
def clean_data1(value):
    # Remove any non-numeric characters except the period
    value = ''.join([char for char in value if char.isdigit() or char == '.'])
    
    # If the value is empty after cleaning, set it to 0
    if value == '':
        value = 0
    
    return float(value)

# Apply the function to the 'max_power' column
combined_df['max_power'] = combined_df['max_power'].apply(clean_data1)


In [27]:
def price_cleaned_data(value):
    # Remove the ₹ symbol and any extra spaces
    value = value.replace('₹', '').strip()
    
    # Split the string to separate the numeric part and the unit ('Lakh' or nothing)
    parts = value.split(' ')
    numeric_value = parts[0].replace(',', '').strip()  # Remove commas
    
    # Handle empty strings
    if numeric_value == '':
        numeric_value = 0
    else:
        numeric_value = float(numeric_value)
    
    # Check if the value is in Lakhs and convert it to the full form
    if len(parts) > 1 and parts[1] == 'Lakh':
        numeric_value *= 100000  # Convert Lakh to full number (4.15 Lakh → 415000)
    
    # Return the cleaned and converted float value
    return numeric_value

# Apply the function to clean the 'price' column
combined_df['price'] = combined_df['price'].apply(price_cleaned_data)


In [28]:
for col in combined_df.columns:
    print('Unique values of ' + col)
    print(combined_df[col].unique())
    print("======================")

Unique values of transmission
['Manual' 'Automatic']
Unique values of model
['Maruti' 'Nissan' 'Hyundai' 'Honda' 'Mercedes-Benz' 'BMW' 'Ford' 'Tata'
 'Jeep' 'Audi' 'Toyota' 'Mahindra' 'Renault' 'Chevrolet' 'Volkswagen'
 'Datsun' 'Kia' 'Fiat' 'Land' 'MG' 'Skoda' 'Isuzu' 'Mini' 'Volvo' 'Jaguar'
 'Citroen' 'Mitsubishi' 'Lexus' 'OpelCorsa' 'Porsche' 'Ambassador']
Unique values of modelyear
[2015 2021 2016 2010 2018 2019 2020 2017 2014 2013 2022 2008 2011 2012
 2007 2009 2023 2002 2005 2003 2004 2006 1998 2001 1985]
Unique values of price
[ 415000.  750000.  398000. ...   35000. 1082000.  105000.]
Unique values of fuel_type
['Petrol' 'Diesel' 'CNG' 'LPG' 'Electric']
Unique values of seats
[ 7.  5.  6.  8.  4. 10.  9.  2.]
Unique values of kms_driven
[ 20687.  30000.  59247. ...  53562.  32190. 151000.]
Unique values of ownership
['First Owner' 'Second Owner' 'Third Owner' 'Fourth Owner' 'Fifth Owner']
Unique values of engine
['1196' '999' '1086' '1199' '2143' '998' '1995' '1497' '1197' '195

In [29]:
combined_df.to_csv('cleaned.csv')

In [30]:
combined_df.head()

Unnamed: 0,transmission,model,modelyear,price,fuel_type,seats,kms_driven,ownership,engine,max_power,mileage,place
1,Manual,Maruti,2015,415000.0,Petrol,7.0,20687.0,First Owner,1196,73.0,15.37,Chennai
2,Manual,Nissan,2021,750000.0,Petrol,5.0,30000.0,First Owner,999,98.63,20.0,Chennai
3,Manual,Hyundai,2015,398000.0,Petrol,5.0,59247.0,First Owner,1086,68.05,19.81,Chennai
4,Manual,Honda,2015,550000.0,Petrol,5.0,50000.0,First Owner,1199,88.7,18.7,Chennai
5,Automatic,Mercedes-Benz,2016,3300000.0,Diesel,5.0,20000.0,First Owner,2143,204.0,13.0,Chennai


In [31]:
combined_df.dtypes

transmission     object
model            object
modelyear         int64
price           float64
fuel_type        object
seats           float64
kms_driven      float64
ownership        object
engine           object
max_power       float64
mileage         float64
place            object
dtype: object

In [32]:
combined_df.to_csv('final.csv')

In [33]:
combined_df['model'].replace(
    ['Maruti', 'Nissan', 'Hyundai', 'Honda', 'Mercedes-Benz', 'BMW', 'Ford', 
     'Tata', 'Jeep', 'Audi', 'Toyota', 'Mahindra', 'Renault', 'Chevrolet', 
     'Volkswagen', 'Datsun', 'Kia', 'Fiat', 'Land', 'MG', 'Skoda', 'Isuzu',
     'Mini', 'Volvo', 'Jaguar', 'Citroen', 'Mitsubishi', 'Lexus', 
     'OpelCorsa', 'Porsche', 'Ambassador'],
    [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,31],
    inplace=True
)


In [34]:
combined_df['transmission'].unique()

array(['Manual', 'Automatic'], dtype=object)

In [35]:
combined_df['transmission'].replace(['Manual', 'Automatic'],[1,2], inplace=True)

In [36]:
combined_df.columns

Index(['transmission', 'model', 'modelyear', 'price', 'fuel_type', 'seats',
       'kms_driven', 'ownership', 'engine', 'max_power', 'mileage', 'place'],
      dtype='object')

In [37]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6837 entries, 1 to 7248
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   transmission  6837 non-null   int64  
 1   model         6837 non-null   int64  
 2   modelyear     6837 non-null   int64  
 3   price         6837 non-null   float64
 4   fuel_type     6837 non-null   object 
 5   seats         6837 non-null   float64
 6   kms_driven    6837 non-null   float64
 7   ownership     6837 non-null   object 
 8   engine        6837 non-null   object 
 9   max_power     6837 non-null   float64
 10  mileage       6837 non-null   float64
 11  place         6837 non-null   object 
dtypes: float64(5), int64(3), object(4)
memory usage: 694.4+ KB


In [38]:
combined_df['fuel_type'].unique()

array(['Petrol', 'Diesel', 'CNG', 'LPG', 'Electric'], dtype=object)

In [39]:
combined_df['fuel_type'].replace(['Petrol', 'Diesel', 'CNG', 'LPG', 'Electric'],[1,2,3,4,5], inplace=True)

In [40]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6837 entries, 1 to 7248
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   transmission  6837 non-null   int64  
 1   model         6837 non-null   int64  
 2   modelyear     6837 non-null   int64  
 3   price         6837 non-null   float64
 4   fuel_type     6837 non-null   int64  
 5   seats         6837 non-null   float64
 6   kms_driven    6837 non-null   float64
 7   ownership     6837 non-null   object 
 8   engine        6837 non-null   object 
 9   max_power     6837 non-null   float64
 10  mileage       6837 non-null   float64
 11  place         6837 non-null   object 
dtypes: float64(5), int64(4), object(3)
memory usage: 694.4+ KB


In [41]:
combined_df.reset_index(inplace=True)

In [42]:
combined_df

Unnamed: 0,index,transmission,model,modelyear,price,fuel_type,seats,kms_driven,ownership,engine,max_power,mileage,place
0,1,1,1,2015,415000.0,1,7.0,20687.0,First Owner,1196,73.00,15.37,Chennai
1,2,1,2,2021,750000.0,1,5.0,30000.0,First Owner,999,98.63,20.00,Chennai
2,3,1,3,2015,398000.0,1,5.0,59247.0,First Owner,1086,68.05,19.81,Chennai
3,4,1,4,2015,550000.0,1,5.0,50000.0,First Owner,1199,88.70,18.70,Chennai
4,5,2,5,2016,3300000.0,2,5.0,20000.0,First Owner,2143,204.00,13.00,Chennai
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6832,7244,1,1,2022,510000.0,1,5.0,10000.0,First Owner,998,65.71,25.24,Kolkata
6833,7245,1,1,2014,180000.0,1,5.0,120000.0,First Owner,796,47.30,22.74,Kolkata
6834,7246,2,5,2011,550000.0,1,5.0,50000.0,Third Owner,1796,186.00,11.74,Kolkata
6835,7247,1,1,2012,140000.0,1,5.0,40000.0,First Owner,1197,85.80,18.50,Kolkata


In [43]:
combined_df['ownership'].unique()

array(['First Owner', 'Second Owner', 'Third Owner', 'Fourth Owner',
       'Fifth Owner'], dtype=object)

In [44]:
combined_df['ownership'].replace(['First Owner', 'Second Owner', 'Third Owner',
       'Fourth Owner', 'Fifth Owner'],
                           [1,2,3,4,5], inplace=True)

In [45]:
combined_df['place'].unique()

array(['Chennai', 'Bangalore', 'Delhi', 'Hyderabad', 'Kolkata'],
      dtype=object)

In [46]:
combined_df['place'].replace(['Chennai', 'Bangalore', 'Delhi', 'Hyderabad', 'Kolkata'],
                           [1,2,3,4,5], inplace=True)

In [47]:
combined_df.drop(columns=['index'], inplace=True)

In [48]:
for col in combined_df.columns:
    print('------------')
    print(col)
    print(combined_df[col].unique())

------------
transmission
[1 2]
------------
model
[ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 30 31]
------------
modelyear
[2015 2021 2016 2010 2018 2019 2020 2017 2014 2013 2022 2008 2011 2012
 2007 2009 2023 2002 2005 2003 2004 2006 1998 2001 1985]
------------
price
[ 415000.  750000.  398000. ...   35000. 1082000.  105000.]
------------
fuel_type
[1 2 3 4 5]
------------
seats
[ 7.  5.  6.  8.  4. 10.  9.  2.]
------------
kms_driven
[ 20687.  30000.  59247. ...  53562.  32190. 151000.]
------------
ownership
[1 2 3 4 5]
------------
engine
['1196' '999' '1086' '1199' '2143' '998' '1995' '1497' '1197' '1956'
 '1198' '1248' '1582' '1984' '2694' '1498' '1298' '1461' '1798' '1364'
 '1591' '1968' '2967' '2393' '2993' '1368' '1373' '1598' '2179' '1462'
 '1451' '2198' '1194' '1991' '1493' '1998' '4663' '2987' '796' '1186'
 '1799' '1950' '1496' '2925' '1396' '1353' '799' '1405' '2755' '814'
 '2494' '1399' '1997' '1047' '2999' '1499' '1896' '1

In [49]:
combined_df.isnull().sum()

transmission    0
model           0
modelyear       0
price           0
fuel_type       0
seats           0
kms_driven      0
ownership       0
engine          0
max_power       0
mileage         0
place           0
dtype: int64

In [50]:
input_data = combined_df.drop(columns=['price'])
output_data =combined_df['price']

In [51]:
x_train, x_test, y_train, y_test = train_test_split(input_data, output_data, test_size=0.2)

In [52]:
model = LinearRegression()

In [53]:
model.fit(x_train, y_train)

In [54]:
predict = model.predict(x_test)

In [55]:
predict

array([ 993022.85813805, 1013374.44612563,  239829.71532235, ...,
       -359588.14110288,  391669.73183075,  892594.95227927])

In [56]:
x_train.head(1)


Unnamed: 0,transmission,model,modelyear,fuel_type,seats,kms_driven,ownership,engine,max_power,mileage,place
2041,2,11,2015,2,7.0,120000.0,1,2982,168.5,12.55,2


In [57]:
input_data_model = pd.DataFrame(
    [[1,4,2017,1,5.0,46375.0,2,1198,86.8,18.5,1]],
    columns=['transmission', 'model', 'modelyear', 'fuel_type', 'seats', 'kms_driven', 'ownership', 'engine', 'max_power', 'mileage','place'])

In [58]:
input_data_model

Unnamed: 0,transmission,model,modelyear,fuel_type,seats,kms_driven,ownership,engine,max_power,mileage,place
0,1,4,2017,1,5.0,46375.0,2,1198,86.8,18.5,1


In [59]:
model.predict(input_data_model)

array([585138.03152236])

In [61]:
pk.dump(model,open('model.pkl','wb'))