In [None]:
import numpy as np
import pandas as pd


In [None]:
# Read quikr_car.csv (path relative to the working directory) into `df`
# and show the first rows as the cell output.
df = pd.read_csv("quikr_car.csv")
df.head()

In [None]:
# Print pandas' summary of the freshly loaded frame (before any cleaning).
df.info()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Show the column labels of the loaded frame.
df.columns

In [None]:
# Keep only the rows whose 'year' text is made up entirely of numeric characters.
df = df.loc[df['year'].str.isnumeric()]

In [None]:
# Convert 'year' from text to int.
# `assign` returns a new frame instead of writing a column into the result of
# the earlier boolean filter, which avoids pandas' SettingWithCopyWarning.
df = df.assign(year=df['year'].astype(int))

In [None]:
# Drop the rows whose 'Price' is the placeholder text rather than a number.
df = df[df['Price'].ne("Ask For Price")]

In [None]:
# Strip the thousands separators from 'Price' and convert it to int.
# `assign` builds a new frame rather than setting a column on a filtered
# frame, so no SettingWithCopyWarning is raised.
df = df.assign(Price=df['Price'].str.replace(',', '').astype(int))

In [None]:
# Keep the text before the first space in 'kms_driven' and remove commas from it.
# The result is still a string column; it is cast to int in a later cell.
# `assign` avoids writing into a filtered frame (SettingWithCopyWarning).
df = df.assign(
    kms_driven=df['kms_driven'].str.split(' ').str.get(0).str.replace(',', '')
)

In [None]:
# Keep only the rows whose cleaned 'kms_driven' text is purely numeric.
df = df.loc[df['kms_driven'].str.isnumeric()]

In [None]:
# Convert 'kms_driven' from text to int.
# `assign` returns a new frame, so the column is not set on the output of the
# preceding boolean filter (which would raise SettingWithCopyWarning).
df = df.assign(kms_driven=df['kms_driven'].astype(int))

In [None]:
# Discard the rows that have no 'fuel_type' value.
df = df[df['fuel_type'].notna()]

In [None]:
# Print the frame summary again, now that the year / Price / kms_driven /
# fuel_type cleaning cells above have run.
df.info()

In [None]:
# Reduce 'name' to its first three space-separated words.
# `assign` is used so the column is not written into a filtered frame, which
# would raise SettingWithCopyWarning.
df = df.assign(name=df['name'].str.split(' ').str.slice(0, 3).str.join(' '))

In [None]:
# Renumber the rows from 0 after the filtering above; drop=True discards the
# old index instead of keeping it as a column.
df = df.reset_index(drop=True)

In [None]:
# Summary statistics of the cleaned frame.
df.describe()

In [None]:
# Keep only the rows priced below 6e6, then renumber the remaining rows from 0.
df = df[df['Price'].lt(6e6)].reset_index(drop=True)

In [None]:
# Save the cleaned frame. The index is a plain 0..n-1 counter after
# reset_index, so it is left out; otherwise it comes back as an extra
# "Unnamed: 0" column when the file is read again.
df.to_csv('cleaned.csv', index=False)

In [None]:
# Show the column labels of the cleaned frame.
df.columns

In [None]:
# Collect the distinct values of each listed column, in the order the columns are listed.
list1 = ['name', 'company', 'year', 'Price', 'kms_driven', 'fuel_type']
uniques = [df[column].unique() for column in list1]

In [None]:
# Build one row per source column, labelled by column name, then transpose so
# each column of unique_df holds that column's distinct values. The arrays have
# different lengths, so shorter columns are padded with missing values.
unique_df = pd.DataFrame(
    uniques,
    index=['name', 'company', 'year', 'Price', 'kms_driven', 'fuel_type'],
).T

In [None]:
# Write the per-column distinct values to unique_values.csv without the row index.
unique_df.to_csv("unique_values.csv", index=False)

### **Create Model**

In [None]:
# 'Price' is the prediction target; every other column is a feature.
y = df['Price']
X = df.drop('Price', axis=1)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for testing. The seed is fixed so that the split,
# and the score computed from it below, are the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Standalone one-hot encoder with default settings.
# NOTE(review): the pipeline cells below construct their own OneHotEncoder, so
# this object does not appear to feed the model — confirm before relying on it.
encoder = OneHotEncoder()

In [None]:
# Fit the standalone encoder on the three text columns of the full feature frame X.
encoder.fit(X[['name','company','fuel_type']])

In [None]:
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

In [None]:
# One-hot encode the three text columns, ignoring categories that were not seen
# during fit; all remaining columns are passed through unchanged.
column_trans = make_column_transformer(
    (
        OneHotEncoder(handle_unknown='ignore'),
        ['name', 'company', 'fuel_type'],
    ),
    remainder='passthrough',
)

In [None]:
# Linear regression estimator with default settings.
model = LinearRegression()

In [None]:
# Chain the column transformer and the regressor so that encoding and fitting
# happen in a single estimator.
pipe = make_pipeline(column_trans, model)

In [None]:
# Fit the transformer and the regressor on the training split.
pipe.fit(X_train, y_train)

In [None]:
# Predict prices for the held-out rows.
y_pred = pipe.predict(X_test)

In [None]:
# R2 of the predictions against the held-out prices (shown as the cell output).
r2_score(y_test, y_pred)

In [None]:
# Repeat the 80/20 split under 1000 different seeds, refitting the pipeline each
# time, and remember the seed that produced the highest test R2.
# NOTE(review): picking the seed by its test-set score is selection on the test
# set, so `r2_max` is an optimistic figure. The mean over all splits printed at
# the end is the less biased estimate of how the model generalises.
r2_max = float('-inf')  # R2 can be negative; starting at 0 would hide that case
best_i = 0
r2_scores = []
for i in range(1000):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=i)
    model = LinearRegression()
    pipe = make_pipeline(column_trans, model)
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    r2_scores.append(r2)
    if r2 > r2_max:
        r2_max = r2
        best_i = i
print(f"Best R2 score: {r2_max} at random state: {best_i}")
print(f"Mean R2 over all splits: {np.mean(r2_scores)} (std: {np.std(r2_scores)})")

In [None]:
# Recreate the split chosen by the seed search and refit on it. `pipe` is left
# as the fitted pipeline that the following cells save and query.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=best_i
)
model = LinearRegression()
pipe = make_pipeline(column_trans, model).fit(X_train, y_train)
y_pred = pipe.predict(X_test)
r2_score(y_test, y_pred)

In [None]:
import pickle

In [None]:
# Serialise the fitted pipeline to disk. The `with` block closes the file
# handle (flushing the data) even if pickling raises.
with open("LinearRegressionModel.pkl", 'wb') as model_file:
    pickle.dump(pipe, model_file)

In [None]:
# Single-row frame with the same feature columns the pipeline was trained on,
# used as a quick check that the fitted pipeline returns a prediction.
new_data = pd.DataFrame(
    {
        'name': ['Maruti Suzuki Swift'],
        'company': ['Maruti'],
        'year': [2019],
        'kms_driven': [100],
        'fuel_type': ['Petrol'],
    }
)
pipe.predict(new_data)