In [18]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from matplotlib import style
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

%matplotlib inline

In [2]:
# Read the vehicle dataset CSV into a DataFrame.
csv_path = "../input/vehicle-dataset-from-cardekho/car data.csv"
car_data = pd.read_csv(csv_path)

In [3]:
# Preview the first five rows (head() defaults to n=5).
car_data.head()

In [4]:
# Print the frame summary produced by DataFrame.info() for a first look at the dataset.
car_data.info()

In [5]:
# Count the missing values in each column (isna is the same check as isnull).
car_data.isna().sum()

In [6]:
# Summary statistics per column. With default arguments describe() reports
# only the numeric columns, so the categorical ones do not appear here.
numeric_summary = car_data.describe()
numeric_summary

In [7]:
# List every column name so the categorical ones can be picked out.
column_names = car_data.columns
column_names

In [12]:
# Number of rows for each distinct value of the Fuel_Type column.
fuel_type_counts = car_data["Fuel_Type"].value_counts()
fuel_type_counts

In [13]:
# Number of rows for each distinct value of the Car_Name column.
car_name_counts = car_data["Car_Name"].value_counts()
car_name_counts

In [15]:
# Print the value counts of each text column, each followed by a separator line.
for column in ("Car_Name", "Fuel_Type", "Transmission", "Seller_Type"):
    print(car_data[column].value_counts())
    print("======================================")

In [16]:
# Pull out the columns used by the bar plots in the following cells.
seller_type, transmission, fuel_type, selling_price = (
    car_data[column]
    for column in ("Seller_Type", "Transmission", "Fuel_Type", "Selling_Price")
)

In [22]:
# Bar charts of the maximum selling price within each category.
#
# The maximum is computed explicitly per category. Handing the raw columns to
# plt.bar draws one bar per row, all stacked at the same x position for a
# given category, so the chart only shows the maximum as a side effect of the
# tallest bar covering the others.
style.use("ggplot")
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
fig.suptitle("visulization categorical data columns")

panels = [
    (fuel_type, "Fuel_Type", "royalblue"),
    (transmission, "transmission", "purple"),
    (seller_type, "seller_type", "red"),
]
for ax, (category, label, bar_color) in zip(axes, panels):
    # sort=False keeps the categories in order of first appearance, which is
    # the order matplotlib assigns when it is given the raw column.
    max_price = selling_price.groupby(category, sort=False).max()
    ax.bar(max_price.index, max_price.values, color=bar_color)
    ax.set_xlabel(label)
    ax.set_ylabel("Selling_price")

plt.show()


In [24]:
# Mean selling price per category, one panel per categorical column.
# seaborn's barplot aggregates with the mean by default; the panels share a y axis.
fig, axes = plt.subplots(nrows=1, ncols=3, sharey=True, figsize=(15, 5))
fig.suptitle("visulization categorical data columns")
fuel_ax, transmission_ax, seller_ax = axes
sns.barplot(ax=fuel_ax, x=fuel_type, y=selling_price)
sns.barplot(ax=transmission_ax, x=transmission, y=selling_price)
sns.barplot(ax=seller_ax, x=seller_type, y=selling_price)


In [25]:
# Keep only the rows whose Fuel_Type is "Petrol".
rows_by_fuel = car_data.groupby("Fuel_Type")
petrol_data = rows_by_fuel.get_group("Petrol")

In [26]:
# Summary statistics restricted to the petrol rows.
petrol_summary = petrol_data.describe()
petrol_summary

In [29]:
# Encode the categorical columns as numbers for the regression.
#
# Fuel_Type gets fixed integer codes (the same mapping as before). Series.map
# is used instead of an in-place DataFrame.replace so that the frame is not
# mutated in place and the result is a real integer column that does not rely
# on replace's deprecated silent downcasting of object columns.
fuel_type_codes = {"Petrol": 0, "Diesel": 1, "CNG": 2}
fuel_type_encoded = car_data["Fuel_Type"].map(fuel_type_codes)
# map() turns any label missing from the mapping into NaN; fail loudly here
# rather than letting a missing value reach the model.
assert fuel_type_encoded.notna().all(), "unexpected Fuel_Type category"
car_data = car_data.assign(Fuel_Type=fuel_type_encoded.astype(int))

# One-hot encode the remaining categoricals, dropping the first level of each
# to avoid a redundant column. dtype=int keeps the dummies as 0/1 integers on
# pandas versions where get_dummies defaults to bool.
car_data = pd.get_dummies(
    car_data,
    columns=["Transmission", "Seller_Type"],
    drop_first=True,
    dtype=int,
)

In [30]:
# Preview the first five rows of the encoded frame.
car_data.head(5)

In [33]:
# Heat map of pairwise correlations, used to see which features are related
# to the selling price so that weakly correlated ones can be considered for
# removal.
#
# Car_Name is still in the frame at this point and is a text label, not a
# number. Recent pandas versions raise on non-numeric columns in corr()
# instead of silently skipping them, so it is dropped explicitly first.
numeric_data = car_data.drop(columns=["Car_Name"])

plt.figure(figsize=(10, 5))
sns.heatmap(numeric_data.corr(), annot=True)
plt.title("correlation between columns")
plt.show()

In [34]:
# Correlation matrix as a table. Car_Name (a text label) is excluded
# explicitly because recent pandas versions raise on non-numeric columns in
# corr() rather than silently skipping them.
car_data.drop(columns=["Car_Name"]).corr()

In [36]:
# Scatter plus fitted regression line: selling price against the seller-type dummy.
fig, ax = plt.subplots(figsize=(7, 5))
ax.set_title("correlation between selling price and seller type")
sns.regplot(data=car_data, x="Seller_Type_Individual", y="Selling_Price", ax=ax)

In [37]:
# Scatter plus fitted regression line: selling price against present price.
fig, ax = plt.subplots(figsize=(7, 5))
ax.set_title("correlation between selling price and Present_Price")
sns.regplot(data=car_data, x="Present_Price", y="Selling_Price", ax=ax)

In [39]:
# Target: the selling price column.
y = car_data["Selling_Price"]
# Features: every other column except the car name.
X = car_data.drop(columns=["Car_Name", "Selling_Price"])

In [40]:
# Report the dimensions of the feature matrix and the target vector.
for label, data in (("X shape", X), ("y shape", y)):
    print(label, data.shape)

In [42]:
# Hold out 30% of the rows for evaluation. random_state pins the shuffle so
# the split, and every metric computed from it, is the same on each
# Restart & Run All (42 is an arbitrary fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [44]:
# Report the dimensions of the train and test feature matrices.
for label, split in (("X_train shape", X_train), ("X_test shape", X_test)):
    print(label, split.shape)

In [45]:
# Standardising scaler for the features (centre on the mean, scale to unit
# variance); both options are spelled out at their default values.
scaler = StandardScaler(with_mean=True, with_std=True)

In [46]:
# Fit the scaler on the training split only, then apply those same training
# statistics to the test split. Calling fit_transform on X_test would re-fit
# the scaler on the test rows, so the two splits would be standardised with
# different means and variances and the model would be evaluated on features
# scaled differently from the ones it was trained on.
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [47]:
# Ordinary least-squares linear regression; fit_intercept is shown at its default.
model = LinearRegression(fit_intercept=True)

In [49]:
# Fit the regression on the scaled training features and their selling prices.
model.fit(X=X_train, y=y_train)

In [50]:
# Predict selling prices for the held-out test features.
y_pred = model.predict(X=X_test)

In [59]:
# Mean absolute error on the held-out set. Arguments are passed in
# scikit-learn's (y_true, y_pred) order, matching the MSE and R2 cells.
print("MAE : ", mean_absolute_error(y_test, y_pred))

In [60]:
# Mean squared error between the true and predicted selling prices.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print("MSE : ", mse)

In [61]:
# R^2 (coefficient of determination) is the proportion of the variance in the
# target that the model explains. 1.0 is a perfect fit, 0.0 is what a model
# that always predicts the mean of y_test would score, and the value goes
# negative when the model does worse than that.
print("r2_score : ", r2_score(y_test, y_pred))

In [63]:
# Predicted values (x) against actual values (y) with a fitted regression line.
fig, ax = plt.subplots(figsize=(7, 5))
ax.set_title("correlation between actual and predicted  values")
sns.regplot(x=y_pred, y=y_test, ax=ax)