## Importing


In [0]:
import pandas as pd
import numpy as np


In [0]:
# Read the training CSV; the first column of the file becomes the row index
d2 = pd.read_csv('train data.csv',index_col=0)

In [0]:
# data understanding
d2.head(5)

In [0]:
# Rename every column positionally to identifier-friendly names (units moved
# into the name). The list has to supply one name per existing column, in order.
d2.columns = ['Name','Location','Year','Kilometers_Driven','Fuel_Type','Transmission','Owner_Type','Mileage',
                    'Engine_CC','Power_bhp','Seats','New_Price_Lakh','Price_Lakh']

## Data Analysis

In [0]:
#top 5 rows
d2.head(5)

In [0]:
# bottom 5 rows
d2.tail(5)

In [0]:
# shape of data
d2.shape

In [0]:




#columns of dataset
d2.columns

In [0]:
d2.info()

In [0]:
#size of dataset
d2.size

In [0]:
# datatypes of different attributes
d2.dtypes

In [0]:
# total number of columns per datatype in dataset
# (DataFrame.get_dtype_counts() was removed in pandas 1.0; counting the
# entries of .dtypes gives the same table)
d2.dtypes.value_counts()

In [0]:
# number of entities in each column
d2.count()

## Description


In [0]:
d2.describe()

In [0]:
d2.info()

## Handling Missing Values

In [0]:
# checking the missing values
d2.isnull()

In [0]:
# sum of all null values
d2.isnull().sum()

In [0]:
# dropping duplicate rows: drop_duplicates() returns a new frame, so the
# result has to be assigned back for the removal to take effect
d2 = d2.drop_duplicates()
d2.shape

## Transformation of Data using functions

In [0]:
# converting float data of New_Price column to String
str1 = {'New_Price_Lakh': str }

In [0]:
d2 = d2.astype(str1)

In [0]:
# function to change price value from crore to lakhs
def crore(x):
    """Normalise a price string such as "1.2 Cr" or "8.61 Lakh" to Lakh.

    The first space-separated token is parsed as a float. A value tagged
    "Cr" is multiplied by 100 and labelled "Lakh"; a value tagged "Lakh"
    keeps its magnitude; any other input comes back as the bare float
    rendered as a string, without a unit suffix.
    """
    amount = float(x.split(' ')[0])
    if "Cr" in x:
        amount = amount * 100
    elif "Lakh" not in x:
        # no recognised unit: hand back only the number
        return str(amount)
    return str(amount) + " " + "Lakh"

In [0]:
# Applying New_Price(in Lakh) column values in crore()
d2['New_Price_Lakh'] = d2['New_Price_Lakh'].apply(crore)

In [0]:
# removing "Lakh" from column New_Price
def clean_currency(x):
    """Strip the "Lakh" unit and comma separators from a price string.

    Anything that is not a string is returned exactly as received.
    """
    if not isinstance(x, str):
        return x
    for unwanted in ('Lakh', ','):
        x = x.replace(unwanted, '')
    return x

In [0]:
# updating New_Price(in Lakh) column values in data set
d2['New_Price_Lakh'] = d2['New_Price_Lakh'].apply(clean_currency).astype('float')

### Engine

In [0]:
# removing "CC" from Engine column 
def clean_CC(y):
    """Remove the "CC" unit and commas from an engine-size string.

    Non-string values pass through unchanged.
    """
    return y.replace('CC', '').replace(',', '') if isinstance(y, str) else y

In [0]:
# applying clean_CC() to values of engine column
d2['Engine_CC'] = d2['Engine_CC'].apply(clean_CC).astype('float')


In [0]:
# removing "bhp" from Power column 
def clean_bhp(z):
    """Drop the "bhp" unit and any commas from a power reading.

    Only strings are rewritten; every other value is returned as is.
    """
    if not isinstance(z, str):
        return z
    without_unit = z.replace('bhp', '')
    return without_unit.replace(',', '')

In [0]:
# applying clean_bhp() to values of the Power_bhp column
# (values are still strings afterwards; the numeric conversion is done
# separately with pd.to_numeric in the Type Conversion section)
d2['Power_bhp'] = d2['Power_bhp'].apply(clean_bhp)


### Mileage

In [0]:
# Transforming the Mileage from kmkg to kml

# converting data type of Mileage in string
str2 = {'Mileage': str } 

In [0]:
d2 = d2.astype(str2)

In [0]:
d2.isin([0]).sum()

In [0]:
# function to change km/kg to kmpl
def kml(x, fuel_type=None):
    """Express a mileage string in kmpl.

    Parameters
    ----------
    x : str
        Mileage text whose first space-separated token is the number,
        e.g. "26.6 km/kg" or "19.67 kmpl".
    fuel_type : str, optional
        Fuel of the car the reading belongs to. "LPG" selects the 1.36
        divisor; any other value (including the default None, which is
        what Series.apply(kml) passes) selects the CNG divisor 2.

    Returns
    -------
    str
        "<number> kmpl" for km/kg and kmpl inputs. Text carrying neither
        unit (for example "nan") is returned unchanged so that a later
        numeric conversion can decide how to treat it.

    Notes
    -----
    This function no longer reads the global frame ``d2``; the fuel type
    must be supplied by the caller when it matters.
    """
    number = x.split(' ')[0]
    if 'km/kg' in x:
        divisor = 1.36 if fuel_type == 'LPG' else 2
        # NOTE(review): floor division is kept from the original code and
        # discards the fractional part - confirm that `/` was not intended.
        return str(float(number) // divisor) + " " + "kmpl"
    if 'kmpl' in x:
        return str(float(number)) + " " + "kmpl"
    return x
            


In [0]:
# Applying Mileage column values in kml()
d2['Mileage'] = d2['Mileage'].apply(kml)

In [0]:
# removing "kmpl" from column
def clean_mileage(k):
    """Remove the "kmpl" unit and commas from a mileage string.

    Values that are not strings (e.g. None or floats) are returned untouched.
    """
    if isinstance(k, str):
        # splitting on the unit and re-joining deletes every occurrence of it
        unit_free = ''.join(k.split('kmpl'))
        return unit_free.replace(',', '')
    return k

In [0]:
# updating Mileage column values in data set
d2['Mileage'] = d2['Mileage'].apply(clean_mileage)

In [0]:
d2.head()

## Type Conversion

In [0]:
#type conversion of Power(in bhp) from object type to float data type
d2['Power_bhp'] = pd.to_numeric(d2['Power_bhp'],errors='coerce')

In [0]:
# type conversion of Mileage from object type to float
d2['Mileage'] = pd.to_numeric(d2['Mileage'],errors='coerce')

In [0]:
d2.dtypes

In [0]:
# Checking the NaN values of attributes
d2.isna().sum()

## Data Cleaning

#### Engine

In [0]:
# A zero engine size is treated as missing. replace() returns a new Series,
# so it is assigned back; otherwise the zeros stay in d2.
d2['Engine_CC'] = d2['Engine_CC'].replace(0, np.nan)

In [0]:
# Fill missing engine sizes with the column median. Assigning back (instead
# of inplace=True on an attribute-accessed Series) guarantees d2 is updated,
# including when pandas Copy-on-Write is active.
d2['Engine_CC'] = d2['Engine_CC'].fillna(d2['Engine_CC'].median())

In [0]:
d2.Engine_CC.isna().sum()

#### Mileage

In [0]:
# A zero mileage is treated as missing. replace() returns a new Series,
# so it is assigned back; otherwise the zeros stay in d2.
d2['Mileage'] = d2['Mileage'].replace(0, np.nan)

In [0]:
# Fill missing mileage values with the column median. Assigning back (instead
# of inplace=True on an attribute-accessed Series) guarantees d2 is updated,
# including when pandas Copy-on-Write is active.
d2['Mileage'] = d2['Mileage'].fillna(d2['Mileage'].median())

In [0]:
d2.Mileage.isna().sum()

#### Power

In [0]:
# A zero power figure is treated as missing. replace() returns a new Series,
# so it is assigned back; otherwise the zeros stay in d2.
d2['Power_bhp'] = d2['Power_bhp'].replace(0, np.nan)

In [0]:
# Fill missing power values with the median of Power_bhp itself (the earlier
# code used the Mileage median, which is a different quantity and unit).
# Assigning back guarantees d2 is actually updated.
d2['Power_bhp'] = d2['Power_bhp'].fillna(d2['Power_bhp'].median())

In [0]:
d2.Power_bhp.isna().sum()

#### Seats

In [0]:
# A seat count of zero is treated as missing. replace() returns a new Series,
# so it is assigned back; otherwise the zeros stay in d2.
d2['Seats'] = d2['Seats'].replace(0, np.nan)

In [0]:
# Fill missing seat counts with the median of Seats itself (the earlier code
# used the Mileage median, which would insert a mileage figure as a seat count).
# Assigning back guarantees d2 is actually updated.
d2['Seats'] = d2['Seats'].fillna(d2['Seats'].median())

In [0]:
d2.Seats.isna().sum()

## Visualising the data

#### Name

In [0]:
d2['Name'].value_counts().plot(kind='barh')

#### Location

In [0]:
d2['Location'].value_counts().plot(kind='pie')
d2['Location'].value_counts()

#### Seats

In [0]:
d2['Seats'].value_counts().plot(kind='bar')
d2['Seats'].value_counts()

#### Year

In [0]:
d2['Year'].value_counts().plot(kind='pie')
d2['Year'].value_counts()

#### Kilometers driven

In [0]:
d2['Kilometers_Driven'].plot.hist(bins=300)

#### Fuel Type

In [0]:
d2['Fuel_Type'].value_counts().plot(kind='pie')
d2['Fuel_Type'].value_counts()

#### Transmission

In [0]:
d2['Transmission'].value_counts().plot(kind='pie')
d2['Transmission'].value_counts()

#### Owner Type

In [0]:
d2['Owner_Type'].value_counts().plot(kind='bar')
d2['Owner_Type'].value_counts()

# Cleaned Data

In [0]:
d2.head()