In [126]:
pip install word2number

In [127]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from word2number import w2n

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [128]:
# Load the automobile dataset from the Kaggle input directory.
AUTOMOBILE_CSV_PATH = '../input/automobile-dataset/Automobile_data.csv'
automobiles = pd.read_csv(AUTOMOBILE_CSV_PATH)

# Exploring the dataset

In [129]:
# Preview the first rows to get a feel for the columns and their raw values.
automobiles.head()

In [130]:
# Summary statistics (count, mean, std, quartiles) for the frame.
automobiles.describe()

In [131]:
# Column names, non-null counts and dtypes of the loaded frame.
automobiles.info()

# Data Cleaning

In [132]:
# Remove pandas' display limits so that every row and column of a frame is shown.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

#### Missing values are marked with `?` in this dataset; replace them with NaN

In [133]:
# Turn the '?' placeholder into a real missing value so pandas' null handling
# (isnull, fillna, mean, ...) recognises it.
# np.nan is used because the upper-case alias np.NAN was removed in NumPy 2.0.
automobiles = automobiles.replace('?', np.nan)

In [134]:
# Count the missing values in each column.
automobiles.isnull().sum()

In [135]:
# Inspect the dtype pandas assigned to each column.
automobiles.dtypes

In [136]:
# Split the column names into object-typed columns and everything else.
is_object_column = automobiles.dtypes == 'object'
numerical_vars = automobiles.columns[~is_object_column]
categorical_vars = automobiles.columns[is_object_column]
print(numerical_vars)
print(categorical_vars)

### Changing Datatypes of num-of-doors, bore, stroke, horsepower, peak-rpm, num-of-cylinders and price

#### The values of num-of-doors and num-of-cylinders are written as words, so we convert them to their equivalent numerical values

In [137]:
# Show how often each spelled-out value occurs. Only the last expression of a
# cell is rendered automatically, so the counts are printed explicitly;
# otherwise both results would be silently discarded.
print(automobiles['num-of-doors'].value_counts())
print(automobiles['num-of-cylinders'].value_counts())

# Mark missing entries with the word 'zero' so they go through the same
# word-to-number conversion as the real values and can be recognised as 0 later.
automobiles['num-of-doors'] = automobiles['num-of-doors'].fillna('zero')
automobiles['num-of-cylinders'] = automobiles['num-of-cylinders'].fillna('zero')

In [138]:
# Convert the spelled-out door counts (including the 'zero' placeholder) to numbers.
# Values that are not strings are passed through unchanged so the cell can be
# re-run after the column has already been converted; w2n.word_to_num raises
# on non-string input.
automobiles['num-of-doors'] = automobiles['num-of-doors'].apply(
    lambda value: w2n.word_to_num(value) if isinstance(value, str) else value
)

In [139]:
# Convert the spelled-out cylinder counts (including the 'zero' placeholder) to numbers.
# Values that are not strings are passed through unchanged so the cell can be
# re-run after the column has already been converted; w2n.word_to_num raises
# on non-string input.
automobiles['num-of-cylinders'] = automobiles['num-of-cylinders'].apply(
    lambda value: w2n.word_to_num(value) if isinstance(value, str) else value
)

#### Converting to float for analysis

In [140]:
# Columns that contain numbers but are stored as strings.
objects_to_float = ['normalized-losses', 'bore', 'stroke', 'horsepower', 'peak-rpm', 'price']
for object_to_float in objects_to_float:
    # pd.to_numeric parses the text as written, so decimal values such as '3.47'
    # stay 3.47. Stripping every non-digit character would also remove the
    # decimal point and turn '3.47' into 347.0.
    # NaN entries are preserved, and the call also works on an already-numeric
    # column, so the cell can be re-run. Unparseable text raises instead of
    # being silently altered.
    automobiles[object_to_float] = pd.to_numeric(automobiles[object_to_float]).astype(float)
    

### Accounting for missing and NULL values

In [141]:
# Replace the 0 placeholder in num-of-doors with the most frequent value (the mode).
number_of_doors_Mode = automobiles['num-of-doors'].mode()
# Assign the result back to the column. Calling replace(..., inplace=True) on
# automobiles['num-of-doors'] acts on an intermediate object: it emits a
# chained-assignment warning in pandas 2.x and leaves the frame unchanged under
# Copy-on-Write.
automobiles['num-of-doors'] = automobiles['num-of-doors'].replace(0, number_of_doors_Mode[0])

In [142]:
# Check the distribution of door counts.
automobiles['num-of-doors'].value_counts()

In [143]:
# Replace the 0 placeholder in num-of-cylinders with the most frequent value (the mode).
num_of_cylinders_Mode = automobiles['num-of-cylinders'].mode()
# Assign the result back to the column. Calling replace(..., inplace=True) on
# automobiles['num-of-cylinders'] acts on an intermediate object: it emits a
# chained-assignment warning in pandas 2.x and leaves the frame unchanged under
# Copy-on-Write.
automobiles['num-of-cylinders'] = automobiles['num-of-cylinders'].replace(0, num_of_cylinders_Mode[0])

In [144]:
# Check the distribution of cylinder counts.
automobiles['num-of-cylinders'].value_counts()

In [145]:
# Display the num-of-cylinders column.
automobiles['num-of-cylinders']

In [146]:
# Impute missing bore values with the column mean, rounded to 2 decimals.
bore_mean = round(automobiles['bore'].mean(), 2)
# Assign back rather than using fillna(..., inplace=True) on the selected column:
# the chained in-place call warns in pandas 2.x and is a no-op under Copy-on-Write.
automobiles['bore'] = automobiles['bore'].fillna(bore_mean)

In [147]:
# Impute missing stroke values with the column mean, rounded to 2 decimals.
stroke_mean = round(automobiles['stroke'].mean(), 2)
# Assign back rather than using fillna(..., inplace=True) on the selected column:
# the chained in-place call warns in pandas 2.x and is a no-op under Copy-on-Write.
automobiles['stroke'] = automobiles['stroke'].fillna(stroke_mean)

In [148]:
# Impute missing horsepower values with the column mean, rounded to a whole number.
horse_power_mean = round(automobiles['horsepower'].mean(), 0)
# Assign back rather than using fillna(..., inplace=True) on the selected column:
# the chained in-place call warns in pandas 2.x and is a no-op under Copy-on-Write.
automobiles['horsepower'] = automobiles['horsepower'].fillna(horse_power_mean)

In [149]:
# Impute missing peak-rpm values with the column mean, rounded to a whole number.
peak_rpm_mean = round(automobiles['peak-rpm'].mean(), 0)
print(peak_rpm_mean)
# Assign back rather than using fillna(..., inplace=True) on the selected column:
# the chained in-place call warns in pandas 2.x and is a no-op under Copy-on-Write.
automobiles['peak-rpm'] = automobiles['peak-rpm'].fillna(peak_rpm_mean)

In [150]:
# Impute missing price values with the column mean, rounded to a whole number.
price_mean = round(automobiles['price'].mean(), 0)
print(price_mean)
# Assign back rather than using fillna(..., inplace=True) on the selected column:
# the chained in-place call warns in pandas 2.x and is a no-op under Copy-on-Write.
automobiles['price'] = automobiles['price'].fillna(price_mean)

In [151]:
# Impute missing normalized-losses values with the column mean, rounded to a whole number.
losses_mean = round(automobiles['normalized-losses'].mean(), 0)
print(losses_mean)
# Assign back rather than using fillna(..., inplace=True) on the selected column:
# the chained in-place call warns in pandas 2.x and is a no-op under Copy-on-Write.
automobiles['normalized-losses'] = automobiles['normalized-losses'].fillna(losses_mean)

In [152]:
# Re-count the missing values in each column to see what is still null.
automobiles.isnull().sum()

In [155]:
# Download dataframe as CSV

# from IPython.display import HTML
# import pandas as pd
# import numpy as np
# import base64

# def create_download_link(df, title = "Download CSV file", filename = "data.csv"):  
#     csv = df.to_csv()
#     b64 = base64.b64encode(csv.encode())
#     payload = b64.decode()
#     html = '<a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{title}</a>'
#     html = html.format(payload=payload,title=title,filename=filename)
#     return HTML(html)
# create_download_link(automobiles)