In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import re

In [2]:
df = pd.read_csv('./bikes_data_Modified.csv')

In [3]:
df.head()

Unnamed: 0,Company,Country of Origin,Model,Number of cc,Horsepower,Torque,Transmission Type,Drivetrain,Number of Seating,Price(Lakhs),Year,Looks,Body Type,Engine Type,Number of Cylinders
0,Aprilia,Italy,RS 660,659.0,100 hp,67 Nm,6-speed quickshifter,Chain,2,10.99,2021,Sport,Naked,Parallel-twin,2
1,Aprilia,Italy,Tuono 660,659.0,100 hp,67 Nm,6-speed quickshifter,Chain,2,11.99,2021,Sport,Naked,Parallel-twin,2
2,Aprilia,Italy,RS 125,124.9,15 hp,12 Nm,6-speed manual,Chain,2,4.49,2022,Sport,Racing,Single-cylinder,1
3,Aprilia,Italy,Shiver 900,896.0,95 hp,90 Nm,6-speed manual,Shaft,2,13.99,2022,Adventure,Naked,V-twin,2
4,Aprilia,Italy,Tuono 1100,1077.0,175 hp,121 Nm,6-speed manual,Shaft,2,19.99,2022,Adventure,Naked,V-twin,2


In [4]:
df.shape

(362, 15)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 362 entries, 0 to 361
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Company              362 non-null    object 
 1   Country of Origin    362 non-null    object 
 2   Model                362 non-null    object 
 3   Number of cc         362 non-null    object 
 4   Horsepower           362 non-null    object 
 5   Torque               362 non-null    object 
 6   Transmission Type    362 non-null    object 
 7   Drivetrain           362 non-null    object 
 8   Number of Seating    362 non-null    int64  
 9   Price(Lakhs)         362 non-null    float64
 10  Year                 362 non-null    int64  
 11  Looks                362 non-null    object 
 12  Body Type            362 non-null    object 
 13  Engine Type          362 non-null    object 
 14  Number of Cylinders  361 non-null    object 
dtypes: float64(1), int64(2), object(12)
memo

There are no null values present except for one in 'Number of Cylinders'.

we will change names of columns with spaces, so it will be easy to handle.

In [6]:
df = df.rename(columns={'Country of Origin':'Origin','Number of cc':'CC',
        'Transmission Type':'Transmission','Number of Seating':'Seating',
        'Number of Cylinders':'Cylinders','Body Type':'Body','Engine Type':'Engine'
        })

##### Company

In [7]:
df['Company'].value_counts()

Benelli             16
Husqvarna           12
Mutt Motorcycles    11
KTM                  9
Genuine Scooters     9
                    ..
Italjet              1
Derbi                1
Ontrack              1
Ampere               1
Fantic               1
Name: Company, Length: 75, dtype: int64

##### Origin

In [8]:
df['Origin'].value_counts()

Italy             92
India             47
China             44
UK                28
Japan             24
United States     19
Taiwan            18
United Kingdom    17
Austria           14
USA               13
Sweden            12
Spain              9
Germany            8
France             8
South Korea        5
Canada             3
Denmark            1
Name: Origin, dtype: int64

There are some same values with different cases. we will combine them first.  

In [9]:
df['Origin']=df['Origin'].replace(['UK','USA'],['United Kingdom','United States'])

In [10]:
df['Origin'].value_counts()

Italy             92
India             47
United Kingdom    45
China             44
United States     32
Japan             24
Taiwan            18
Austria           14
Sweden            12
Spain              9
Germany            8
France             8
South Korea        5
Canada             3
Denmark            1
Name: Origin, dtype: int64

##### Model

In [11]:
df.Model.value_counts()

Artemis           4
TC                2
Svartpilen 401    2
X-Cape            2
Seiemmezzo 6 ½    2
                 ..
MGX-21            1
Eldorado          1
Audace            1
V9 Roamer         1
Eva Ribelle RS    1
Name: Model, Length: 345, dtype: int64

##### CC

In [12]:
df['CC'].value_counts()

125     60
250     24
300     12
500      8
110      7
        ..
1133     1
693      1
1158     1
321      1
346      1
Name: CC, Length: 139, dtype: int64

In [13]:
df.drop(list(df.loc[df.CC == 'Electric'].index),inplace=True)

There are values with different formats and units, have to make it consistent.

In [14]:
CCs = []

for i in df['CC']:
    # for values like '1,100 cc' or '1,100 CC' 
    if re.search(',\d+\s[cC]',i):
        CCs.append(float(i.replace(',','').split(' ')[0]))
    # replacing commas wherever present.
    elif re.search(',',i):
        CCs.append(float(i.replace(',','')))
    # for values like 34 CC or 80.15 cc
    elif re.search('\s[cC]',i):
        CCs.append(float(i.split(' ')[0]))
    # for values like 100cc or 100CC
    elif re.search('\d+[cC]',i):
        CCs.append(float(i.split('c')[0]))
    # Converting kW to cc 
    # 35 kW ~ 46.6 bhp
    # 14.6 bhp ~ 125 cc
    # gives 1 kW ~ 11.42 cc
    elif re.search('\skW',i):
        CCs.append(round(float(i.split(' ')[0])*11.42,1))
    else:
        CCs.append(float(i))

In [15]:
df['CC'] = CCs

##### Horsepower

In [16]:
df['Horsepower'].value_counts()

11 hp                  6
20 bhp                 6
95 hp                  5
15 hp                  5
18.4 bhp               5
                      ..
31 hp @ 8,500 rpm      1
26.5 hp @ 8,500 rpm    1
38 hp                  1
24 hp                  1
42 hp                  1
Name: Horsepower, Length: 244, dtype: int64

Horsepower is also not consistent with values or units, so made it consistent.
1. 1 bhp = 13.15 hp
2. 1 PS  = 0.9863 hp
3. 1 kW  = 1.34 hp

In [17]:
df['Horsepower'] = df['Horsepower'].replace('500W','0.5 kW')

In [18]:
hps = []

for i in df['Horsepower']:
    if re.search('\d{1,2}[.]?\d$',i):
        hps.append(float(i))
    elif re.search('\d\s[hpHP]',i):
        hps.append(float(i.split(' ')[0]))
    elif re.search('\d\s[bB][hH][pP]',i):
        hps.append(round(float(i.split(' ')[0])*13.15,1))
    elif re.search('\d[hps]',i):
        cnt = 0
        s =''
        while not i[cnt].isalpha():
            s+=i[cnt]
            cnt +=1
        hps.append(float(s))
    elif re.search('kW',i):
        hps.append(round(float(i.split(' ')[0])*1.34,1))
    else:
        print(i)


In [19]:
df['Horsepower'] = hps

In [20]:
df.Horsepower.dtype

dtype('float64')

##### Torque

In [21]:
df['Torque'].value_counts()

10.5 Nm               10
18 Nm                  9
10 Nm                  8
12 Nm                  6
11 Nm                  6
                      ..
32 Nm@4250 rpm         1
52.3 Nm @ 5650 rpm     1
14 Nm @ 6000 rpm       1
163 lb-ft              1
58 Nm                  1
Name: Torque, Length: 219, dtype: int64

The value is not present at 303 for Torque.So replaced it with appropriate value (via google search)

In [22]:
df['Torque'][303] = '70 Nm'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Torque'][303] = '70 Nm'


Made column consistent.
1. 1 lb-ft = 1.36 Nm

In [23]:
Ts = []
for i in df['Torque']:
    # for values like 100Nm
    if re.search('\sNm',i):
        Ts.append(float(i.split(' ')[0]))
    # for values like 100 lb-ft or 100 ft-lb
    elif re.search('\s[lf][bt]',i):
        Ts.append(round(float(i.split(' ')[0])*1.36,1))
    # for values like 100 Nm
    elif re.search('\d+Nm',i):
        Ts.append(float(i.replace('Nm','')))
    # for values with no unit assuming unit as Nm
    else:
        Ts.append(float(i))

In [24]:
df['Torque'] = Ts

In [25]:
df.Torque.dtype

dtype('float64')

##### Transmission

In [26]:
df['Transmission'].value_counts()

6-speed manual                  114
Manual                           50
CVT                              42
Automatic                        36
5-speed manual                   27
6-speed Manual                    8
Six-speed                         8
5-speed                           7
Five-speed constant mesh          7
6-speed sequential                6
4-speed constant mesh             6
Six-speed sequential              6
Six-speed constant mesh           5
5-speed constant mesh             4
6-Speed                           4
6-speed automatic                 4
6 Speed Manual                    3
4-speed manual                    3
CVT Automatic                     3
4-speed                           3
BLDC hub motor                    2
6-speed quickshifter              2
6-speed constant mesh             2
6-speed sequential manual         2
Bafang BBS02 mid-drive motor      1
BAFANG M600 mid-drive motor       1
Single Speed                      1
Direct drive                

Let's check some special transmission types.

In [27]:
df[df.Transmission == 'Single Speed']

Unnamed: 0,Company,Origin,Model,CC,Horsepower,Torque,Transmission,Drivetrain,Seating,Price(Lakhs),Year,Looks,Body,Engine,Cylinders
228,Ontrack,India,Brutus Single Speed MTB 29,29.0,25.0,25.0,Single Speed,Front Suspension,1,0.14,2023,Rugged,Mountain Bike,2,1


In [28]:
df[df.Transmission == 'Direct drive']

Unnamed: 0,Company,Origin,Model,CC,Horsepower,Torque,Transmission,Drivetrain,Seating,Price(Lakhs),Year,Looks,Body,Engine,Cylinders
182,Harley-Davidson,United States,LiveWire,1133.0,105.0,116.0,Direct drive,Permanent magnet synchronous motor,1,35.0,2021,Modern,Cruiser,Electric,Single


Grouped together categories with same mechanisms, so it is east to play around.

In [29]:
df.Transmission = df.Transmission.replace('Direct drive','Single Speed')

df.Transmission = df.Transmission.replace(['6-speed manual','6-speed Manual','Six-speed',
                                        '6-speed sequential','6 Speed Manual',
                                        '6-speed quickshifter','6-speed sequential manual',
                                        'Six-speed sequential','Six-speed constant mesh','6-Speed',
                                        '6-speed','Constant Mesh, 6-speed','6-speed constant mesh'],'6-speed Manual')

df.Transmission = df.Transmission.replace(['CVT automatic','CVT Automatic'],'CVT')

df.Transmission = df.Transmission.replace(['Bafang BBS02 mid-drive motor','BAFANG M600 mid-drive motor'],'Mid-drive Motor')
df.Transmission = df.Transmission.replace('BLDC hub motor','Hub Motor')

df.Transmission = df.Transmission.replace(['5-speed manual','5-speed','Five-speed constant mesh',
                                        '5-speed constant mesh'],'5-speed Manual')

df.Transmission = df.Transmission.replace(['4-speed constant mesh','4-speed manual','4-speed'],'4-speed Manual')


In [30]:
df.Transmission.value_counts()

6-speed Manual       162
Manual                50
CVT                   46
5-speed Manual        45
Automatic             36
4-speed Manual        12
6-speed automatic      4
Single Speed           2
Mid-drive Motor        2
Hub Motor              2
Name: Transmission, dtype: int64

##### Drivetrain

In [31]:
df['Drivetrain'].value_counts()

Chain                                 110
Chain drive                            93
Automatic                              35
Chain Drive                            22
Belt drive                             20
Electric                               16
Shaft drive                            16
CVT                                    15
Shaft                                  11
Belt                                    4
Shimano drivetrain                      4
Automatic CVT                           3
Front wheel drive                       3
Belt Drive                              2
Parallel twin                           2
Permanent magnet synchronous motor      1
Front Suspension                        1
Single-gear                             1
Dual-motor, all-wheel drive             1
V-twin, liquid-cooled, 4-stroke         1
Name: Drivetrain, dtype: int64

Replaced same values with different cases.
special values are categorised as 'Other' for more ease.

In [32]:
df.Drivetrain = df['Drivetrain'].replace(['Chain drive','Chain Drive'],'Chain')

df.Drivetrain = df['Drivetrain'].replace(['Belt drive','Belt Drive'],'Belt')

df.Drivetrain = df['Drivetrain'].replace('Automatic CVT','CVT')

df.Drivetrain = df['Drivetrain'].replace(['Shimano drivetrain','Front wheel drive','Parallel twin', 'Permanent magnet synchronous motor',
    'Front Suspension', 'Single-gear', 'Dual-motor, all-wheel drive',
    'V-twin, liquid-cooled, 4-stroke'],'Other')

df.Drivetrain = df['Drivetrain'].replace('Shaft drive','Shaft')

In [33]:
df.Drivetrain.value_counts()

Chain        225
Automatic     35
Shaft         27
Belt          26
CVT           18
Electric      16
Other         14
Name: Drivetrain, dtype: int64

##### Seating

In [34]:
df['Seating'].value_counts()

2    222
1    138
3      1
Name: Seating, dtype: int64

##### Price (Lakhs)

price column had lots of different value formats, cleaned it with excel.
1. removed commas wherever present.
2. removed units such as Rs. , rs. , ₹ , lakh , Lakh and more.
3. The range values were replaced as mean value of that range.
4. The $ values were converted to rupees.
5. All values converted to single unit lakh.

In [35]:
df['Price(Lakhs)'].values

array([10.99, 11.99,  4.49, 13.99, 19.99,  1.25,  1.75,  2.  ,  4.  ,
        6.  ,  2.25,  3.5 ,  3.25,  3.  ,  2.5 ,  6.  ,  8.  , 12.  ,
       14.  , 16.  , 12.  ,  1.5 ,  1.25,  3.5 ,  3.75,  4.  ,  4.5 ,
       10.  , 13.  , 11.  , 12.  ,  1.5 ,  1.25,  8.  , 10.  , 11.  ,
       12.  , 10.  , 15.  , 20.  ,  2.29,  4.29,  5.29,  5.59, 45.48,
       41.43, 82.85,  1.3 ,  1.4 ,  2.39,  2.59, 27.41, 25.91,  9.39,
       12.95, 21.48, 17.49, 18.79, 40.49, 37.49, 24.49, 20.49, 24.49,
       26.59,  1.45,  1.85,  1.65,  0.65,  0.62,  3.25,  3.35,  6.5 ,
        7.5 ,  0.23,  0.24,  0.26,  0.2 ,  0.15,  3.43,  5.19,  6.65,
        9.11, 23.02,  6.95,  1.78,  1.92,  2.2 ,  3.1 ,  2.05,  3.3 ,
        3.6 ,  5.5 ,  0.75,  1.5 ,  1.6 ,  2.5 ,  1.05,  1.5 ,  2.  ,
        2.5 ,  0.85,  1.15,  0.55,  0.75,  1.1 ,  1.1 ,  2.08,  1.29,
        0.7 ,  0.55,  0.54, 15.4 , 13.6 , 13.3 , 14.8 , 17.5 , 24.9 ,
       18.9 , 25.4 , 22.8 , 21.1 , 20.7 , 20.99, 20.99, 32.99, 35.99,
        1.25,  1.55,

##### Year 

In [36]:
df['Year'].value_counts()

2023    215
2022     90
2020     14
2021      7
2017      7
2019      7
2018      5
2016      3
2005      3
2003      2
2014      2
1994      1
2002      1
2000      1
2015      1
2006      1
1998      1
Name: Year, dtype: int64

For years less than 2020, not a lot of values present for each year.So, they are categarized as one period '<2020'.

In [37]:
df.Year = [i if i >= 2020 else '<2020' for i in df.Year]

In [38]:
df.Year.value_counts()

2023     215
2022      90
<2020     35
2020      14
2021       7
Name: Year, dtype: int64

##### Looks

In [39]:
df['Looks'].value_counts()

Modern                       74
Classy                       48
Adventure                    42
Sporty                       30
Retro                        30
Classic                      29
Retro-inspired               10
Modern, sporty               10
Futuristic                    7
Sport                         7
Modern, stylish               4
Classic, vintage              4
Cruiser                       4
Aggressive                    4
Commuter                      3
Sharp                         3
Simple, reliable              3
Motocross                     3
Sporty, aggressive            3
Classic, stylish              3
Practical                     3
Sleek                         3
Enduro                        2
Naked                         2
Modern, aggressive            2
Retro-modern                  2
Rugged                        2
Race-inspired                 2
Touring                       2
Adventure, sporty             1
Modern, muscular              1
Urban   

In [51]:
looks = ','.join(df['Looks'])
looks = looks.lower().split(',')
d = {}
for i in looks:
    if i not in d.keys():
        d[i] =1
    else:
        d[i] +=1
look = list(d.keys())
counts = list(d.values())
data = {'look':look,'count':counts}
bike_looks = pd.DataFrame(data)
bike_looks

Unnamed: 0,look,count
0,sport,7
1,adventure,44
2,classy,50
3,sporty,34
4,retro,31
5,touring,2
6,classic,38
7,modern,94
8,bold,2
9,sleek,3


##### Body

In [40]:
df['Body'].value_counts()

Cruiser                59
Naked                  46
Scooter                36
Standard               23
Enduro                 23
Adventure              19
Sport                  15
Cafe racer             13
Scrambler              10
Roadster                9
Street                  9
Adventure touring       8
Sports                  6
Street Bike             6
Sportbike               6
commuter                5
Cafe Racer              5
Bobber                  5
Supersport              3
Dual-sport              3
Streetfighter           3
Superbike               3
Supermoto               3
Tourer                  2
Retro                   2
Electric scooter        2
Naked streetfighter     2
Dirt bike               2
Motocross               2
Trail Bike              2
Maxi-Scooter            2
Naked bike              2
Adventure bike          2
Sports bike             2
Off-road                1
Adventure tourer        1
Cargo                   1
Commuter                1
Sports Naked

##### Engine

Didn't do anything with engine column as not lot of values provide unique information except some with cooling mechanisms.
we can omit it for furthur analysis.

In [41]:
df['Engine'].value_counts()

Single-cylinder                                             50
Electric                                                    31
V-twin                                                      31
Single-cylinder, air-cooled                                 22
Single-cylinder, liquid-cooled                              20
                                                            ..
2                                                            1
Single-cylinder, fuel-injected, air-cooled                   1
Triple                                                       1
Liquid-cooled, 4-cylinder, DOHC                              1
Single cylinder, liquid-cooled, 4-stroke, 4-valves, SOHC     1
Name: Engine, Length: 62, dtype: int64

##### Cylinders

First filling in the null value present with appropriate value(google search)

In [42]:
df.loc[df['Cylinders'].isnull()== True]

Unnamed: 0,Company,Origin,Model,CC,Horsepower,Torque,Transmission,Drivetrain,Seating,Price(Lakhs),Year,Looks,Body,Engine,Cylinders
181,GasGas,Spain,SM 700,693.0,74.0,70.0,6-speed Manual,Chain,1,14.5,2023,Modern,Supermoto,Single,


In [43]:
df['Cylinders']=df['Cylinders'].replace(np.nan, '1')

replacing similar meanings into one.

In [44]:
df.Cylinders = df['Cylinders'].replace(['Single','Two','One','90°','Three',
                                        'Parallel-Twin'],['1','2','1','2','3','2'])

In [45]:
df['Cylinders'].value_counts()

1    253
2     82
3     14
4     11
0      1
Name: Cylinders, dtype: int64

In [46]:
df.to_csv('./bikes_data_transformed.csv')

In [52]:
bike_looks.to_csv('./bike_looks_data.csv')