# Exploratory data analysis (EDA) of comparis data

In [1]:
# Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
import pylab as py

# Apply seaborn's default plot theme; color_codes=True remaps matplotlib's
# shorthand color codes (e.g. "b", "g", "r") to the seaborn palette.
sns.set(color_codes=True)

# Suppress all Python warnings from here on.
# NOTE(review): this blanket filter also hides pandas warnings such as
# SettingWithCopyWarning and deprecation notices; consider narrowing it.
import warnings
warnings.filterwarnings("ignore")

## Univariate non-graphical exploratory data analysis (EDA)

### Importing the enriched comparis data

In [3]:
# Variables of the prepared comparis data that are used in this analysis
SELECTED_COLUMNS = [
    'web-scraper-order',
    'brand',
    'name_details_raw',
    'ps',
    'km',
    'reg_year',
    'price',
    'gear_type',
    'ps_cat',
    'fuel_type_benzin',
    'fuel_type_diesel',
    'fuel_type_elektro',
    'lat',
    'lon',
    'address',
]

# Reference year used to turn the registration year into an age
REFERENCE_YEAR = 2024

# Hand-picked corrections for implausible ages: {computed age: corrected age}.
# The keys are ages produced by malformed registration years
# (e.g. an age of 1820 means reg_year == 204).
# NOTE(review): the keys are tied to REFERENCE_YEAR == 2024 and the corrected
# values are fixed numbers; re-derive both if the reference year changes.
# TODO confirm the intended registration years against the raw data.
AGE_CORRECTIONS = {
    2020: 22,
    1825: 52,
    1824: 42,
    1823: 32,
    1821: 12,
    1820: 2,
}

# Read the data, keep the selected variables, then remove duplicate rows
# and rows with missing values (in this order)
df_orig = (
    pd.read_csv("comparis_prepared_data3.csv")[SELECTED_COLUMNS]
    .drop_duplicates()
    .dropna()
)

# Calculate the age of the car
df_orig['age_of_the_car'] = REFERENCE_YEAR - df_orig['reg_year']

# Replace the implausible ages with their corrected values; only
# 'age_of_the_car' is changed, 'reg_year' keeps its original value
for computed_age, corrected_age in AGE_CORRECTIONS.items():
    df_orig.loc[df_orig['age_of_the_car'] == computed_age, 'age_of_the_car'] = corrected_age


### Filter cars (keep only cars with at least 10,000 km)

In [8]:
# Keep only the rows whose 'km' value is at least 10000
df = df_orig.loc[lambda cars: cars['km'].ge(10000)]

### Shape (number of rows and columns)

In [9]:
# Dimensions of the filtered data as (rows, columns)
rows_and_columns = df.shape
print(rows_and_columns)

(355, 16)


### Data types

In [10]:
# Data type of every column in the filtered data
df.dtypes

web-scraper-order     object
brand                 object
name_details_raw      object
ps                   float64
km                   float64
reg_year             float64
price                  int64
gear_type             object
ps_cat                object
fuel_type_benzin       int64
fuel_type_diesel       int64
fuel_type_elektro      int64
lat                  float64
lon                  float64
address               object
age_of_the_car       float64
dtype: object

### Summary statistics of numeric variables

In [11]:
# Count, mean, std, min, quartiles and max of the numeric columns
df.describe()

Unnamed: 0,ps,km,reg_year,price,fuel_type_benzin,fuel_type_diesel,fuel_type_elektro,lat,lon,age_of_the_car
count,355.0,355.0,355.0,355.0,355.0,355.0,355.0,355.0,355.0,355.0
mean,187.670423,111527.098592,2014.112676,21958.321127,0.656338,0.315493,0.028169,47.124129,8.100918,9.887324
std,107.207076,75213.831006,6.313266,21351.812888,0.4756,0.465368,0.165689,0.419771,0.844629,6.313266
min,54.0,10000.0,1987.0,499.0,0.0,0.0,0.0,45.857773,6.070308,1.0
25%,120.0,53483.0,2011.0,6995.0,0.0,0.0,0.0,46.916365,7.585464,5.5
50%,150.0,95000.0,2016.0,15800.0,1.0,0.0,0.0,47.246483,8.256602,8.0
75%,216.5,160000.0,2018.5,28940.0,1.0,1.0,0.0,47.416695,8.665546,13.0
max,635.0,418000.0,2023.0,136900.0,1.0,1.0,1.0,47.714527,9.873875,37.0


### Statistical measures (min, max, std, mean, median, count) for selected variables

In [12]:
def print_summary(label, values):
    """Print count, min, max, mean, median and std of a Series on one line.

    Every measure is rounded to one decimal place; the line starts with
    ``label`` followed by a colon.

    Parameters
    ----------
    label : str
        Name shown at the start of the line (e.g. 'Price').
    values : pandas.Series
        Numeric column to summarise.
    """
    measures = (
        ('Count:', values.count()),
        ('Min:', values.min()),
        ('Max:', values.max()),
        ('Mean:', values.mean()),
        ('Median:', values.median()),
        ('Std:', values.std()),
    )
    line = [f'{label}:']
    for name, value in measures:
        line += [name, round(value, 1)]
    # print() joins the parts with single spaces, matching the original output
    print(*line)


# Price
print_summary('Price', df['price'])

# ps
print_summary('Ps', df['ps'])

Price: Count: 355 Min: 499 Max: 136900 Mean: 21958.3 Median: 15800.0 Std: 21351.8
Ps: Count: 355 Min: 54.0 Max: 635.0 Mean: 187.7 Median: 150.0 Std: 107.2
