# EDA of data 

We explore the data to examine the claim that COVID-19 negatively impacted production in the automotive industry.

In [129]:
import os
import numpy as np
import pandas as pd
import plotly.express as px

In [140]:
# All inputs are read from ../data, resolved against the current working directory.
data_dir = os.path.join(os.pardir, 'data')
automotive_dir = os.path.join(data_dir, 'automotive')
covid_dir = os.path.join(data_dir, 'covid')

# Automotive production tables (per country and per continent).
automotive_countries_pdf = pd.read_csv(os.path.join(automotive_dir, 'oica_countries_17_to_21.csv'))
automotive_continents_pdf = pd.read_csv(os.path.join(automotive_dir, 'oica_continents_17_to_21.csv'))

# COVID aggregates and response-measure tables.
covid_pdf = pd.read_csv(os.path.join(covid_dir, 'covid_agg.csv'))
response_pdf = pd.read_csv(os.path.join(covid_dir, 'response_graphs_data_2021-12-09.csv'))

In [53]:
# Defining some utility functions for data transformation 
def year_cols_to_rows(pdf):
    """Pivot a wide table so that its value columns become rows.

    The first column of ``pdf`` (``country`` in this notebook) supplies the
    column labels of the result. Every remaining column (one per year here)
    becomes a row labelled by its original column name.

    Args:
        pdf: DataFrame whose first column holds the entity labels and whose
            remaining columns hold the values to pivot.

    Returns:
        A new DataFrame indexed by the original value-column names, with one
        column per entity. ``pdf`` itself is not modified.
    """
    # Move the label column into the index *before* transposing. Transposing
    # with the string column still present forces every value to object dtype;
    # this way numeric columns keep their numeric dtype.
    label_col = pdf.columns[0]
    # No index rename is applied: a rename of the row label 'country' can never
    # match here, because that row becomes the column header.
    return pdf.set_index(label_col).T

In [102]:
# Reshape the wide production table into long form: one row per
# (country, year) pair. Rows containing missing values are dropped.
corrected = (
    automotive_countries_pdf
    .melt(id_vars=['country'], var_name='year', value_name='production')
    .dropna()
)
corrected

Unnamed: 0,country,year,production
0,AUSTRIA,2017,100398.0
1,BELGIUM,2017,332979.0
2,FINLAND,2017,108838.0
3,FRANCE,2017,1754000.0
4,GERMANY,2017,5645584.0
...,...,...,...
218,BRAZIL,2021,1243106.0
219,CHINA,2021,14657512.0
220,INDIA,2021,2733062.0
236,AZERBAIJAN,2021,1516.0


In [106]:
# Add an upper-cased copy of the COVID country names so they can be compared
# with the 'country' values of the automotive data.
covid_pdf = covid_pdf.assign(country=covid_pdf['Country_Region'].str.upper())
covid_pdf

Unnamed: 0,Country_Region,year,Confirmed,Deaths,Recovered,Active,Incident_Rate,Case_Fatality_Ratio,country
0,Afghanistan,2020,52330,2189,41727.0,7916.188679,121.625120,3.913062,AFGHANISTAN
1,Afghanistan,2021,157858,7331,82586.0,15623.259259,272.287093,4.434308,AFGHANISTAN
2,Albania,2020,58316,1181,33634.0,19419.207547,1459.439011,2.118022,ALBANIA
3,Albania,2021,204627,3152,130314.0,17691.259259,4758.528632,1.740721,ALBANIA
4,Algeria,2020,99610,2756,67127.0,26434.339623,192.710762,2.933269,ALGERIA
...,...,...,...,...,...,...,...,...,...
386,Yemen,2021,10081,1973,4251.0,1258.916667,21.540478,21.349153,YEMEN
387,Zambia,2020,20725,388,18660.0,576.207547,98.580523,2.012397,ZAMBIA
388,Zambia,2021,212278,3671,189658.0,6144.569444,762.727480,1.550910,ZAMBIA
389,Zimbabwe,2020,13867,363,11250.0,1297.773585,72.077654,2.769321,ZIMBABWE


In [110]:
# Country names present in the production data that have no counterpart in the
# COVID data.
covid_countries = list(covid_pdf['country'].values)
car_countries = list(corrected['country'].values)
set(car_countries).difference(covid_countries)

{'CZECH REPUBLIC', 'MYANMAR', 'SOUTH KOREA', 'TAIWAN', 'USA'}

In [119]:
# Maps upper-cased country names as they appear in the COVID data (keys) to the
# spelling used in the automotive production data (values).
country_name_update = {
    'CZECHIA': 'CZECH REPUBLIC',
    'BURMA': 'MYANMAR',
    'KOREA, SOUTH': 'SOUTH KOREA',
    'TAIWAN*': 'TAIWAN',
    'US': 'USA',
}

In [125]:
# Apply the name mapping to the COVID country names, then spot-check one of the
# remapped countries.
covid_pdf['country'] = covid_pdf['country'].replace(country_name_update)
covid_pdf.loc[covid_pdf['country'] == 'USA']

Unnamed: 0,Country_Region,year,Confirmed,Deaths,Recovered,Active,Incident_Rate,Case_Fatality_Ratio,country
363,US,2020,20164082,351754,6399531.0,4475.46215,4952.517859,1.867393,USA
364,US,2021,50374543,802510,0.0,5622.105539,10959.660883,2.753959,USA


In [130]:
# Cast the year labels to 64-bit integers (presumably to match the dtype of
# covid_pdf['year'] for the merge on ['country', 'year'] - confirm).
corrected['year'] = corrected['year'].astype(np.int64)

In [134]:
# Attach the COVID figures to each (country, year) production row. After the
# left join, dropna() discards every row that has a missing value, which
# includes production rows that found no COVID match.
merged_df = (
    corrected
    .merge(covid_pdf, on=['country', 'year'], how='left')
    .drop(columns=['Country_Region'])
    .dropna()
)
merged_df

Unnamed: 0,country,year,production,Confirmed,Deaths,Recovered,Active,Incident_Rate,Case_Fatality_Ratio
136,AUSTRIA,2020,104543.0,360815.0,6222.0,332952.0,49727.226415,3175.244145,1.257113
137,BELGIUM,2020,237057.0,646496.0,19528.0,31130.0,72009.545597,5233.103834,13.967206
138,FINLAND,2020,86270.0,36107.0,561.0,29000.0,7552.377358,488.224648,1.639615
139,FRANCE,2020,927718.0,2677656.0,64758.0,200142.0,190656.036021,2066.163063,0.821113
140,GERMANY,2020,3515372.0,1746929.0,33791.0,1350708.0,18850.107658,1292.606847,1.554289
...,...,...,...,...,...,...,...,...,...
210,BRAZIL,2021,1243106.0,22201221.0,617271.0,17771228.0,45197.359225,9370.184982,2.425855
211,CHINA,2021,14657512.0,112573.0,4849.0,99228.0,23.963508,10.382468,0.776458
212,INDIA,2021,2733062.0,34718602.0,476478.0,30974748.0,26765.978353,2823.377251,1.215202
213,AZERBAIJAN,2021,1516.0,607076.0,8137.0,333694.0,9626.722222,3606.533548,1.382517


In [135]:
# Pairwise scatter plots of the merged frame. The figure itself (not its
# `.update` method) must be the cell's last expression for it to render.
px.scatter_matrix(merged_df)

In [139]:
# Correlate the numeric columns only: merged_df also holds the string 'country'
# column, which makes a bare DataFrame.corr() raise on pandas >= 2.0.
merged_df.select_dtypes(include='number').corr()

Unnamed: 0,year,production,Confirmed,Deaths,Recovered,Active,Incident_Rate,Case_Fatality_Ratio
year,1.0,-0.029418,0.324585,0.335646,0.240285,0.200184,0.666992,-0.010417
production,-0.029418,1.0,0.031361,0.016992,0.054566,-0.139593,-0.185787,-0.126368
Confirmed,0.324585,0.031361,1.0,0.938621,0.615557,-0.021078,0.336067,-0.019916
Deaths,0.335646,0.016992,0.938621,1.0,0.619016,-0.054307,0.333176,0.114293
Recovered,0.240285,0.054566,0.615557,0.619016,1.0,-0.07984,0.128979,-0.046865
Active,0.200184,-0.139593,-0.021078,-0.054307,-0.07984,1.0,0.376194,-0.045136
Incident_Rate,0.666992,-0.185787,0.336067,0.333176,0.128979,0.376194,1.0,0.086297
Case_Fatality_Ratio,-0.010417,-0.126368,-0.019916,0.114293,-0.046865,-0.045136,0.086297,1.0


In [144]:
# Upper-case the response-data country names and apply the same name mapping
# that was used for the COVID data.
response_pdf['country'] = response_pdf['Country'].str.upper().replace(country_name_update)
# Display without the original 'Country' column. The result is not assigned,
# so response_pdf itself still contains that column.
response_pdf.drop(columns=['Country'])

Unnamed: 0,Response_measure,date_start,date_end,country
0,AdaptationOfWorkplace,2020-03-16,2020-04-13,AUSTRIA
1,AdaptationOfWorkplace,2021-04-01,,AUSTRIA
2,BanOnAllEvents,2020-11-03,2021-05-18,AUSTRIA
3,ClosDaycare,2020-03-16,2020-05-04,AUSTRIA
4,ClosDaycare,2020-11-17,2020-12-07,AUSTRIA
...,...,...,...,...
2028,StayHomeOrderPartial,2020-05-10,2020-07-04,UNITED KINGDOM
2029,StayHomeRiskG,2020-03-16,2020-07-05,UNITED KINGDOM
2030,StayHomeRiskG,2020-11-05,2020-12-01,UNITED KINGDOM
2031,StayHomeRiskGPartial,2020-07-06,2020-08-01,UNITED KINGDOM


In [100]:

# One line per country; titled so the figure stands alone when skimmed.
px.line(
    corrected,
    x='year',
    y='production',
    color='country',
    title='Automotive production per country',
)

## Plotting production trend in Continents 

Note: these numbers are based on car production, excluding LCVs, HCVs, and heavy buses.

In [54]:
# Pivot the continent table so that each row is a year and each column a continent.
continents_pdf = automotive_continents_pdf.pipe(year_cols_to_rows)
continents_pdf

country,EUROPE,AMERICA,ASIA-OCEANIA,AFRICA
2017,19026293.0,8162300.0,44802137.0,672282.0
2018,19660923.0,7690288.0,43622768.0,776967.0
2019,18724208.0,6993215.0,40650626.0,795720.0
2020,14545984.0,4967177.0,35837271.0,484023.0
2021,10023999.0,3328590.0,26328648.0,


In [55]:
# Wide-form plot: one line per continent column. Title and axis labels replace
# the default 'index' / 'value' captions.
px.line(
    continents_pdf,
    title='Automotive production per continent',
    labels={'index': 'year', 'value': 'production'},
)

In [88]:
years = ['2017', '2018', '2019', '2020', '2021']

# Build the deltas in a separate frame. Binding a second name to
# automotive_continents_pdf and adding columns through it would also add them
# to the loaded table, so re-running the earlier pivot cell would pick up the
# delta columns as extra rows.
continents_delta_pdf = automotive_continents_pdf[['country']].copy()
for previous_year, current_year in zip(years, years[1:]):
    # Year-over-year change: production of the later year minus the earlier one.
    continents_delta_pdf[f'delta{previous_year}_{current_year}'] = (
        automotive_continents_pdf[current_year] - automotive_continents_pdf[previous_year]
    )

# The frame now holds 'country' plus one delta column per consecutive year pair,
# so no hard-coded column list is needed before pivoting.
continents_delta_pdf = year_cols_to_rows(continents_delta_pdf)
px.line(
    continents_delta_pdf,
    title='Year-over-year change in automotive production per continent',
    labels={'index': 'period', 'value': 'change in production'},
)

## Plotting trend in individual countries per continent 

In [56]:
# Country groupings used to split the per-country production table by continent.
europe = [
    'AUSTRIA', 'BELGIUM', 'FINLAND', 'FRANCE', 'GERMANY', 'ITALY', 'PORTUGAL',
    'SPAIN', 'UNITED KINGDOM', 'CZECH REPUBLIC', 'HUNGARY', 'POLAND', 'ROMANIA',
    'SLOVAKIA', 'SLOVENIA', 'SERBIA', 'RUSSIA', 'BELARUS', 'KAZAKHSTAN',
    'UKRAINE', 'UZBEKISTAN', 'TURKEY', 'NETHERLANDS', 'SWEDEN', 'AZERBAIJAN',
]

america = ['CANADA', 'MEXICO', 'USA', 'ARGENTINA', 'BRAZIL', 'COLOMBIA']

asia_oceania = [
    'CHINA', 'INDIA', 'INDONESIA', 'THAILAND', 'VIETNAM', 'IRAN', 'JAPAN',
    'MYANMAR', 'MALAYSIA', 'PAKISTAN', 'SOUTH KOREA', 'TAIWAN', 'PHILIPPINES',
]

africa = ['ALGERIA', 'EGYPT', 'MOROCCO', 'SOUTH AFRICA']


def _production_by_year(countries):
    """Return the pivoted production table restricted to the given countries."""
    in_group = automotive_countries_pdf['country'].isin(countries)
    return year_cols_to_rows(automotive_countries_pdf[in_group])


europe_countries_pdf = _production_by_year(europe)
america_countries_pdf = _production_by_year(america)
asia_oceania_countries_pdf = _production_by_year(asia_oceania)
africa_countries_pdf = _production_by_year(africa)


In [58]:
# Titled so this figure can be told apart from the other per-continent plots.
px.line(
    europe_countries_pdf,
    title='Automotive production per country: Europe',
    labels={'index': 'year', 'value': 'production'},
)

In [59]:
# Titled so this figure can be told apart from the other per-continent plots.
px.line(
    america_countries_pdf,
    title='Automotive production per country: America',
    labels={'index': 'year', 'value': 'production'},
)

In [60]:
# Titled so this figure can be told apart from the other per-continent plots.
px.line(
    asia_oceania_countries_pdf,
    title='Automotive production per country: Asia-Oceania',
    labels={'index': 'year', 'value': 'production'},
)

In [61]:
# Titled so this figure can be told apart from the other per-continent plots.
px.line(
    africa_countries_pdf,
    title='Automotive production per country: Africa',
    labels={'index': 'year', 'value': 'production'},
)