# Explorative Analysis

# Import Packages

In [1]:
import pandas as pd
from pandas_profiling import ProfileReport

# Reading data

In [2]:
filename = 'Biergarten_data_for_analysis.csv'
df = pd.read_csv(filename, sep=';')

In [3]:
df.shape

(1278, 152)

In [4]:
df.head()

Unnamed: 0,venue_id,venue_name,rating,likes_cnt,city,cities,time,"Population on the 1st of January, 0-4 years, female","Population on the 1st of January, 0-4 years, male","Population on the 1st of January, 0-4 years, total",...,Number of private cars registered,Number of registered cars per 1000 population,People commuting into the city,People commuting out of the city,"Municipal waste generated (domestic and commercial), total - 1000 t",Price of a m³ of domestic water - Euro,Share of population connected to potable drinking water system - %,Share of population connected to sewerage treatment system - %,Share of the urban waste water load (in population equivalents) treated according to the applicable standard -%,Total use of water - m³
0,53541e32498e5522be7a5eed,Jockel Biergarten,5.6,44,Berlin,Berlin,2016,86434.0,91718.0,178152.0,...,1024876.0,291.2,295838.0,174263.0,1361.98,1.81,99.81,99.72,100.0,206000000.0
1,53541e32498e5522be7a5eed,Jockel Biergarten,5.6,44,Berlin,Berlin,2017,90989.0,96033.0,187022.0,...,,,309375.0,179977.0,1385.72,,,,,
2,53541e32498e5522be7a5eed,Jockel Biergarten,5.6,44,Berlin,Berlin,2018,92674.0,97822.0,190496.0,...,,,321219.0,185723.0,,,,,,
3,4c20c6424889a593e9df2620,Haus Sanssouci,7.7,9,Berlin,Berlin,2016,86434.0,91718.0,178152.0,...,1024876.0,291.2,295838.0,174263.0,1361.98,1.81,99.81,99.72,100.0,206000000.0
4,4c20c6424889a593e9df2620,Haus Sanssouci,7.7,9,Berlin,Berlin,2017,90989.0,96033.0,187022.0,...,,,309375.0,179977.0,1385.72,,,,,


# Missing values

In [5]:
# Set options to enable better understanding of data 
pd.options.display.max_rows = 200
pd.options.display.max_colwidth = 100
pd.options.display.max_columns = 50

In [6]:
# Count missing values per column
df_missings = pd.DataFrame(df.isnull().sum()).reset_index()
df_missings.columns = ['variable', 'cnt']
df_missings

Unnamed: 0,variable,cnt
0,venue_id,0
1,venue_name,0
2,rating,348
3,likes_cnt,0
4,city,0
5,cities,0
6,time,0
7,"Population on the 1st of January, 0-4 years, female",0
8,"Population on the 1st of January, 0-4 years, male",0
9,"Population on the 1st of January, 0-4 years, total",0


In [7]:
# Drop variables which have more than 400 missings
df_missings = df_missings[df_missings.cnt<400].reset_index(drop=True)
df_missings

Unnamed: 0,variable,cnt
0,venue_id,0
1,venue_name,0
2,rating,348
3,likes_cnt,0
4,city,0
5,cities,0
6,time,0
7,"Population on the 1st of January, 0-4 years, female",0
8,"Population on the 1st of January, 0-4 years, male",0
9,"Population on the 1st of January, 0-4 years, total",0


## Examine some of the variables one-by-one

In [8]:
var_of_interest = 'Employment (jobs) in information and communication (NACE Rev. 2, J)'
col_list = ['venue_id','venue_name','city','time']
col_list.append(var_of_interest)
df[col_list]

Unnamed: 0,venue_id,venue_name,city,time,"Employment (jobs) in information and communication (NACE Rev. 2, J)"
0,53541e32498e5522be7a5eed,Jockel Biergarten,Berlin,2016,102980.0
1,53541e32498e5522be7a5eed,Jockel Biergarten,Berlin,2017,111777.0
2,53541e32498e5522be7a5eed,Jockel Biergarten,Berlin,2018,123060.0
3,4c20c6424889a593e9df2620,Haus Sanssouci,Berlin,2016,102980.0
4,4c20c6424889a593e9df2620,Haus Sanssouci,Berlin,2017,111777.0
...,...,...,...,...,...
1273,4c138c3182a3c9b6fd95fbf8,Estragon,Mannheim,2017,6669.0
1274,4c138c3182a3c9b6fd95fbf8,Estragon,Mannheim,2018,7079.0
1275,4fa3a739e4b0bcbb45abacfb,ALTE AU Biergarten & Speisegaststaatte,Mannheim,2016,6570.0
1276,4fa3a739e4b0bcbb45abacfb,ALTE AU Biergarten & Speisegaststaatte,Mannheim,2017,6669.0


# Variable selection
Keep proportional variables and employment variables, drop the rest.

In [9]:
vars_to_keep = ['venue_id','venue_name','rating','likes_cnt','city','time','Women per 100 men','Population on the 1st of January, total']
for var in ['unemployment', 'proportion', 'activity', 'ratio', '\(jobs\)']:
    vars_to_keep.extend(df_missings[df_missings['variable'].str.lower().str.contains(var)]['variable'].to_list())
vars_to_keep

['venue_id',
 'venue_name',
 'rating',
 'likes_cnt',
 'city',
 'time',
 'Women per 100 men',
 'Population on the 1st of January, total',
 'Unemployment rate',
 'Unemployment rate, female',
 'Unemployment rate, male',
 'Proportion of population aged 0-4 years',
 'Proportion of population aged 10-14 years',
 'Proportion of population aged 15-19 years',
 'Proportion of population aged 20-24 years',
 'Proportion of population aged 25-34 years',
 'Proportion of population aged 35-44 years',
 'Proportion of population aged 45-54 years',
 'Proportion of population aged 5-9 years',
 'Proportion of population aged 65-74 years',
 'Proportion of population aged 75 years and over',
 'Proportion of total population aged 55-64',
 'EU foreigners as a proportion of population',
 'Foreigners as a proportion of population',
 'Nationals as a proportion of population',
 'Proportion of employment in agriculture fishery',
 'Proportion of employment in industries (NACE Rev.1.1 C-E)',
 'Activity rate',
 'Acti

### Calculate jobs per 1,000 people

In [10]:
df_jobs = df[vars_to_keep]
jobs_var_list = [
     'Employment (jobs) in public administration, defence, education, human health and social work activities (NACE Rev. 2, O to Q)',
     'Employment (jobs) in agriculture, fishery (NACE Rev. 2, A)',
     'Employment (jobs) in arts, entertainment and recreation; other service activities; activities of household and extra-territorial organizations and bodies (NACE Rev. 2, R to U)',
     'Employment (jobs) in construction (NACE Rev. 2, F)',
     'Employment (jobs) in financial and insurance activities (NACE Rev. 2, K)',
     'Employment (jobs) in information and communication (NACE Rev. 2, J)',
     'Employment (jobs) in mining, manufacturing, energy (NACE Rev. 2, B-E)',
     'Employment (jobs) in professional, scientific and technical activities; administrative and support service activities (NACE Rev. 2, M and N)',
     'Employment (jobs) in public administration, defence, education, human health and social work activities (NACE Rev. 2, O to Q)',
     'Employment (jobs) in real estate activities (NACE Rev. 2, L)',
     'Employment (jobs) in trade, transport, hotels, restaurants (NACE Rev. 2, G to I)']
for var in jobs_var_list:
    df_jobs[var] = df[var]/df['Population on the 1st of January, total']*1000
df_jobs

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


Unnamed: 0,venue_id,venue_name,rating,likes_cnt,city,time,Women per 100 men,"Population on the 1st of January, total",Unemployment rate,"Unemployment rate, female","Unemployment rate, male",Proportion of population aged 0-4 years,Proportion of population aged 10-14 years,Proportion of population aged 15-19 years,Proportion of population aged 20-24 years,Proportion of population aged 25-34 years,Proportion of population aged 35-44 years,Proportion of population aged 45-54 years,Proportion of population aged 5-9 years,Proportion of population aged 65-74 years,Proportion of population aged 75 years and over,Proportion of total population aged 55-64,EU foreigners as a proportion of population,Foreigners as a proportion of population,Nationals as a proportion of population,Proportion of employment in agriculture fishery,Proportion of employment in industries (NACE Rev.1.1 C-E),Activity rate,"Activity rate, female","Activity rate, male",Age dependency ratio (population aged 0-19 and 65 and more to population aged 20-64),Old age dependency ratio (population 65 and over to population 20 to 64 years),Young-age dependency ratio (population aged 0-19 to population 20-64 years),"Employment (jobs) in public administration, defence, education, human health and social work activities (NACE Rev. 2, O to Q)","Employment (jobs) in agriculture, fishery (NACE Rev. 2, A)","Employment (jobs) in arts, entertainment and recreation; other service activities; activities of household and extra-territorial organizations and bodies (NACE Rev. 2, R to U)","Employment (jobs) in construction (NACE Rev. 2, F)","Employment (jobs) in financial and insurance activities (NACE Rev. 2, K)","Employment (jobs) in information and communication (NACE Rev. 2, J)","Employment (jobs) in mining, manufacturing, energy (NACE Rev. 2, B-E)","Employment (jobs) in professional, scientific and technical activities; administrative and support service activities (NACE Rev. 2, M and N)","Employment (jobs) in public administration, defence, education, human health and social work activities (NACE Rev. 2, O to Q).1","Employment (jobs) in real estate activities (NACE Rev. 2, L)","Employment (jobs) in trade, transport, hotels, restaurants (NACE Rev. 2, G to I)"
0,53541e32498e5522be7a5eed,Jockel Biergarten,5.6,44,Berlin,2016,103.9,3520031.0,7.9,7.1,8.6,5.1,3.9,4.0,5.4,17.1,13.5,15.3,4.5,9.9,9.4,11.9,6.2,15.5,84.5,0.0,7.7,60.6,56.2,65.2,58.1,30.5,27.7,143.653280,0.212214,40.855322,23.771666,10.739678,29.255424,38.830056,98.979526,143.653280,10.869507,108.793076
1,53541e32498e5522be7a5eed,Jockel Biergarten,5.6,44,Berlin,2017,103.6,3574830.0,7.1,6.5,7.6,5.2,4.0,4.1,5.4,17.1,13.5,14.9,4.5,9.5,9.8,12.0,6.4,16.7,83.3,0.0,7.6,60.9,56.2,65.8,59.0,30.6,28.5,144.205179,0.204206,39.396279,22.940951,10.480778,31.267781,38.310913,101.594202,144.205179,10.709041,108.254099
2,53541e32498e5522be7a5eed,Jockel Biergarten,5.6,44,Berlin,2018,103.4,3613495.0,6.2,5.6,6.7,5.3,4.1,4.1,5.5,17.0,13.8,14.3,4.6,9.3,10.0,12.2,6.7,17.6,82.4,0.1,7.5,61.4,56.6,66.5,59.3,30.6,28.7,144.622312,0.262903,40.418210,24.027984,10.558753,34.055672,38.417654,102.789958,144.622312,10.272050,107.364200
3,4c20c6424889a593e9df2620,Haus Sanssouci,7.7,9,Berlin,2016,103.9,3520031.0,7.9,7.1,8.6,5.1,3.9,4.0,5.4,17.1,13.5,15.3,4.5,9.9,9.4,11.9,6.2,15.5,84.5,0.0,7.7,60.6,56.2,65.2,58.1,30.5,27.7,143.653280,0.212214,40.855322,23.771666,10.739678,29.255424,38.830056,98.979526,143.653280,10.869507,108.793076
4,4c20c6424889a593e9df2620,Haus Sanssouci,7.7,9,Berlin,2017,103.6,3574830.0,7.1,6.5,7.6,5.2,4.0,4.1,5.4,17.1,13.5,14.9,4.5,9.5,9.8,12.0,6.4,16.7,83.3,0.0,7.6,60.9,56.2,65.8,59.0,30.6,28.5,144.205179,0.204206,39.396279,22.940951,10.480778,31.267781,38.310913,101.594202,144.205179,10.709041,108.254099
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1273,4c138c3182a3c9b6fd95fbf8,Estragon,7.5,35,Mannheim,2017,100.2,304781.0,4.6,4.5,4.7,4.7,4.0,4.9,7.6,16.4,12.9,14.8,4.1,8.8,9.9,11.9,11.6,22.8,77.2,0.2,19.4,59.6,53.8,65.4,57.1,29.3,27.9,151.229900,1.243516,30.198733,32.052523,21.415377,21.881285,137.000010,139.454231,151.229900,9.551120,162.864483
1274,4c138c3182a3c9b6fd95fbf8,Estragon,7.5,35,Mannheim,2018,100.4,307997.0,3.9,3.6,4.2,4.7,4.1,4.8,7.8,16.5,13.0,14.5,4.1,8.7,9.8,12.1,12.0,23.3,76.7,0.2,19.1,59.6,53.8,65.5,56.5,29.0,27.6,148.926775,1.282480,29.565223,32.948373,20.419030,22.983990,134.205853,140.806566,148.926775,9.087751,163.524320
1275,4fa3a739e4b0bcbb45abacfb,ALTE AU Biergarten & Speisegaststaatte,,6,Mannheim,2016,100.0,305780.0,4.8,5.0,4.7,4.5,4.1,5.0,7.7,16.5,12.9,15.0,4.1,9.0,9.6,11.6,11.2,22.6,77.4,0.2,19.5,58.9,53.5,64.3,56.9,29.1,27.8,151.324482,1.743083,30.901302,30.554647,22.045261,21.486036,138.338021,144.339067,151.324482,9.487213,160.373471
1276,4fa3a739e4b0bcbb45abacfb,ALTE AU Biergarten & Speisegaststaatte,,6,Mannheim,2017,100.2,304781.0,4.6,4.5,4.7,4.7,4.0,4.9,7.6,16.4,12.9,14.8,4.1,8.8,9.9,11.9,11.6,22.8,77.2,0.2,19.4,59.6,53.8,65.4,57.1,29.3,27.9,151.229900,1.243516,30.198733,32.052523,21.415377,21.881285,137.000010,139.454231,151.229900,9.551120,162.864483


In [11]:
# Double check for missing values
df_jobs.isnull().sum()

venue_id                                                                                                                                                                             0
venue_name                                                                                                                                                                           0
rating                                                                                                                                                                             348
likes_cnt                                                                                                                                                                            0
city                                                                                                                                                                                 0
time                                                                                 

In [12]:
# EU foreigners as a proportion of population is missing for all rows
df_jobs.drop('EU foreigners as a proportion of population', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


# The most recent data available

In [13]:
# Transform data to a format in which values to be filtered are on rows
tmp2 = pd.melt(df_jobs, id_vars=['venue_id','venue_name','city','time','rating','likes_cnt'])
# tmp2

Unnamed: 0,venue_id,venue_name,city,time,rating,likes_cnt,variable,value
0,53541e32498e5522be7a5eed,Jockel Biergarten,Berlin,2016,5.6,44,Women per 100 men,103.900000
1,53541e32498e5522be7a5eed,Jockel Biergarten,Berlin,2017,5.6,44,Women per 100 men,103.600000
2,53541e32498e5522be7a5eed,Jockel Biergarten,Berlin,2018,5.6,44,Women per 100 men,103.400000
3,4c20c6424889a593e9df2620,Haus Sanssouci,Berlin,2016,7.7,9,Women per 100 men,103.900000
4,4c20c6424889a593e9df2620,Haus Sanssouci,Berlin,2017,7.7,9,Women per 100 men,103.600000
...,...,...,...,...,...,...,...,...
47281,4c138c3182a3c9b6fd95fbf8,Estragon,Mannheim,2017,7.5,35,"Employment (jobs) in trade, transport, hotels, restaurants (NACE Rev. 2, G to I)",162.864483
47282,4c138c3182a3c9b6fd95fbf8,Estragon,Mannheim,2018,7.5,35,"Employment (jobs) in trade, transport, hotels, restaurants (NACE Rev. 2, G to I)",163.524320
47283,4fa3a739e4b0bcbb45abacfb,ALTE AU Biergarten & Speisegaststaatte,Mannheim,2016,,6,"Employment (jobs) in trade, transport, hotels, restaurants (NACE Rev. 2, G to I)",160.373471
47284,4fa3a739e4b0bcbb45abacfb,ALTE AU Biergarten & Speisegaststaatte,Mannheim,2017,,6,"Employment (jobs) in trade, transport, hotels, restaurants (NACE Rev. 2, G to I)",162.864483


In [14]:
# Sort the data such that the newest observation per variable is on the top
tmp3 = tmp2.sort_values(by=['city','venue_name','variable','time'], ascending=False)
# tmp3

In [15]:
# Get the most recent obs for each variable
tmp4 = tmp3.groupby(['city','venue_name','variable'], sort=False).first().reset_index()
# tmp4

In [16]:
# Separate columns for pivoting
tmp5 = tmp4[['venue_id', 'variable', 'value']]
# tmp5

In [17]:
# Pivot to get variables to columns
tmp6 = tmp5.pivot(index='venue_id', columns='variable', values='value')
# tmp6

In [18]:
# Merge ids and ratings by venue
tidy_data = tmp4.drop(['variable', 'value', 'time'], axis=1).merge(tmp6, on='venue_id').drop_duplicates().reset_index(drop=True)
tidy_data

Unnamed: 0,city,venue_name,venue_id,rating,likes_cnt,Activity rate,"Activity rate, female","Activity rate, male",Age dependency ratio (population aged 0-19 and 65 and more to population aged 20-64),"Employment (jobs) in agriculture, fishery (NACE Rev. 2, A)","Employment (jobs) in arts, entertainment and recreation; other service activities; activities of household and extra-territorial organizations and bodies (NACE Rev. 2, R to U)","Employment (jobs) in construction (NACE Rev. 2, F)","Employment (jobs) in financial and insurance activities (NACE Rev. 2, K)","Employment (jobs) in information and communication (NACE Rev. 2, J)","Employment (jobs) in mining, manufacturing, energy (NACE Rev. 2, B-E)","Employment (jobs) in professional, scientific and technical activities; administrative and support service activities (NACE Rev. 2, M and N)","Employment (jobs) in public administration, defence, education, human health and social work activities (NACE Rev. 2, O to Q)","Employment (jobs) in real estate activities (NACE Rev. 2, L)","Employment (jobs) in trade, transport, hotels, restaurants (NACE Rev. 2, G to I)",Foreigners as a proportion of population,Nationals as a proportion of population,Old age dependency ratio (population 65 and over to population 20 to 64 years),"Population on the 1st of January, total",Proportion of employment in agriculture fishery,Proportion of employment in industries (NACE Rev.1.1 C-E),Proportion of population aged 0-4 years,Proportion of population aged 10-14 years,Proportion of population aged 15-19 years,Proportion of population aged 20-24 years,Proportion of population aged 25-34 years,Proportion of population aged 35-44 years,Proportion of population aged 45-54 years,Proportion of population aged 5-9 years,Proportion of population aged 65-74 years,Proportion of population aged 75 years and over,Proportion of total population aged 55-64,Unemployment rate,"Unemployment rate, female","Unemployment rate, male",Women per 100 men,Young-age dependency ratio (population aged 0-19 to population 20-64 years)
0,Wuppertal,Wuppertaler Brauhaus,4cc4116c806246884232342f,7.0,37,54.3,48.4,60.5,66.8,0.373314,21.434430,16.567211,15.444441,9.100936,79.295229,71.396250,127.113323,5.913629,97.310444,18.8,81.2,34.8,353590.0,0.1,17.9,4.9,4.6,5.1,6.2,13.3,11.7,15.3,4.6,9.6,11.2,13.4,4.8,3.6,5.8,104.4,32.0
1,Wuppertal,Haus Zillertal,4b6ee171f964a520c8ce2ce3,,1,54.3,48.4,60.5,66.8,0.373314,21.434430,16.567211,15.444441,9.100936,79.295229,71.396250,127.113323,5.913629,97.310444,18.8,81.2,34.8,353590.0,0.1,17.9,4.9,4.6,5.1,6.2,13.3,11.7,15.3,4.6,9.6,11.2,13.4,4.8,3.6,5.8,104.4,32.0
2,Stuttgart,Wirtshaus & Hotel Garbe,4c1a61a9b9f876b0ed907846,8.8,83,59.1,54.5,63.8,54.3,1.286462,45.267984,23.998685,45.138390,46.306320,107.301069,169.348377,186.769352,7.606564,113.532667,24.6,75.4,27.6,632743.0,0.2,14.4,4.8,4.1,4.3,6.7,18.5,14.4,14.2,4.2,8.1,9.8,11.0,3.4,2.9,3.8,100.2,26.7
3,Stuttgart,Wichtel,4b54a421f964a5204bc427e3,8.1,91,59.1,54.5,63.8,54.3,1.286462,45.267984,23.998685,45.138390,46.306320,107.301069,169.348377,186.769352,7.606564,113.532667,24.6,75.4,27.6,632743.0,0.2,14.4,4.8,4.1,4.3,6.7,18.5,14.4,14.2,4.2,8.1,9.8,11.0,3.4,2.9,3.8,100.2,26.7
4,Stuttgart,Tschechen & Söhne,4be9a03a1838952148be0acf,8.0,30,59.1,54.5,63.8,54.3,1.286462,45.267984,23.998685,45.138390,46.306320,107.301069,169.348377,186.769352,7.606564,113.532667,24.6,75.4,27.6,632743.0,0.2,14.4,4.8,4.1,4.3,6.7,18.5,14.4,14.2,4.2,8.1,9.8,11.0,3.4,2.9,3.8,100.2,26.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
402,Berlin,ApartHotel Landhaus Lichterfelde mit Remise anno 1895,5716a94d498e134b509c2f1c,,0,61.4,56.6,66.5,59.3,0.262903,40.418210,24.027984,10.558753,34.055672,38.417654,102.789958,144.622312,10.272050,107.364200,17.6,82.4,30.6,3613495.0,0.1,7.5,5.3,4.1,4.1,5.5,17.0,13.8,14.3,4.6,9.3,10.0,12.2,6.2,5.6,6.7,103.4,28.7
403,Berlin,Ambrosius,4adcda85f964a520ef4821e3,5.4,12,61.4,56.6,66.5,59.3,0.262903,40.418210,24.027984,10.558753,34.055672,38.417654,102.789958,144.622312,10.272050,107.364200,17.6,82.4,30.6,3613495.0,0.1,7.5,5.3,4.1,4.1,5.5,17.0,13.8,14.3,4.6,9.3,10.0,12.2,6.2,5.6,6.7,103.4,28.7
404,Berlin,Alter Krug Dahlem,4b55ccc5f964a5209cf027e3,7.4,48,61.4,56.6,66.5,59.3,0.262903,40.418210,24.027984,10.558753,34.055672,38.417654,102.789958,144.622312,10.272050,107.364200,17.6,82.4,30.6,3613495.0,0.1,7.5,5.3,4.1,4.1,5.5,17.0,13.8,14.3,4.6,9.3,10.0,12.2,6.2,5.6,6.7,103.4,28.7
405,Berlin,Alter Fritz,4adcda74f964a520674521e3,7.4,14,61.4,56.6,66.5,59.3,0.262903,40.418210,24.027984,10.558753,34.055672,38.417654,102.789958,144.622312,10.272050,107.364200,17.6,82.4,30.6,3613495.0,0.1,7.5,5.3,4.1,4.1,5.5,17.0,13.8,14.3,4.6,9.3,10.0,12.2,6.2,5.6,6.7,103.4,28.7


# Pandas Profiling

In [None]:
# Drop some variables from profiling
tmp = tidy_data.drop(['venue_id','venue_name','city','rating','likes_cnt'], axis=1)

In [None]:
profile = ProfileReport(tmp, title='Pandas Profiling Report', 
                        html={'style':{'full_width':True}})

In [None]:
profile.to_widgets()

# Correlations

In [19]:
# Drop out highly correlating variables found in Profiling
vars_to_drop = [
    'Unemployment rate'
    ,'Foreigners as a proportion of population'
    ,'Activity rate, female'
    ,'Activity rate, male'
    ,'Age dependency ratio (population aged 0-19 and 65 and more to population aged 20-64)'
    ,'Old age dependency ratio (population 65 and over to population 20 to 64 years)'
    ,'Proportion of employment in agriculture fishery'    
]
df_no_correlations = tidy_data.drop(vars_to_drop, axis=1)
df_no_correlations.shape

(407, 34)

In [20]:
df_no_correlations.head()

Unnamed: 0,city,venue_name,venue_id,rating,likes_cnt,Activity rate,"Employment (jobs) in agriculture, fishery (NACE Rev. 2, A)","Employment (jobs) in arts, entertainment and recreation; other service activities; activities of household and extra-territorial organizations and bodies (NACE Rev. 2, R to U)","Employment (jobs) in construction (NACE Rev. 2, F)","Employment (jobs) in financial and insurance activities (NACE Rev. 2, K)","Employment (jobs) in information and communication (NACE Rev. 2, J)","Employment (jobs) in mining, manufacturing, energy (NACE Rev. 2, B-E)","Employment (jobs) in professional, scientific and technical activities; administrative and support service activities (NACE Rev. 2, M and N)","Employment (jobs) in public administration, defence, education, human health and social work activities (NACE Rev. 2, O to Q)","Employment (jobs) in real estate activities (NACE Rev. 2, L)","Employment (jobs) in trade, transport, hotels, restaurants (NACE Rev. 2, G to I)",Nationals as a proportion of population,"Population on the 1st of January, total",Proportion of employment in industries (NACE Rev.1.1 C-E),Proportion of population aged 0-4 years,Proportion of population aged 10-14 years,Proportion of population aged 15-19 years,Proportion of population aged 20-24 years,Proportion of population aged 25-34 years,Proportion of population aged 35-44 years,Proportion of population aged 45-54 years,Proportion of population aged 5-9 years,Proportion of population aged 65-74 years,Proportion of population aged 75 years and over,Proportion of total population aged 55-64,"Unemployment rate, female","Unemployment rate, male",Women per 100 men,Young-age dependency ratio (population aged 0-19 to population 20-64 years)
0,Wuppertal,Wuppertaler Brauhaus,4cc4116c806246884232342f,7.0,37,54.3,0.373314,21.43443,16.567211,15.444441,9.100936,79.295229,71.39625,127.113323,5.913629,97.310444,81.2,353590.0,17.9,4.9,4.6,5.1,6.2,13.3,11.7,15.3,4.6,9.6,11.2,13.4,3.6,5.8,104.4,32.0
1,Wuppertal,Haus Zillertal,4b6ee171f964a520c8ce2ce3,,1,54.3,0.373314,21.43443,16.567211,15.444441,9.100936,79.295229,71.39625,127.113323,5.913629,97.310444,81.2,353590.0,17.9,4.9,4.6,5.1,6.2,13.3,11.7,15.3,4.6,9.6,11.2,13.4,3.6,5.8,104.4,32.0
2,Stuttgart,Wirtshaus & Hotel Garbe,4c1a61a9b9f876b0ed907846,8.8,83,59.1,1.286462,45.267984,23.998685,45.13839,46.30632,107.301069,169.348377,186.769352,7.606564,113.532667,75.4,632743.0,14.4,4.8,4.1,4.3,6.7,18.5,14.4,14.2,4.2,8.1,9.8,11.0,2.9,3.8,100.2,26.7
3,Stuttgart,Wichtel,4b54a421f964a5204bc427e3,8.1,91,59.1,1.286462,45.267984,23.998685,45.13839,46.30632,107.301069,169.348377,186.769352,7.606564,113.532667,75.4,632743.0,14.4,4.8,4.1,4.3,6.7,18.5,14.4,14.2,4.2,8.1,9.8,11.0,2.9,3.8,100.2,26.7
4,Stuttgart,Tschechen & Söhne,4be9a03a1838952148be0acf,8.0,30,59.1,1.286462,45.267984,23.998685,45.13839,46.30632,107.301069,169.348377,186.769352,7.606564,113.532667,75.4,632743.0,14.4,4.8,4.1,4.3,6.7,18.5,14.4,14.2,4.2,8.1,9.8,11.0,2.9,3.8,100.2,26.7


In [21]:
df_no_correlations.to_csv('tidy_data.csv', sep=';', index=False, header=True)