In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Lift pandas' cap on displayed columns so wide frames are shown in full.
pd.set_option('display.max_columns', None)

In [2]:
# Load the full dataset. CODGEO is forced to a string dtype so the codes are
# kept as text (e.g. '01001') rather than being parsed as integers.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk.
# NOTE(review): the recorded output of this cell looks like the tail of a
# mixed-type DtypeWarning -- confirm the warning is gone after this change.
df = pd.read_csv(
    '../all_content_2020_v4.csv',
    dtype={'CODGEO': 'string'},
    low_memory=False,
)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [3]:
# Number of columns in the raw dataset.
df.shape[1]

738

In [4]:
# List every column name of the raw dataset.
list(df.columns)

['Unnamed: 0',
 'CODGEO',
 'EU_circo',
 'num_region',
 'num_dep',
 'nom_commune',
 'nom_dep',
 'nom_region',
 'num_chef_lieu',
 'nom_chef_lieu',
 'code_postal',
 'latitude',
 'longitude',
 'pop_tot',
 'SUPERF',
 'P19_POP',
 'P19_POP1529',
 'P19_POP3044',
 'P19_POP4559',
 'P19_POP6074',
 'P19_POP7589',
 'P19_POP90P',
 'P19_POPH',
 'P19_H1529',
 'P19_H3044',
 'P19_H4559',
 'P19_H6074',
 'P19_H7589',
 'P19_H90P',
 'P19_H0019',
 'P19_H2064',
 'P19_H65P',
 'P19_POPF',
 'P19_F1529',
 'P19_F3044',
 'P19_F4559',
 'P19_F6074',
 'P19_F7589',
 'P19_F90P',
 'P19_F0019',
 'P19_F2064',
 'P19_F65P',
 'P19_POP01P',
 'P19_POP01P_IRAN1',
 'P19_POP01P_IRAN2',
 'P19_POP01P_IRAN3',
 'P19_POP01P_IRAN4',
 'P19_POP01P_IRAN5',
 'P19_POP01P_IRAN6',
 'P19_POP01P_IRAN7',
 'P19_POP0114_IRAN2P',
 'P19_POP0114_IRAN2',
 'P19_POP0114_IRAN3P',
 'P19_POP1524_IRAN2P',
 'P19_POP1524_IRAN2',
 'P19_POP1524_IRAN3P',
 'P19_POP2554_IRAN2P',
 'P19_POP2554_IRAN2',
 'P19_POP2554_IRAN3P',
 'P19_POP55P_IRAN2P',
 'P19_POP55P_IRAN2',

Est-ce que l'activité économique a une influence sur le revenu médian du lieu d'habitation ?
Les gens ne vivent pas là où ils travaillent.

## CHOISIR 4 OU 5 VARIABLES POUR LE DATASET FINAL ##

Dans notre dataset, nous disposons de 738 variables, dont 168 concernent les entreprises. Elles se répartissent en deux thématiques principales, chacune déclinée en plusieurs sous-thèmes :

- Démographie des entreprises, thème décliné en fonction de :
    - [Unités légales](https://www.insee.fr/fr/metadonnees/definition/c1044)
    - Créations
    - Activité
- Caractéristiques des [établissements](https://www.insee.fr/fr/metadonnees/definition/c1377), thème décliné en fonction de :
    - Activité
    - Taille
    - Effectifs
    - Sphère (=impact)
    

On peut en ressortir plusieurs indicateurs :

- Impact de l'activité sur le territoire
- % de sièges sociaux = centre de décision = cadres ?
- Dynamisme des créations




In [5]:
# Build the working frame `ets`: keep CODGEO plus seven columns of `df`,
# renamed to the names used in the rest of the notebook.
# Mapping is {column in df: column in ets}; insertion order sets column order.
ets_columns = {
    'CODGEO': 'CODGEO',
    'ETTOT20': 'ets_total',
    'ETPRES20': 'ets_local',
    'ETNPRES20': 'ets_externe',
    'ETCTOT20': 'ets_creation',
    'ENCTOT20': 'ent_creation',
    'ETBE20': 'ets_industrie',
    'ETTEF5020': 'ets_effectif50p_p',
}
ets = df[list(ets_columns)].rename(columns=ets_columns)

In [6]:
# Turn a total of exactly zero into NaN; the following cell drops the rows
# where `ets_total` is missing.
ets['ets_total'] = ets['ets_total'].replace({0.0: np.nan})

In [7]:
# Keep only rows with a known `ets_total`, then renumber the index from 0.
has_total = ets['ets_total'].notna()
ets = ets.loc[has_total].reset_index(drop=True)

In [8]:
# Preview the first five rows of the working frame.
ets.head(5)

Unnamed: 0,CODGEO,ets_total,ets_local,ets_externe,ets_creation,ent_creation,ets_industrie,ets_effectif50p_p
0,1001,14.0,9.0,5.0,12,11,0.0,0.0
1,1002,4.0,2.0,2.0,3,3,0.0,0.0
2,1004,586.0,462.0,124.0,192,176,31.0,21.0
3,1005,46.0,36.0,10.0,24,23,4.0,0.0
4,1006,2.0,1.0,1.0,0,0,0.0,0.0


In [9]:
# Label each row by comparing `ets_local` with `ets_externe`:
#   'local'     when ets_local  > ets_externe
#   'externe'   when ets_externe > ets_local
#   'équilibré' otherwise (ties, and rows where a comparison involves NaN).
# Done column-wise with np.select rather than a row-by-row apply: same labels,
# without a Python-level call per row.
ets['ets_impact'] = np.select(
    [
        ets['ets_local'] > ets['ets_externe'],
        ets['ets_externe'] > ets['ets_local'],
    ],
    ['local', 'externe'],
    default='équilibré',
)

In [10]:
# Share of `ets_local` in `ets_total`, as a percentage rounded to the nearest
# integer. Computed on whole columns instead of a row-wise apply;
# Series.round() applies the same rounding as np.round.
ets['ets_impact_local_p'] = (ets['ets_local'] / ets['ets_total'] * 100).round()

In [11]:
# `ets_creation` as a percentage of `ets_total`, rounded to the nearest
# integer. Computed on whole columns instead of a row-wise apply;
# Series.round() applies the same rounding as np.round.
ets['ets_creation_p'] = (ets['ets_creation'] / ets['ets_total'] * 100).round()

In [12]:
# `ets_industrie` as a percentage of `ets_total`, rounded to the nearest
# integer. Computed on whole columns instead of a row-wise apply; a NaN in
# `ets_industrie` stays NaN in the result.
ets['ets_industrie_p'] = (ets['ets_industrie'] / ets['ets_total'] * 100).round()

In [13]:
# `ets_effectif50p_p` as a percentage of `ets_total`, rounded to the nearest
# integer. Computed on whole columns instead of a row-wise apply.
# NOTE(review): despite its `_p` suffix, `ets_effectif50p_p` is used here as
# the numerator of the ratio (a count, not a percentage) -- consider renaming.
ets['ets_taille50p_p'] = (ets['ets_effectif50p_p'] / ets['ets_total'] * 100).round()

In [14]:
# Display the working frame with the raw columns and the derived ones.
ets

Unnamed: 0,CODGEO,ets_total,ets_local,ets_externe,ets_creation,ent_creation,ets_industrie,ets_effectif50p_p,ets_impact,ets_impact_local_p,ets_creation_p,ets_industrie_p,ets_taille50p_p
0,01001,14.0,9.0,5.0,12,11,0.0,0.0,local,64.0,86.0,0.0,0.0
1,01002,4.0,2.0,2.0,3,3,0.0,0.0,équilibré,50.0,75.0,0.0,0.0
2,01004,586.0,462.0,124.0,192,176,31.0,21.0,local,79.0,33.0,5.0,4.0
3,01005,46.0,36.0,10.0,24,23,4.0,0.0,local,78.0,52.0,9.0,0.0
4,01006,2.0,1.0,1.0,0,0,0.0,0.0,équilibré,50.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
34825,97420,491.0,341.0,150.0,188,169,38.0,11.0,local,69.0,38.0,8.0,2.0
34826,97421,154.0,110.0,44.0,40,36,9.0,3.0,local,71.0,26.0,6.0,2.0
34827,97422,1832.0,1342.0,490.0,757,668,126.0,31.0,local,73.0,41.0,7.0,2.0
34828,97423,124.0,96.0,28.0,59,52,6.0,4.0,local,77.0,48.0,5.0,3.0


In [15]:
# Count how many rows carry each `ets_impact` label.
ets['ets_impact'].value_counts()

local        23339
externe       7919
équilibré     3572
Name: ets_impact, dtype: int64

In [16]:
# Drop the raw input columns and the categorical label; only CODGEO,
# `ets_total` and the derived percentage columns remain.
ets = ets.drop(
    columns=[
        'ets_local',
        'ets_externe',
        'ets_creation',
        'ets_impact',
        'ent_creation',
        'ets_industrie',
        'ets_effectif50p_p',
    ]
)

In [17]:
# Preview the first five rows after the column clean-up.
ets.head(5)

Unnamed: 0,CODGEO,ets_total,ets_impact_local_p,ets_creation_p,ets_industrie_p,ets_taille50p_p
0,1001,14.0,64.0,86.0,0.0,0.0
1,1002,4.0,50.0,75.0,0.0,0.0
2,1004,586.0,79.0,33.0,5.0,4.0
3,1005,46.0,78.0,52.0,9.0,0.0
4,1006,2.0,50.0,0.0,0.0,0.0


In [18]:
# Rows with `ets_total` above 100, ordered from the highest to the lowest
# `ets_taille50p_p`.
ets.loc[ets['ets_total'].gt(100)].sort_values('ets_taille50p_p', ascending=False)

Unnamed: 0,CODGEO,ets_total,ets_impact_local_p,ets_creation_p,ets_industrie_p,ets_taille50p_p
334,01390,149.0,34.0,22.0,19.0,25.0
30145,77291,140.0,64.0,64.0,10.0,24.0
30481,78297,684.0,56.0,77.0,6.0,17.0
34360,91534,140.0,53.0,44.0,11.0,14.0
16278,44150,173.0,40.0,22.0,12.0,14.0
...,...,...,...,...,...,...
5578,17051,174.0,82.0,29.0,6.0,0.0
2003,06140,111.0,76.0,50.0,5.0,0.0
5645,17121,118.0,83.0,34.0,2.0,0.0
23395,61474,126.0,44.0,19.0,7.0,0.0


### Questions ###

- Un taux d'industrie élevé a-t-il un impact négatif sur la sphère locale ?

In [19]:
# Unweighted mean of `ets_creation_p` over all rows.
np.average(ets['ets_creation_p'].to_numpy())

40.88664944013781

In [20]:
# Number of rows left in the working frame.
ets.shape[0]

34830

In [21]:
# Missing-value count per column.
ets.isnull().sum()

CODGEO                0
ets_total             0
ets_impact_local_p    0
ets_creation_p        0
ets_industrie_p       1
ets_taille50p_p       1
dtype: int64

In [22]:
aires = pd.read_excel('../Aires_attraction_villes/AAV2020_au_01-01-2020_v1.xlsx')