# <font color="#0000a0">Projet 6 : Effectuez une prédiction de revenus</font>

##  <font color="#0000a0">Objectif projet : proposer une régression linéaire avec 3 variables :</font>

- le revenu des parents ;
- le revenu moyen du pays dans lequel habite le prospect ;
- l'indice de Gini calculé sur les revenus des habitants du pays en question. 

## <font color="#0000a0">Dans ce notebook :</font>
1. Attribution d'une classe parent à chaque enfant 
2. ANOVA sur la variable revenus des enfants en fonction du pays
3. Régression linéaire 1 (2 variables explicatives : revenu moyen et indice de gini)
4. Régression linéaire 2 (3 variables explicatives : revenu moyen, indice de gini, classe des parents)
5. Décomposition de la variance
6. Réponses aux questions

In [261]:
import scipy.stats as st
import pandas as pd
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
import json


In [206]:
# Load the prepared dataset, drop the 'Unnamed: 0' column (presumably an index
# saved by an earlier export - confirm) and order rows by country, then quantile.
df = (
    pd.read_excel('output/df3.xlsx', sheet_name='Sheet1')
    .drop(columns='Unnamed: 0')
    .sort_values(by=['country', 'quantile'])
)
df

Unnamed: 0,country,year_survey,quantile,income,gdpppp,country_name,gini,IGEincome
0,ALB,2008,1,728.89795,7297.0,Albanie,30.462429,0.815874
1,ALB,2008,2,916.66235,7297.0,Albanie,30.462429,0.815874
2,ALB,2008,3,1010.91600,7297.0,Albanie,30.462429,0.815874
3,ALB,2008,4,1086.90780,7297.0,Albanie,30.462429,0.815874
4,ALB,2008,5,1132.69970,7297.0,Albanie,30.462429,0.815874
...,...,...,...,...,...,...,...,...
11494,ZAF,2008,96,24553.56800,9602.0,Afrique du Sud,66.977850,0.677000
11495,ZAF,2008,97,28858.03100,9602.0,Afrique du Sud,66.977850,0.677000
11496,ZAF,2008,98,35750.29000,9602.0,Afrique du Sud,66.977850,0.677000
11497,ZAF,2008,99,46297.31600,9602.0,Afrique du Sud,66.977850,0.677000


## <font color="#0000a0">1. Attribution d'une classe parent à chaque enfant </font>

### Clonage de 1000 individus


In [207]:
# Clone every row 1000 times (the original frame plus 999 copies).
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the same frames
# in the same order and renumbers the index from 0.
df_try = pd.concat([df] * 1000, ignore_index=True)
df_try

Unnamed: 0,country,year_survey,quantile,income,gdpppp,country_name,gini,IGEincome
0,ALB,2008,1,728.89795,7297.0,Albanie,30.462429,0.815874
1,ALB,2008,2,916.66235,7297.0,Albanie,30.462429,0.815874
2,ALB,2008,3,1010.91600,7297.0,Albanie,30.462429,0.815874
3,ALB,2008,4,1086.90780,7297.0,Albanie,30.462429,0.815874
4,ALB,2008,5,1132.69970,7297.0,Albanie,30.462429,0.815874
...,...,...,...,...,...,...,...,...
11599995,ZAF,2008,96,24553.56800,9602.0,Afrique du Sud,66.977850,0.677000
11599996,ZAF,2008,97,28858.03100,9602.0,Afrique du Sud,66.977850,0.677000
11599997,ZAF,2008,98,35750.29000,9602.0,Afrique du Sud,66.977850,0.677000
11599998,ZAF,2008,99,46297.31600,9602.0,Afrique du Sud,66.977850,0.677000


### Initialisation d'une classe parent à 1

In [208]:
# Create the parent-class column with a constant placeholder value of 1.
df_try['c_parent'] = 1
df_try

Unnamed: 0,country,year_survey,quantile,income,gdpppp,country_name,gini,IGEincome,c_parent
0,ALB,2008,1,728.89795,7297.0,Albanie,30.462429,0.815874,1
1,ALB,2008,2,916.66235,7297.0,Albanie,30.462429,0.815874,1
2,ALB,2008,3,1010.91600,7297.0,Albanie,30.462429,0.815874,1
3,ALB,2008,4,1086.90780,7297.0,Albanie,30.462429,0.815874,1
4,ALB,2008,5,1132.69970,7297.0,Albanie,30.462429,0.815874,1
...,...,...,...,...,...,...,...,...,...
11599995,ZAF,2008,96,24553.56800,9602.0,Afrique du Sud,66.977850,0.677000,1
11599996,ZAF,2008,97,28858.03100,9602.0,Afrique du Sud,66.977850,0.677000,1
11599997,ZAF,2008,98,35750.29000,9602.0,Afrique du Sud,66.977850,0.677000,1
11599998,ZAF,2008,99,46297.31600,9602.0,Afrique du Sud,66.977850,0.677000,1


In [209]:
# One row per distinct (country, IGEincome) pair, used as input for the
# conditional distributions.
df_new = df.loc[:, ['country', 'IGEincome']].drop_duplicates()
df_new

Unnamed: 0,country,IGEincome
0,ALB,0.815874
100,ARG,0.500000
200,ARM,0.400000
300,AUT,0.245267
400,AZE,0.500000
...,...,...
10999,VEN,0.500000
11099,VNM,0.480000
5800,XKX,0.400000
11299,YEM,0.500000


### Fonctions nécessaires à la génération des distributions conditionnelles

In [210]:
def generate_incomes(n, pj):
    """Simulate n (child, parent) income pairs for an elasticity coefficient pj.

    Log parent incomes and the error term are each drawn from a standard
    normal distribution. Per the original author's note, the mean and variance
    chosen here have no effect on the final income-class computation.

    Args:
        n: number of individuals to simulate.
        pj: coefficient applied to the parents' log income.

    Returns:
        A tuple (y_child, y_parents) of length-n arrays, where
        y_child = exp(pj * ln_y_parent + residual) and y_parents = exp(ln_y_parent).
    """
    standard_normal = st.norm(0, 1)
    # Parents' log incomes are drawn first, then one realisation of the error term.
    log_parent_income = standard_normal.rvs(size=n)
    noise = standard_normal.rvs(size=n)
    log_child_income = pj * log_parent_income + noise
    return np.exp(log_child_income), np.exp(log_parent_income)

def quantiles(l, nb_quantiles):
    """Assign each value of the Series l to a quantile class.

    The sorted values are paired with an evenly spaced, rounded sequence of
    class numbers starting at 1. Equal values share one dictionary key, so
    duplicates all receive the class of their last position in sorted order.

    Args:
        l: pandas Series of values.
        nb_quantiles: number of classes.

    Returns:
        A new pandas Series (default integer index) holding the class of each
        element of l, in the original order of l.
    """
    size = len(l)
    ordered = l.sort_values()
    step = nb_quantiles / size
    class_numbers = np.round(np.arange(1, nb_quantiles + 1, step) - 0.5 + 1. / size)
    class_of = {value: int(number) for value, number in zip(ordered, class_numbers)}
    return pd.Series([class_of[value] for value in l])

def compute_quantiles(y_child, y_parents, nb_quantiles):
    """Build a sample table of incomes and their quantile classes.

    Args:
        y_child: child incomes (converted to a pandas Series).
        y_parents: parent incomes (converted to a pandas Series).
        nb_quantiles: number of classes passed to quantiles().

    Returns:
        A DataFrame with columns "y_child", "y_parents", "c_i_child" and
        "c_i_parent".
    """
    child_income = pd.Series(y_child)
    parent_income = pd.Series(y_parents)
    pieces = [
        child_income,
        parent_income,
        quantiles(child_income, nb_quantiles),
        quantiles(parent_income, nb_quantiles),
    ]
    sample = pd.concat(pieces, axis=1)
    sample.columns = ["y_child", "y_parents", "c_i_child","c_i_parent"]
    return sample

def distribution(counts, nb_quantiles):
    """Turn the counts of one child class into shares per parent class.

    Args:
        counts: DataFrame with at least the columns "c_i_parent" and "counts".
        nb_quantiles: number of parent classes (1..nb_quantiles).

    Returns:
        A list of length nb_quantiles. Entry q-1 is the count of the first row
        whose c_i_parent equals q divided by the sum of all counts, or 0 when
        no such row exists. If the counts sum to zero the list is all zeros.
    """
    total = counts["counts"].sum()
    if total == 0:
        return [0] * nb_quantiles

    shares = []
    for parent_class in range(1, nb_quantiles + 1):
        matching = counts.loc[counts["c_i_parent"] == parent_class, "counts"]
        shares.append(matching.values[0] / total if len(matching) else 0)
    return shares

def conditional_distributions(sample, nb_quantiles):
    """Compute the matrix of parent-class shares for each child class.

    Args:
        sample: DataFrame with the columns "c_i_child" and "c_i_parent".
        nb_quantiles: number of classes (1..nb_quantiles) on both axes.

    Returns:
        A numpy array with one row per child class; each row is the list
        returned by distribution() for that child class.
    """
    # size() counts the rows of each (child, parent) pair directly, instead of
    # calling len() on every group frame through apply().
    counts = (
        sample.groupby(["c_i_child", "c_i_parent"])
        .size()
        .reset_index(name="counts")
    )

    return np.array([
        distribution(counts[counts.c_i_child == child_quantile], nb_quantiles)
        for child_quantile in range(1, nb_quantiles + 1)
    ])

def plot_conditional_distributions(p, cd, nb_quantiles):
    """Draw the conditional distributions as a stacked bar chart.

    Args:
        p: value shown in the figure title.
        cd: iterable of rows; each row holds nb_quantiles bar heights and is
            drawn as one layer of the stack.
        nb_quantiles: number of bars along the x axis.
    """
    plt.figure()
    plt.gcf().set_size_inches(15, 10)

    x_positions = np.arange(nb_quantiles) + 1
    # Running height of the stack, following the matplotlib "stacked bars" example:
    # https://matplotlib.org/gallery/lines_bars_and_markers/bar_stacked.html
    stack_base = np.array([0] * nb_quantiles)

    for layer, heights in enumerate(cd):
        plt.bar(x_positions, heights, bottom=stack_base, width=0.95, label=str(layer + 1) + "e")
        stack_base = stack_base + np.array(heights)

    # The x range extends past the last bar (factor 1.3).
    plt.axis([.5, nb_quantiles*1.3, 0, 1])
    plt.title("p=" + str(p))
    plt.legend()
    plt.xlabel("quantile parents")
    plt.ylabel("probabilité du quantile enfant")
    plt.show()
    
def proba_cond(c_i_parent, c_i_child, mat):
    """Look up one entry of a conditional-distribution matrix.

    Args:
        c_i_parent: column index into mat.
        c_i_child: row index into mat.
        mat: two-dimensional array indexed as mat[row, column].

    Returns:
        mat[c_i_child, c_i_parent]. Both indices are used as given; no
        1-based to 0-based shift is applied.
    """
    row, column = c_i_child, c_i_parent
    return mat[row, column]

### Calcul des distributions conditionnelles pour chaque pays 

In [45]:
# Conditional distributions for the 116 countries, 100 quantiles and 1000 individuals per quantile
# NOTE(review): this cell is entirely commented out, yet it is the only code in
# view that builds `cd_country`, which the JSON dump cell below needs.
#cd_country = {}
#for idx, row in df_new.iterrows() :
    #pj = row['IGEincome']
    #nb_quantiles = 100       # number of quantiles
    #n  = 1000*nb_quantiles   # sample size: 1000 * nb_quantiles

    #y_child, y_parents = generate_incomes(n, pj) 
    #sample = compute_quantiles(y_child, y_parents, nb_quantiles)
    #cd = conditional_distributions(sample, nb_quantiles) 

    #cd_country[row['country']] = cd.tolist()

### Stockage des distributions conditionnelles dans fichier JSON

In [46]:
# Persist the per-country conditional distributions as JSON.
with open("cd_country.json", "w") as json_out:
    json.dump(cd_country, json_out)

### Passage du fichier JSON en dictionnaire

In [211]:
# Read the stored conditional distributions back into the variable cd.
with open('cd_country.json', 'r') as json_in:
    cd = json.load(json_in)

In [212]:
# Check the Python type of the object loaded from the JSON file.
type(cd)

dict

### Calcul du nombre de classes parents pour chaque classe enfant 

In [220]:
# Expand every conditional distribution into an explicit list of parent classes:
# for each country and each child class, parent class idx+1 is repeated
# (share * 1000) times.
c_parent = []
for value in cd.values():
    for c_enfant in value:
        for idx, col in enumerate(c_enfant):
            # round() instead of int(): col * 1000 can land just below the
            # intended whole number in floating point, and int() would then
            # truncate it and silently drop one individual.
            n = round(col * 1000)
            c_parent.extend([idx + 1] * n)
len(c_parent)

11600000

In [214]:
# Display the expanded list of parent classes.
c_parent

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,


### Attribution de ces classes parents dans le dataframe

In [221]:
# Preview only: the sorted frame is displayed, df_try is not reassigned here.
df_try.sort_values(['country','quantile'], ascending = True)

Unnamed: 0,country,year_survey,quantile,income,gdpppp,country_name,gini,IGEincome,c_parent
0,ALB,2008,1,728.89795,7297.0,Albanie,30.462429,0.815874,1
11600,ALB,2008,1,728.89795,7297.0,Albanie,30.462429,0.815874,1
23200,ALB,2008,1,728.89795,7297.0,Albanie,30.462429,0.815874,1
34800,ALB,2008,1,728.89795,7297.0,Albanie,30.462429,0.815874,1
46400,ALB,2008,1,728.89795,7297.0,Albanie,30.462429,0.815874,1
...,...,...,...,...,...,...,...,...,...
11553599,ZAF,2008,100,82408.55000,9602.0,Afrique du Sud,66.977850,0.677000,100
11565199,ZAF,2008,100,82408.55000,9602.0,Afrique du Sud,66.977850,0.677000,100
11576799,ZAF,2008,100,82408.55000,9602.0,Afrique du Sud,66.977850,0.677000,100
11588399,ZAF,2008,100,82408.55000,9602.0,Afrique du Sud,66.977850,0.677000,100


In [215]:
# Assign the parent classes.
# The list is written positionally onto the rows sorted by country, then quantile.
# NOTE(review): this assumes c_parent was built in that same order (countries of
# `cd` in sorted order, child classes ascending) - confirm.
df_try = df_try.sort_values(['country','quantile'], ascending = True)
df_try['c_parent'] = c_parent

In [222]:
# Spot check: show the rows whose child quantile is 2.
df_try.loc[df_try['quantile'] == 2]

Unnamed: 0,country,year_survey,quantile,income,gdpppp,country_name,gini,IGEincome,c_parent
1,ALB,2008,2,916.66235,7297.0,Albanie,30.462429,0.815874,1
11601,ALB,2008,2,916.66235,7297.0,Albanie,30.462429,0.815874,1
23201,ALB,2008,2,916.66235,7297.0,Albanie,30.462429,0.815874,1
34801,ALB,2008,2,916.66235,7297.0,Albanie,30.462429,0.815874,1
46401,ALB,2008,2,916.66235,7297.0,Albanie,30.462429,0.815874,1
...,...,...,...,...,...,...,...,...,...
11553501,ZAF,2008,2,138.34155,9602.0,Afrique du Sud,66.977850,0.677000,83
11565101,ZAF,2008,2,138.34155,9602.0,Afrique du Sud,66.977850,0.677000,88
11576701,ZAF,2008,2,138.34155,9602.0,Afrique du Sud,66.977850,0.677000,92
11588301,ZAF,2008,2,138.34155,9602.0,Afrique du Sud,66.977850,0.677000,94


In [190]:
# Save the frame to CSV (without the index column).
df_try.to_csv('df_try_final.csv', index=False)

## <font color="#0000a0">2. ANOVA sur la variable revenus des enfants en fonction du pays </font>


In [263]:
import statsmodels.formula.api as smf
import statsmodels.api as sm
from statsmodels import regression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

**H0: Tous les pays ont le même revenu**: Le pays n'a pas d'influence sur le revenu enfant moyen

**H1: Les revenus dépendent du pays**: Le pays a une influence sur le revenu

In [230]:
# Prepare the data for the ANOVA: one row per (country, quantile) with the mean
# of every numeric column. numeric_only=True keeps the string column
# country_name out of the mean, which otherwise raises a TypeError on pandas >= 2.0.
anova1 = df_try.groupby(['country', 'quantile']).mean(numeric_only=True)
anova1.reset_index(inplace=True)

In [231]:
# Fit the one-way ANOVA model (income explained by country).
# `alpha` is not a fitting option of OLS.fit and had no effect on the test, so
# it is no longer passed; the 5% threshold is applied when reading the p-value.
anova_pays = smf.ols('income ~ country', data=anova1).fit()
anova_pays.summary().tables[0]

0,1,2,3
Dep. Variable:,income,R-squared:,0.496
Model:,OLS,Adj. R-squared:,0.491
Method:,Least Squares,F-statistic:,98.43
Date:,"Sun, 24 May 2020",Prob (F-statistic):,0.0
Time:,17:24:13,Log-Likelihood:,-118620.0
No. Observations:,11600,AIC:,237500.0
Df Residuals:,11484,BIC:,238300.0
Df Model:,115,,
Covariance Type:,nonrobust,,


In [233]:
# Display the Fisher test results (influence of the country variable).
test_fisher = sm.stats.anova_lm(anova_pays, typ=1)
test_fisher

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
country,115.0,510237500000.0,4436848000.0,98.429271,0.0
Residual,11484.0,517658600000.0,45076510.0,,


**Conclusion :**
- La p-valeur du test de Fisher est nulle, donc inférieure au seuil de 5%
- On rejette donc l'hypothèse H0 et on conclut que le pays a une influence sur le revenu

## <font color="#0000a0">3. Régression linéaire 1 (2 variables explicatives : revenu moyen et indice de gini)</font>

### Option sans passage au logarithme

In [236]:
# Global significance test: income regressed on gdpppp and gini,
# fitted on the per-(country, quantile) table anova1.
reg = smf.ols('income ~ gdpppp + gini', data=anova1)
reg_multi = reg.fit()
reg_multi.summary().tables[0]

0,1,2,3
Dep. Variable:,income,R-squared:,0.448
Model:,OLS,Adj. R-squared:,0.448
Method:,Least Squares,F-statistic:,4701.0
Date:,"Sun, 24 May 2020",Prob (F-statistic):,0.0
Time:,17:39:27,Log-Likelihood:,-119150.0
No. Observations:,11600,AIC:,238300.0
Df Residuals:,11597,BIC:,238300.0
Df Model:,2,,
Covariance Type:,nonrobust,,


In [245]:
# Significance test of each variable (coefficient table).
reg_multi.summary().tables[1]

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-656.7763,338.576,-1.940,0.052,-1320.443,6.890
gdpppp,0.4857,0.005,89.915,0.000,0.475,0.496
gini,18.0278,7.914,2.278,0.023,2.516,33.540


In [244]:
# Same model with scikit-learn, scored on a held-out test split.
Y = anova1["income"].values
X = anova1[["gdpppp", "gini"]]

# Fixed seed: without it every run draws a different train/test split, so the
# printed score and coefficients cannot be reproduced.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=0)
model = LinearRegression().fit(X_train, Y_train)
print('score:', model.score(X_test, Y_test))
print('coefficients:', model.coef_)
print('intercept:', model.intercept_)


score: 0.43316020613724515
coefficients: [ 0.48800157 23.07778316]
intercept: -830.8574930976129


### Option avec passage au logarithme


In [247]:
# Add the natural-log versions of income and gdpppp as ln_<column>.
for source_col in ("income", "gdpppp"):
    anova1['ln_' + source_col] = np.log(anova1[source_col])
anova1

Unnamed: 0,country,quantile,year_survey,income,gdpppp,gini,IGEincome,c_parent,ln_income,ln_gdpppp
0,ALB,1,2008.0,728.89795,7297.0,30.462429,0.815874,10.069,6.591534,8.895219
1,ALB,2,2008.0,916.66235,7297.0,30.462429,0.815874,14.923,6.820739,8.895219
2,ALB,3,2008.0,1010.91600,7297.0,30.462429,0.815874,16.663,6.918612,8.895219
3,ALB,4,2008.0,1086.90780,7297.0,30.462429,0.815874,18.699,6.991092,8.895219
4,ALB,5,2008.0,1132.69970,7297.0,30.462429,0.815874,21.357,7.032359,8.895219
...,...,...,...,...,...,...,...,...,...,...
11595,ZAF,96,2008.0,24553.56800,9602.0,66.977850,0.677000,77.382,10.108612,9.169727
11596,ZAF,97,2008.0,28858.03100,9602.0,66.977850,0.677000,79.408,10.270144,9.169727
11597,ZAF,98,2008.0,35750.29000,9602.0,66.977850,0.677000,80.337,10.484314,9.169727
11598,ZAF,99,2008.0,46297.31600,9602.0,66.977850,0.677000,83.325,10.742839,9.169727


In [248]:
# Global test with statsmodels on the log-transformed variables.
reg_log = smf.ols(formula='ln_income ~ ln_gdpppp + gini', data=anova1)
reg_multi_log = reg_log.fit()
reg_multi_log.summary().tables[0]

0,1,2,3
Dep. Variable:,ln_income,R-squared:,0.652
Model:,OLS,Adj. R-squared:,0.652
Method:,Least Squares,F-statistic:,10870.0
Date:,"Sun, 24 May 2020",Prob (F-statistic):,0.0
Time:,17:51:18,Log-Likelihood:,-14080.0
No. Observations:,11600,AIC:,28170.0
Df Residuals:,11597,BIC:,28190.0
Df Model:,2,,
Covariance Type:,nonrobust,,


In [249]:
# Per-variable tests with statsmodels (coefficient table).
reg_multi_log.summary().tables[1]

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.7843,0.074,10.586,0.000,0.639,0.930
ln_gdpppp,0.8658,0.006,134.992,0.000,0.853,0.878
gini,-0.0149,0.001,-16.797,0.000,-0.017,-0.013


In [250]:
# Same log model with scikit-learn, scored on a held-out test split.
Y = anova1["ln_income"].values
X = anova1[["ln_gdpppp", "gini"]]

# Fixed seed so the split, and therefore the printed metrics, are reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=0)
model2 = LinearRegression().fit(X_train, Y_train)
print('score:', model2.score(X_test, Y_test))
print('coefficients:', model2.coef_)
print('intercept:', model2.intercept_)


score: 0.650797338157682
coefficients: [ 0.86817844 -0.01487646]
intercept: 0.7667161746807869


## <font color="#0000a0">4. Régression linéaire 2 (3 variables explicatives : revenu moyen, indice de gini et la classe des parents)</font>

### Option sans passage au logarithme

In [252]:
# Global significance test: income regressed on gdpppp, gini and c_parent.
# NOTE(review): this is fitted on df_try, which holds 1000 copies of every
# (country, quantile) row, so the standard errors and t statistics are computed
# on replicated observations - interpret them with care.
reg3 = smf.ols('income ~ gdpppp + gini + c_parent ', data=df_try)
reg_multi3 = reg3.fit()
reg_multi3.summary().tables[0]

0,1,2,3
Dep. Variable:,income,R-squared:,0.471
Model:,OLS,Adj. R-squared:,0.471
Method:,Least Squares,F-statistic:,3441000.0
Date:,"Sun, 24 May 2020",Prob (F-statistic):,0.0
Time:,18:00:04,Log-Likelihood:,-118910000.0
No. Observations:,11600000,AIC:,237800000.0
Df Residuals:,11599996,BIC:,237800000.0
Df Model:,3,,
Covariance Type:,nonrobust,,


In [253]:
# Significance test of each variable (coefficient table).
reg_multi3.summary().tables[1]

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-3162.4999,11.053,-286.119,0.000,-3184.164,-3140.836
gdpppp,0.4857,0.000,2905.265,0.000,0.485,0.486
gini,18.0278,0.245,73.609,0.000,17.548,18.508
c_parent,49.6183,0.070,712.413,0.000,49.482,49.755


In [259]:
# Three-variable model with scikit-learn, scored on a held-out test split.

Y = df_try["income"].values
X = df_try[["gdpppp", "gini", 'c_parent']]

# Fixed seed so the split, and therefore the printed metrics, are reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=0)
model3 = LinearRegression().fit(X_train, Y_train)
print('score:', model3.score(X_test, Y_test))
print('coefficients:', model3.coef_)
print('intercept:', model3.intercept_)

score: 0.4694662970796415
coefficients: [ 0.48555575 18.19631629 49.67504907]
intercept: -3170.36740446002


### Option avec passage au logarithme

In [254]:
# Add the natural-log versions of income and gdpppp as ln_<column>.
for source_col in ("income", "gdpppp"):
    df_try['ln_' + source_col] = np.log(df_try[source_col])
df_try

Unnamed: 0,country,year_survey,quantile,income,gdpppp,country_name,gini,IGEincome,c_parent,ln_income,ln_gdpppp
0,ALB,2008,1,728.89795,7297.0,Albanie,30.462429,0.815874,1,6.591534,8.895219
11600,ALB,2008,1,728.89795,7297.0,Albanie,30.462429,0.815874,1,6.591534,8.895219
23200,ALB,2008,1,728.89795,7297.0,Albanie,30.462429,0.815874,1,6.591534,8.895219
34800,ALB,2008,1,728.89795,7297.0,Albanie,30.462429,0.815874,1,6.591534,8.895219
46400,ALB,2008,1,728.89795,7297.0,Albanie,30.462429,0.815874,1,6.591534,8.895219
...,...,...,...,...,...,...,...,...,...,...,...
11553599,ZAF,2008,100,82408.55000,9602.0,Afrique du Sud,66.977850,0.677000,100,11.319444,9.169727
11565199,ZAF,2008,100,82408.55000,9602.0,Afrique du Sud,66.977850,0.677000,100,11.319444,9.169727
11576799,ZAF,2008,100,82408.55000,9602.0,Afrique du Sud,66.977850,0.677000,100,11.319444,9.169727
11588399,ZAF,2008,100,82408.55000,9602.0,Afrique du Sud,66.977850,0.677000,100,11.319444,9.169727


In [257]:
# Global test of the log model with three explanatory variables, fitted on df_try.
reg4 = smf.ols('ln_income ~ ln_gdpppp + gini + c_parent ', data=df_try)
reg_multi4 = reg4.fit()
reg_multi4.summary().tables[0]

0,1,2,3
Dep. Variable:,ln_income,R-squared:,0.699
Model:,OLS,Adj. R-squared:,0.699
Method:,Least Squares,F-statistic:,8968000.0
Date:,"Sun, 24 May 2020",Prob (F-statistic):,0.0
Time:,18:07:06,Log-Likelihood:,-13246000.0
No. Observations:,11600000,AIC:,26490000.0
Df Residuals:,11599996,BIC:,26490000.0
Df Model:,3,,
Covariance Type:,nonrobust,,


In [258]:
# Coefficient table of the three-variable log model.
reg_multi4.summary().tables[1]

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.2628,0.002,118.665,0.000,0.258,0.267
ln_gdpppp,0.8658,0.000,4587.634,0.000,0.865,0.866
gini,-0.0149,2.61e-05,-570.852,0.000,-0.015,-0.015
c_parent,0.0103,7.71e-06,1339.362,0.000,0.010,0.010


In [260]:
# Three-variable log model with scikit-learn, scored on a held-out test split.
Y = df_try["ln_income"].values
X = df_try[["ln_gdpppp", "gini", 'c_parent']]

# Fixed seed so the split, and therefore the printed metrics, are reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=0)
model4 = LinearRegression().fit(X_train, Y_train)
print('score:', model4.score(X_test, Y_test))
print('coefficients:', model4.coef_)
print('intercept:', model4.intercept_)

score: 0.6983123067552697
coefficients: [ 0.86590391 -0.01485522  0.01032432]
intercept: 0.2609951865611402


## <font color="#0000a0">5. Décomposition de la variance</font>

## <font color="#0000a0">6. Réponses aux questions</font>