# Data Analysis on the impact of Industrial Trends on Mental Health

In [1]:
import glob
from pathlib import Path

import numpy as np
import pandas as pd

# Load every cleaned CSV into a list of DataFrames, one entry per file.
# glob.glob returns matches in arbitrary (filesystem-dependent) order, but later
# cells address datasets by position (0-3 = "E-*" economy files, 4-6 = "S-*"
# stress files), so the paths are sorted to make that order deterministic.
files = sorted(glob.glob("Cleaned Datasets/*.csv"))
df = [pd.read_csv(f) for f in files]

files





['Cleaned Datasets\\E-GDP.csv',
 'Cleaned Datasets\\E-GDPperCapita.csv',
 'Cleaned Datasets\\E-Inflation.csv',
 'Cleaned Datasets\\E-Unemployment.csv',
 'Cleaned Datasets\\S-Anxiety.csv',
 'Cleaned Datasets\\S-Bipolar.csv',
 'Cleaned Datasets\\S-Eating.csv']

#### Dictionary to hold DataSet Names

In [2]:
# Two-way lookup between a dataset name (the file stem, e.g. "E-GDP") and its
# position in `files` / `df`.
# Path(...).stem drops the directory and the extension using the running OS's
# path rules. str.rstrip(".csv") would instead strip any trailing run of the
# characters '.', 'c', 's', 'v', truncating names that end in one of those letters.
dataset_names = [Path(f).stem for f in files]
file_to_index = {name: i for i, name in enumerate(dataset_names)}
index_to_file = dict(enumerate(dataset_names))
index_to_file


{0: 'E-GDP',
 1: 'E-GDPperCapita',
 2: 'E-Inflation',
 3: 'E-Unemployment',
 4: 'S-Anxiety',
 5: 'S-Bipolar',
 6: 'S-Eating'}

# CLEANING

#### Collecting Country Names with NaN values across all DataFrames

In [3]:
# Gather the "Country Name" of every row that has at least one missing value,
# pooled over all loaded DataFrames.
del_list = set()
for frame in df:
    rows_with_nan = frame.isna().any(axis=1)
    del_list.update(frame.loc[rows_with_nan, "Country Name"])
del_list


{'American Samoa',
 'Andorra',
 'Antigua and Barbuda',
 'Argentina',
 'Bermuda',
 'Comoros',
 'Congo, Dem. Rep.',
 'Cuba',
 'Dominica',
 'Eritrea',
 'Greenland',
 'Grenada',
 'Guam',
 'Kiribati',
 "Korea, Dem. People's Rep.",
 'Liberia',
 'Libya',
 'Marshall Islands',
 'Micronesia, Fed. Sts.',
 'Monaco',
 'Nauru',
 'Northern Mariana Islands',
 'Palau',
 'Puerto Rico',
 'San Marino',
 'Seychelles',
 'Somalia',
 'South Sudan',
 'St. Kitts and Nevis',
 'Suriname',
 'Syrian Arab Republic',
 'Tajikistan',
 'Turkmenistan',
 'Tuvalu',
 'Venezuela, RB',
 'Virgin Islands (U.S.)',
 'Yemen, Rep.'}

#### Removing from all DataFrames every Country that has a NaN value in any DataFrame, to keep Countries consistent across all DataFrames

In [4]:
# Drop the flagged countries from every DataFrame so that no frame keeps a
# country that had a missing value in any dataset.
# NOTE(review): matching is by exact "Country Name" string -- confirm that all
# datasets spell country names the same way.
for pos, frame in enumerate(df):
    flagged = frame["Country Name"].isin(del_list)
    df[pos] = frame[~flagged]


In [5]:
# Spot-check one filtered frame; looked up by dataset name rather than by position.
df[file_to_index["S-Bipolar"]]

Unnamed: 0,Country Name,Code,2015,2016,2017,2018,2019,Average
0,Afghanistan,AFG,0.699595,0.699618,0.699622,0.699638,0.699645,0.699623
1,Albania,ALB,0.542202,0.542146,0.542102,0.542057,0.542009,0.542103
2,Algeria,DZA,0.756340,0.756327,0.756312,0.756294,0.756275,0.756310
5,Angola,AGO,0.553900,0.553900,0.553900,0.553900,0.553899,0.553900
8,Armenia,ARM,0.541710,0.541649,0.541587,0.541527,0.541465,0.541587
...,...,...,...,...,...,...,...,...
195,Venezuela,VEN,0.828631,0.829154,0.830110,0.830518,0.830463,0.829775
196,Vietnam,VNM,0.336145,0.336134,0.336123,0.336112,0.336103,0.336123
197,Yemen,YEM,0.725891,0.725918,0.725949,0.725967,0.725980,0.725941
198,Zambia,ZMB,0.574490,0.574482,0.574473,0.574464,0.574454,0.574472


#### Reset Indexes of DataFrames

In [6]:
# Renumber the rows of every DataFrame from 0. reset_index() is called without
# drop=True, so the previous row labels are kept as a new leading "index" column.
for pos, frame in enumerate(df):
    df[pos] = frame.reset_index()

#### Dictionary to hold Indexes of 163 countries

In [7]:
# Two-way lookup between a country name and its row position, taken from the
# "Country Name" column of the first DataFrame.
country_names = list(df[0]["Country Name"])
country_to_index = {name: pos for pos, name in enumerate(country_names)}
index_to_country = dict(enumerate(country_names))

#### Force Values of column Average to be float64


In [8]:
# Column dtypes of the first DataFrame, shown before the "Average" cast below.
df[0].dtypes

index             int64
Country Name     object
Country Code     object
2015            float64
2016            float64
2017            float64
2018            float64
2019            float64
Average          object
dtype: object

In [9]:
# Cast the "Average" column of every DataFrame to float64.
for frame in df:
    frame["Average"] = frame["Average"].astype('float64')

In [10]:
# Column dtypes of the first DataFrame again, to confirm the "Average" cast.
df[0].dtypes

index             int64
Country Name     object
Country Code     object
2015            float64
2016            float64
2017            float64
2018            float64
2019            float64
Average         float64
dtype: object

#### save re-cleaned datasets

In [11]:
# Optional export step, left disabled: uncomment to write each DataFrame to
# "./Re-Cleaned Datasets/<dataset name>.csv".
# for f in  range(len(df)):
#     df[f].to_csv("./Re-Cleaned Datasets/"+ index_to_file[f]+".csv")

# Correlation
##### 3D array to hold the 4 Economy × 3 Stress × 163 (after cleaning) Countries correlation coefficients

In [12]:
# Pre-allocate the result array: axis 0 = 4 economy datasets, axis 1 = 3 stress
# datasets, axis 2 = 163 countries. Every entry starts at 0.0.
n_economy, n_stress, n_countries = 4, 3, 163
dfcorr = np.zeros(shape=(n_economy, n_stress, n_countries), dtype=float)
dfcorr.shape


(4, 3, 163)

## Pearson's Correlation
### Using Pearson's correlation on each country for years 2015-2019  from each pair of datasets

In [13]:
# Pearson correlation, per country, between the yearly values of every
# (economy dataset, stress dataset) pair; results land in dfcorr[e, s, country].
#
# Column positions 3..7 are the year columns 2015-2019 in df[0] (position 0 is
# the "index" column added by reset_index(), followed by country name and code).
# NOTE(review): presumably the other frames share this layout, and countries are
# paired by row position across datasets -- confirm every DataFrame lists the
# same countries in the same order.
YEAR_COLUMNS = slice(3, 8)
n_economy, n_stress, n_countries = dfcorr.shape

for e in range(n_economy):
    # Transpose so that each column is one country and each row is one year.
    economy_by_year = df[e].iloc[:n_countries, YEAR_COLUMNS].T
    for s in range(n_stress):
        # Stress datasets follow the economy datasets in `df`.
        stress_by_year = df[n_economy + s].iloc[:n_countries, YEAR_COLUMNS].T
        # A single column-wise corrwith per dataset pair computes all countries at once.
        dfcorr[e, s, :] = economy_by_year.corrwith(stress_by_year, method='pearson').to_numpy()
dfcorr



array([[[-0.18868536,  0.64334526,  0.66066771, ..., -0.63422211,
          0.15996501, -0.00923054],
        [-0.57602256, -0.96136187, -0.73648971, ..., -0.97192737,
         -0.86732135, -0.41977413],
        [-0.70268126,  0.96674398,  0.66220564, ...,  0.97676835,
          0.87937896, -0.42021701]],

       [[-0.68434336, -0.07020769,  0.93370258, ..., -0.57954736,
         -0.25618647, -0.04343658],
        [-0.94484825,  0.39144005, -0.97263716, ..., -0.71057857,
         -0.62539444, -0.32896133],
        [-0.95793929, -0.39061429,  0.83318036, ...,  0.72238714,
          0.66754256, -0.32936785]],

       [[-0.02458244, -0.46687858, -0.23260293, ..., -0.12099645,
         -0.26435826,  0.82958832],
        [ 0.2374305 , -0.11373073,  0.11255341, ...,  0.4219596 ,
          0.47672264, -0.74208249],
        [ 0.41229121,  0.11930685,  0.06667825, ..., -0.3094646 ,
         -0.48503391, -0.7183715 ]],

       [[ 0.60045549, -0.71371118, -0.92156638, ..., -0.34174513,
         -

In [14]:
# Single coefficient: first economy dataset, first stress dataset, first country.
dfcorr[0, 0, 0]

-0.18868535757524701

# Analysis
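#### Estimate factor importance by the fraction of countries whose absolute correlation is at least 0.5:
$\frac{\operatorname{card}(|corr| \geq 0.5)}{163}$, where corr ranges over the 163 country correlations in dfcorr for a given pair of datasets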

In [15]:
# Share of countries whose |correlation| reaches the 0.5 threshold, for each
# (economy, stress) pair. NaN correlations compare False and are not counted.
STRONG_CORRELATION = 0.5
strong_mask = np.abs(dfcorr) >= STRONG_CORRELATION
# Count along the country axis, then divide by the number of countries.
impact = strong_mask.sum(axis=2) / dfcorr.shape[2]

impact

array([[0.59509202, 0.90797546, 0.87730061],
       [0.60736196, 0.84662577, 0.80368098],
       [0.49079755, 0.62576687, 0.63190184],
       [0.63190184, 0.81595092, 0.82208589]])

#### Sorting impact to get factor importance

In [16]:
# Flatten the impact matrix into {"<economy>/<stress>": share} and rank the
# pairs from the highest share to the lowest.
importance = {
    index_to_file[e] + "/" + index_to_file[s + 4]: impact[e][s]
    for e in range(4)
    for s in range(3)
}
sorted_importance = sorted(importance.items(), key=lambda pair: pair[1], reverse=True)
sorted_importance

[('E-GDP/S-Bipolar', 0.9079754601226994),
 ('E-GDP/S-Eating', 0.8773006134969326),
 ('E-GDPperCapita/S-Bipolar', 0.8466257668711656),
 ('E-Unemployment/S-Eating', 0.8220858895705522),
 ('E-Unemployment/S-Bipolar', 0.8159509202453987),
 ('E-GDPperCapita/S-Eating', 0.803680981595092),
 ('E-Inflation/S-Eating', 0.6319018404907976),
 ('E-Unemployment/S-Anxiety', 0.6319018404907976),
 ('E-Inflation/S-Bipolar', 0.6257668711656442),
 ('E-GDPperCapita/S-Anxiety', 0.6073619631901841),
 ('E-GDP/S-Anxiety', 0.5950920245398773),
 ('E-Inflation/S-Anxiety', 0.49079754601226994)]

In [17]:
# Unsorted view of the same shares, keyed by "<economy>/<stress>".
importance

{'E-GDP/S-Anxiety': 0.5950920245398773,
 'E-GDP/S-Bipolar': 0.9079754601226994,
 'E-GDP/S-Eating': 0.8773006134969326,
 'E-GDPperCapita/S-Anxiety': 0.6073619631901841,
 'E-GDPperCapita/S-Bipolar': 0.8466257668711656,
 'E-GDPperCapita/S-Eating': 0.803680981595092,
 'E-Inflation/S-Anxiety': 0.49079754601226994,
 'E-Inflation/S-Bipolar': 0.6257668711656442,
 'E-Inflation/S-Eating': 0.6319018404907976,
 'E-Unemployment/S-Anxiety': 0.6319018404907976,
 'E-Unemployment/S-Bipolar': 0.8159509202453987,
 'E-Unemployment/S-Eating': 0.8220858895705522}

#### Subdivision of positive and negative impacts among |correlation| ≥ 0.5 for each pair of datasets


In [18]:
# Reminder of the dataset-name -> position mapping used to build the keys below.
file_to_index

{'E-GDP': 0,
 'E-GDPperCapita': 1,
 'E-Inflation': 2,
 'E-Unemployment': 3,
 'S-Anxiety': 4,
 'S-Bipolar': 5,
 'S-Eating': 6}

In [19]:
# Split the strongly correlated countries (|corr| >= 0.5) of every
# (economy, stress) pair into positive and negative correlations.
# The sign is tested against 0: within the strong set a value is either
# >= 0.5 or <= -0.5, so each strong country falls in exactly one bucket.
strong = np.abs(dfcorr) >= 0.5
positive_counts = (strong & (dfcorr > 0)).sum(axis=2)
negative_counts = (strong & (dfcorr < 0)).sum(axis=2)

# Keys are "<economy>/<stress>"; counts are converted to plain Python ints.
impact_subdivsion = {
    index_to_file[e] + "/" + index_to_file[s + 4]: {
        'positive': int(positive_counts[e, s]),
        'negative': int(negative_counts[e, s]),
    }
    for e in range(len(impact))
    for s in range(len(impact[0]))
}

impact_subdivsion


{'E-GDP/S-Anxiety': {'positive': 56, 'negative': 41},
 'E-GDP/S-Bipolar': {'positive': 38, 'negative': 110},
 'E-GDP/S-Eating': {'positive': 132, 'negative': 11},
 'E-GDPperCapita/S-Anxiety': {'positive': 55, 'negative': 44},
 'E-GDPperCapita/S-Bipolar': {'positive': 36, 'negative': 102},
 'E-GDPperCapita/S-Eating': {'positive': 117, 'negative': 14},
 'E-Inflation/S-Anxiety': {'positive': 36, 'negative': 44},
 'E-Inflation/S-Bipolar': {'positive': 45, 'negative': 57},
 'E-Inflation/S-Eating': {'positive': 64, 'negative': 39},
 'E-Unemployment/S-Anxiety': {'positive': 59, 'negative': 44},
 'E-Unemployment/S-Bipolar': {'positive': 70, 'negative': 63},
 'E-Unemployment/S-Eating': {'positive': 57, 'negative': 77}}

# Result Summary

#### Ranked importance of factors

In [21]:
# Final ranking of dataset pairs by share of strongly correlated countries.
sorted_importance

[('E-GDP/S-Bipolar', 0.9079754601226994),
 ('E-GDP/S-Eating', 0.8773006134969326),
 ('E-GDPperCapita/S-Bipolar', 0.8466257668711656),
 ('E-Unemployment/S-Eating', 0.8220858895705522),
 ('E-Unemployment/S-Bipolar', 0.8159509202453987),
 ('E-GDPperCapita/S-Eating', 0.803680981595092),
 ('E-Inflation/S-Eating', 0.6319018404907976),
 ('E-Unemployment/S-Anxiety', 0.6319018404907976),
 ('E-Inflation/S-Bipolar', 0.6257668711656442),
 ('E-GDPperCapita/S-Anxiety', 0.6073619631901841),
 ('E-GDP/S-Anxiety', 0.5950920245398773),
 ('E-Inflation/S-Anxiety', 0.49079754601226994)]

In [23]:
# Positive / negative counts among the strongly correlated countries, per dataset pair.
impact_subdivsion


{'E-GDP/S-Anxiety': {'positive': 56, 'negative': 41},
 'E-GDP/S-Bipolar': {'positive': 38, 'negative': 110},
 'E-GDP/S-Eating': {'positive': 132, 'negative': 11},
 'E-GDPperCapita/S-Anxiety': {'positive': 55, 'negative': 44},
 'E-GDPperCapita/S-Bipolar': {'positive': 36, 'negative': 102},
 'E-GDPperCapita/S-Eating': {'positive': 117, 'negative': 14},
 'E-Inflation/S-Anxiety': {'positive': 36, 'negative': 44},
 'E-Inflation/S-Bipolar': {'positive': 45, 'negative': 57},
 'E-Inflation/S-Eating': {'positive': 64, 'negative': 39},
 'E-Unemployment/S-Anxiety': {'positive': 59, 'negative': 44},
 'E-Unemployment/S-Bipolar': {'positive': 70, 'negative': 63},
 'E-Unemployment/S-Eating': {'positive': 57, 'negative': 77}}